# 2.4 Encoding Categorical Variables

## Setting Environment

In [1]:
# modules we'll use
import pandas as pd
import numpy as np

## Importing Data

In [2]:
%%time
info_content_df = pd.read_csv('../data/interim/Info_Content.csv', index_col='ucid')
info_userdata_df = pd.read_csv('../data/interim/Info_UserData.csv', index_col='uuid')
log_problem_df = pd.read_csv('../data/interim/Log_Problem.csv', index_col='upid')

Wall time: 16.7 s


In [3]:
# Boolean mask over the columns: True where the column has object dtype.
content_object_mask = info_content_df.dtypes == 'object'
# Keep only the names of the columns flagged by the mask.
object_cols_info_content = content_object_mask[content_object_mask].index.tolist()

# Prints the label on one line and the list of column names on the next.
print("Categorical variables: ", object_cols_info_content, sep="\n")

Categorical variables: 
['content_pretty_name', 'difficulty', 'learning_stage']


In [4]:
# Boolean mask over the columns: True where the column has object dtype.
# NOTE(review): object dtype is only a proxy for "categorical"; it can also pick up
# string-typed dates or identifiers, so check the printed list before encoding.
userdata_object_mask = info_userdata_df.dtypes == 'object'
# Keep only the names of the columns flagged by the mask.
object_cols_info_userdata = userdata_object_mask[userdata_object_mask].index.tolist()

# Prints the label on one line and the list of column names on the next.
print("Categorical variables: ", object_cols_info_userdata, sep="\n")

Categorical variables: 
['gender', 'date_login', 'user_city']


In [5]:
# Boolean mask over the columns: True where the column has object dtype.
# NOTE(review): object dtype is only a proxy for "categorical"; it can also pick up
# string-typed timestamps or identifiers, so check the printed list before encoding.
log_object_mask = log_problem_df.dtypes == 'object'
# Keep only the names of the columns flagged by the mask.
object_cols_log_problem = log_object_mask[log_object_mask].index.tolist()

# Prints the label on one line and the list of column names on the next.
print("Categorical variables: ", object_cols_log_problem, sep="\n")

Categorical variables: 
['datetime_attempt', 'uuid', 'ucid']


### Label Encoding

**Mapping ordinal features**

To make sure that the learning algorithm interprets the ordinal features correctly, we need to convert the categorical string values into integers. Unfortunately, there is no convenient function that can automatically derive the correct order of the labels of our ordinal features (`difficulty` and `learning_stage`), so we have to define the mapping manually. In the following simple example, we assume that adjacent levels are exactly one unit apart, for example, hard = normal + 1 = easy + 2 = unset + 3.

In [6]:
# Ordinal ranks for the difficulty labels: the position in the ordered list is the
# rank, so 'unset' is 0 and 'hard' is 3.
difficulty_mapping = {
    label: rank
    for rank, label in enumerate(['unset', 'easy', 'normal', 'hard'])
}
# Ordinal ranks for the learning stages, counted from 1.
learning_stage_mapping = {
    label: rank
    for rank, label in enumerate(['elementary', 'junior', 'senior'], start=1)
}

In [7]:
# Replace the ordinal string labels with their integer ranks.
# Series.map silently turns every value that is missing from the mapping into NaN.
# That would happen for an unexpected label, and also if this cell is run on data
# that has already been encoded. Validate first so the problem surfaces here
# instead of as NaN columns in the exported CSV.
for ordinal_column, ordinal_mapping in (
    ('difficulty', difficulty_mapping),
    ('learning_stage', learning_stage_mapping),
):
    # Missing values are allowed to pass through; only real, unknown labels are errors.
    unmapped_labels = set(info_content_df[ordinal_column].dropna().unique()) - set(ordinal_mapping)
    if unmapped_labels:
        raise ValueError(
            f"Column '{ordinal_column}' contains values missing from its mapping: "
            f"{sorted(unmapped_labels, key=str)}"
        )
    info_content_df[ordinal_column] = info_content_df[ordinal_column].map(ordinal_mapping)

In [8]:
# Preview the first rows to check that difficulty and learning_stage now hold integer codes.
info_content_df.head()

Unnamed: 0_level_0,content_pretty_name,difficulty,learning_stage
ucid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
odIwFdIiecFwVUAEEV40K3MSuCSlIZkbq92Zp9tkZq8=,【基礎】怎樣解題：數量關係,1,1
dfeeBaa8zDhWS6nu7zeXKwLyi4zqEajI3tJM9/fSBPM=,【基礎】和差問題 1,1,1
C2AT0OBTUn+PRxEVd39enhW/DJtka1Tk90DUAR6yVdA=,【基礎】雞兔問題 1,1,1
jZvYpEa6VB/WrlKKmQHnfbv/xJ4OypBzq0epVcn500Q=,【基礎】年齡問題 1,1,1
M+UxJPgRIW57a0YS3eik8A9YDj+AwaMpTa5yWYn/kAw=,【基礎】追趕問題,1,1


In [9]:
# Summary statistics of the numeric columns; count, min and max give a quick check
# that every row was mapped and that the codes fall in the expected range.
info_content_df.describe()

Unnamed: 0,difficulty,learning_stage
count,1330.0,1330.0
mean,1.422556,1.412782
std,0.72809,0.497082
min,0.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,2.0,2.0
max,3.0,3.0


### One-hot encoding - Nominal

When including dummy variables in a regression model, however, one should be careful of the Dummy Variable Trap. The Dummy Variable Trap is a scenario in which the independent variables are multicollinear, that is, two or more variables are highly correlated; in simple terms, one variable can be predicted from the others. To avoid it, one dummy column of each one-hot encoded feature is dropped below.

In [10]:
# Each boolean flag maps to its integer value (True -> 1, False -> 0).
is_downgrade_mapping = {flag: int(flag) for flag in (True, False)}
# Same lookup for the upgrade flag, kept as its own dict object.
is_upgrade_mapping = dict(is_downgrade_mapping)

In [11]:
# Recode the two level-change flags through their True/False -> 1/0 lookups.
for flag_column, flag_mapping in (
    ('is_downgrade', is_downgrade_mapping),
    ('is_upgrade', is_upgrade_mapping),
):
    log_problem_df[flag_column] = log_problem_df[flag_column].map(flag_mapping)

In [12]:
# Preview the first rows to check that is_downgrade and is_upgrade now hold 0/1 values.
log_problem_df.head()

Unnamed: 0_level_0,datetime_attempt,uuid,ucid,problem_number,exercise_problem_repeat_session,is_correct,total_sec_taken,total_attempt_cnt,used_hint_cnt,is_hint_used,is_downgrade,is_upgrade,level
upid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ZmKEZ0F2WFqhlL7KFfJcHEnZCZu0e4p+CVG5rSlyKYk=,2018-09-28 20:00:00+00:00,Kpq2q+eKw/O+6/jLs3XJosgmI7weEJxJZdnkKTbbF8I=,Ps4dfShfpeMF3VG030HqZ2bsbD7PaVxvJYFTtroeSzQ=,2,1,True,11,1,0,False,0,0,0
tO9dyvadKWMVQgEx/BXtRIYJ2TRJFQgwvcsBwFb4+xI=,2018-09-28 10:15:00+00:00,0+VU/Zb0Q96uoByuRhl7r9bJuJO6CKWpsmNMEuijSzc=,/d39FzqaM3PZzpoMXxA80PMICsVhzfL6MGSCqZtsQOo=,6,1,True,26,1,0,False,0,0,0
6Lxz6aXvgyw3vZd3v8g6jgoCRDPOQzVPx/dnEC0o7DQ=,2018-09-05 20:00:00+00:00,g8DnYvIqpolw10XlwWeIWv6NbDPByUbmgH8EshJqBns=,YuGOmB+frbM8rfAa0RJE882R+IoMf9N89OiVqLbAHBw=,4,1,True,78,1,0,False,0,0,0
1fIjdakTApQp5PfWog87uOmM6JuoNE/oQq2y5/fMmfw=,2018-09-14 16:30:00+00:00,kSyUTFlepsYUD723IPL/jEZ520xaKbscrBmNtBUFR1o=,BG1RsWojzEHzV28RBm/1iKi1NyZgDcDomLYEJSV6lmo=,3,1,True,7,1,0,False,0,0,0
8V/NT6M+er2I3V3ZIWRNo4Qbo3Iad89PHbeeZeoZeF0=,2018-09-13 16:00:00+00:00,XMFbFA7C49+LRhUddhelfPpA6F5dbOoxeyL3eYbuTlY=,qPHR8aBqOhKij9IS/Y8IR8prwWruoDBGU1tVUhXDJkE=,12,1,True,48,1,0,False,0,1,1


In [13]:
# One-hot encode gender into 'gender_<value>' indicator columns.
# dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas version;
# pandas >= 2.0 would otherwise return boolean columns, which describe() skips and
# to_csv writes as True/False.
# The last indicator column is dropped so it acts as the reference level, which
# avoids the dummy variable trap.
info_userdata_gender = pd.get_dummies(
    info_userdata_df['gender'],
    prefix='gender',
    dtype=np.uint8,
).iloc[:, :-1]
# Append the indicator columns next to the existing ones; rows are aligned on the index.
info_userdata_df = pd.concat([info_userdata_df, info_userdata_gender], axis=1, sort=False)

In [14]:
# One-hot encode user_city into 'user_city_<value>' indicator columns.
# dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas version;
# pandas >= 2.0 would otherwise return boolean columns, which describe() skips and
# to_csv writes as True/False.
# The last indicator column is dropped so it acts as the reference level, which
# avoids the dummy variable trap.
info_userdata_city = pd.get_dummies(
    info_userdata_df['user_city'],
    prefix='user_city',
    dtype=np.uint8,
).iloc[:, :-1]
# Append the indicator columns next to the existing ones; rows are aligned on the index.
info_userdata_df = pd.concat([info_userdata_df, info_userdata_city], axis=1, sort=False)

In [15]:
# The original string columns are redundant once their indicator columns exist.
info_userdata_df = info_userdata_df.drop(['gender', 'user_city'], axis=1)

In [16]:
# Preview the user table with the indicator columns appended and the source columns removed.
info_userdata_df.head()

Unnamed: 0_level_0,points,badges_cnt,date_login,user_grade,has_teacher_cnt,is_self_coach,has_student_cnt,belongs_to_class_cnt,has_class_cnt,gender_female,...,user_city_ml,user_city_ntct,user_city_ntpc,user_city_phc,user_city_ptc,user_city_tc,user_city_tn,user_city_tp,user_city_ttct,user_city_ty
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Y2RcCdmUJAYPUAIDElo4nE9KrkLLFzUIRdexG+ipaZQ=,18300,1,2019-01-24,1,0,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
lw/Rchfvl9q1UDaQRmeE6QJDJeXAK7nt56RvUvqxD/8=,6468,0,2019-01-24,1,1,False,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
ncVYyCw3osV77X9M+4NbI7LvBR5UiB4ix6Ca+baQArA=,4703,0,2019-01-24,1,0,False,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
J7Tbo1x2WtRpPuXeX7lWT9tkzWlSJeubl8UWjNmHh+4=,15525,1,2019-01-24,2,0,False,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
qijKzROzz1LmCaCxHJ3mOBOtjW/q4kW80tnpPmXHVYQ=,7945,0,2019-01-24,2,1,False,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Summary statistics of the numeric columns; for a 0/1 indicator column the mean is
# the share of rows that belong to that category.
info_userdata_df.describe()

Unnamed: 0,points,badges_cnt,user_grade,has_teacher_cnt,has_student_cnt,belongs_to_class_cnt,has_class_cnt,gender_female,gender_male,user_city_chc,...,user_city_ml,user_city_ntct,user_city_ntpc,user_city_phc,user_city_ptc,user_city_tc,user_city_tn,user_city_tp,user_city_ttct,user_city_ty
count,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,...,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0,72758.0
mean,63047.59,9.543789,5.620715,0.88481,0.203304,0.873306,0.046442,0.180035,0.176695,0.031186,...,0.022843,0.024176,0.148547,0.006611,0.028368,0.1472,0.05543,0.17172,0.019723,0.104662
std,124204.2,19.036328,2.041178,1.552795,6.413944,1.961188,0.31506,0.384219,0.381413,0.17382,...,0.149403,0.153596,0.355644,0.081039,0.166023,0.354308,0.22882,0.377139,0.139047,0.30612
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6285.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,20400.0,3.0,6.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,65313.75,10.0,7.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4047528.0,760.0,12.0,106.0,966.0,120.0,24.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Exporting Data

The data is in DataFrame form and has been transformed by encoding its categorical columns as numbers. The transformed data is exported to the interim data folder, which stores the intermediate data that has been transformed.

In [18]:
# Write each encoded table back to the interim folder.
# NOTE(review): these are the same paths the notebook loads from, so the inputs are
# overwritten and a second Run All starts from already-encoded data. Confirm this
# is intended.
for encoded_frame, csv_path in (
    (info_content_df, '../data/interim/Info_Content.csv'),
    (info_userdata_df, '../data/interim/Info_UserData.csv'),
    (log_problem_df, '../data/interim/Log_Problem.csv'),
):
    encoded_frame.to_csv(csv_path)