In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [2]:
# define category ordering for ordinal variables
cat_Medu = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_Fedu = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_traveltime = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_studytime = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_failures = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)

# define variable types
stu_dtypes = {
    'school' : 'category',
    'sex' : 'category',
    'age' : 'int8',
    'address' : 'category',
    'famsize' : 'category',
    'Pstatus' : 'category',
    'Medu' : 'category',
    'Fedu' : 'category',
    'Mjob' : 'category',
    'Fjob' : 'category',
    'reason' : 'category',
    'guardian' : 'category',
    'traveltime' : 'category',
    'studytime' : 'category',
    'failures' : 'category',
    'schoolsup' : 'category',
    'famsup' : 'category',
    'paid' : 'category',
    'activities' : 'category',
    'nursery' : 'category',
    'higher' : 'category',
    'internet' : 'category',
    'romantic' : 'category',
    'famrel' : 'int8',
    'freetime' : 'int8',
    'goout' : 'int8',
    'Dalc' : 'int8',
    'Walc' : 'int8',
    'health' : 'int8',
    'absences' : 'int8',
    'G1' : 'int8',
    'G2' : 'int8',
    'G3' : 'int8'
}

# read in student math performance data set
student = pd.read_csv('data/student/student-por.csv', sep = ';', dtype = stu_dtypes)
# use G3 as target variable - drop G1 and G2
student = student.drop(['G1', 'G2'], axis = 1)

In [3]:
student.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G3            0
dtype: int64

In [4]:
student.describe()

Unnamed: 0,age,famrel,freetime,goout,Dalc,Walc,health,absences,G3
count,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0
mean,16.744222,3.930663,3.180277,3.1849,1.502311,2.280431,3.53621,3.659476,11.906009
std,1.218138,0.955717,1.051093,1.175766,0.924834,1.28438,1.446259,4.640759,3.230656
min,15.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,16.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0
50%,17.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,12.0
75%,18.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,14.0
max,22.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0


In [5]:
student.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,4,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,2,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,6,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,0,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,0,13


In [8]:
#Create initial x and y datasets of interest
x = student.drop(['G3'], axis = 1)
y = student[['G3']]

#identify categorical variables that may need changed to numerica / grouped
categorical = x.select_dtypes(include='category')
cat_columns = categorical.columns
for col in cat_columns:
    print(student[col].value_counts())

GP    423
MS    226
Name: school, dtype: int64
F    383
M    266
Name: sex, dtype: int64
U    452
R    197
Name: address, dtype: int64
GT3    457
LE3    192
Name: famsize, dtype: int64
T    569
A     80
Name: Pstatus, dtype: int64
2    186
4    175
1    143
3    139
0      6
Name: Medu, dtype: int64
2    209
1    174
3    131
4    128
0      7
Name: Fedu, dtype: int64
other       258
services    136
at_home     135
teacher      72
health       48
Name: Mjob, dtype: int64
other       367
services    181
at_home      42
teacher      36
health       23
Name: Fjob, dtype: int64
course        285
home          149
reputation    143
other          72
Name: reason, dtype: int64
mother    455
father    153
other      41
Name: guardian, dtype: int64
1    366
2    213
3     54
4     16
Name: traveltime, dtype: int64
2    305
1    212
3     97
4     35
Name: studytime, dtype: int64
0    549
1     70
2     16
3     14
Name: failures, dtype: int64
no     581
yes     68
Name: schoolsup, dtype: int64

In [11]:
student_transform = student.copy(deep = True)


In [12]:
student_transform.loc[student_transform['Medu'].isin(['0','1','2','3']), 'Mcollege'] = 0
student_transform.loc[student_transform['Medu'] == '4', 'Mcollege'] = 1
student_transform.loc[student_transform['Fedu'].isin(['0','1','2','3']), 'Fcollege'] = 0
student_transform.loc[student_transform['Fedu'] == '4', 'Fcollege'] = 1
student_transform.loc[student_transform['traveltime'].isin(['0','1','2']), 'travelGT30'] = 0
student_transform.loc[student_transform['traveltime'].isin(['4','3']), 'travelGT30'] = 1
print(student_transform['Mcollege'].value_counts())
print(student_transform['Fcollege'].value_counts())
print(student_transform['travelGT30'].value_counts())
student_transform=student_transform.drop(['Medu','Fedu', 'traveltime'], axis = 1)

0.0    474
1.0    175
Name: Mcollege, dtype: int64
0.0    521
1.0    128
Name: Fcollege, dtype: int64
0.0    579
1.0     70
Name: travelGT30, dtype: int64


In [13]:
# Use one hot encoding to get dummy varriables for all categorical variables
categorical = x.select_dtypes(include='category')
cat_columns = categorical.columns
print(cat_columns)
x_encode = pd.get_dummies(x,columns = cat_columns)
x_encode.columns

Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic'],
      dtype='object')


Index(['age', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
       'absences', 'school_GP', 'school_MS', 'sex_F', 'sex_M', 'address_R',
       'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T',
       'Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1',
       'Fedu_2', 'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health',
       'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home',
       'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher',
       'reason_course', 'reason_home', 'reason_other', 'reason_reputation',
       'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime_1',
       'traveltime_2', 'traveltime_3', 'traveltime_4', 'studytime_1',
       'studytime_2', 'studytime_3', 'studytime_4', 'failures_0', 'failures_1',
       'failures_2', 'failures_3', 'schoolsup_no', 'schoolsup_yes',
       'famsup_no', 'famsup_yes', 'paid_no', 'paid_yes', 'activities_no',
       'activities_yes', 'nursery_no', 'nurser

In [14]:
imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imp.fit(x_encode)
transformed_X = imp.transform(x_encode)

#Need to label encode variables
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, transformed_X, y.values.ravel(), cv=5)



Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic'],
      dtype='object')


Unnamed: 0,age,famrel,freetime,goout,Dalc,Walc,health,absences,school_GP,school_MS,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
0,18,4,3,4,1,1,3,6,1,0,...,1,0,0,1,0,1,1,0,1,0
1,17,5,3,3,1,1,3,4,1,0,...,1,0,1,0,0,1,0,1,1,0
2,15,4,3,2,2,3,3,10,1,0,...,1,0,0,1,0,1,0,1,1,0
3,15,3,2,2,1,1,5,2,1,0,...,0,1,0,1,0,1,0,1,0,1
4,16,4,3,2,1,2,5,4,1,0,...,1,0,0,1,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,20,5,5,4,4,5,4,11,0,1,...,1,0,0,1,0,1,1,0,1,0
391,17,2,4,5,3,4,2,3,0,1,...,1,0,1,0,0,1,0,1,1,0
392,21,5,5,3,3,3,3,3,0,1,...,1,0,1,0,0,1,1,0,1,0
393,18,4,4,1,3,4,5,0,0,1,...,1,0,1,0,0,1,0,1,1,0


In [17]:
x.dtypes

school        category
sex           category
age               int8
address       category
famsize       category
Pstatus       category
Medu          category
Fedu          category
Mjob          category
Fjob          category
reason        category
guardian      category
traveltime    category
studytime     category
failures      category
schoolsup     category
famsup        category
paid          category
activities    category
nursery       category
higher        category
internet      category
romantic      category
famrel            int8
freetime          int8
goout             int8
Dalc              int8
Walc              int8
health            int8
absences          int8
dtype: object