In [13]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [77]:
# define category ordering for ordinal variables
cat_Medu = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_Fedu = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_traveltime = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_studytime = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_failures = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)

# define variable types
stu_dtypes = {
    'school' : 'category',
    'sex' : 'category',
    'age' : 'int8',
    'address' : 'category',
    'famsize' : 'category',
    'Pstatus' : 'category',
    'Medu' : 'category',
    'Fedu' : 'category',
    'Mjob' : 'category',
    'Fjob' : 'category',
    'reason' : 'category',
    'guardian' : 'category',
    'traveltime' : 'category',
    'studytime' : 'category',
    'failures' : 'category',
    'schoolsup' : 'category',
    'famsup' : 'category',
    'paid' : 'category',
    'activities' : 'category',
    'nursery' : 'category',
    'higher' : 'category',
    'internet' : 'category',
    'romantic' : 'category',
    'famrel' : 'int8',
    'freetime' : 'int8',
    'goout' : 'int8',
    'Dalc' : 'int8',
    'Walc' : 'int8',
    'health' : 'int8',
    'absences' : 'int8',
    'G1' : 'int8',
    'G2' : 'int8',
    'G3' : 'int8'
}

# read in student math performance data set
student = pd.read_csv('data/student/student-mat.csv', sep = ';', dtype = stu_dtypes)
# use G3 as target variable - drop G1 and G2
student = student.drop(['G1', 'G2'], axis = 1)

In [3]:
student.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [8]:
student.describe()

Unnamed: 0,age,famrel,freetime,goout,Dalc,Walc,health,absences,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.41519
std,1.276043,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,4.581443
min,15.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,16.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0
50%,17.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0
75%,18.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,14.0
max,22.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,20.0


In [16]:
student.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,10


In [67]:
categorical = x.select_dtypes(include='category')
cat_columns = categorical.columns
for col in cat_columns:
    print(student[col].value_counts())

GP    349
MS     46
Name: school, dtype: int64
F    208
M    187
Name: sex, dtype: int64
U    307
R     88
Name: address, dtype: int64
GT3    281
LE3    114
Name: famsize, dtype: int64
T    354
A     41
Name: Pstatus, dtype: int64
4    131
2    103
3     99
1     59
0      3
Name: Medu, dtype: int64
2    115
3    100
4     96
1     82
0      2
Name: Fedu, dtype: int64
other       141
services    103
at_home      59
teacher      58
health       34
Name: Mjob, dtype: int64
other       217
services    111
teacher      29
at_home      20
health       18
Name: Fjob, dtype: int64
course        145
home          109
reputation    105
other          36
Name: reason, dtype: int64
mother    273
father     90
other      32
Name: guardian, dtype: int64
1    257
2    107
3     23
4      8
Name: traveltime, dtype: int64
2    198
1    105
3     65
4     27
Name: studytime, dtype: int64
0    312
1     50
2     17
3     16
Name: failures, dtype: int64
no     344
yes     51
Name: schoolsup, dtype: int64

In [106]:
student_transform = student.copy(deep = True)


In [110]:
student_transform.loc[student_transform['Medu'].isin(['0','1','2','3']), 'Mcollege'] = 0
student_transform.loc[student_transform['Medu'] == '4', 'Mcollege'] = 1
student_transform.loc[student_transform['Fedu'].isin(['0','1','2','3']), 'Fcollege'] = 0
student_transform.loc[student_transform['Fedu'] == '4', 'Fcollege'] = 1
student_transform.loc[student_transform['traveltime'].isin(['0','1','2']), 'travelGT30'] = 0
student_transform.loc[student_transform['traveltime'].isin(['4','3']), 'travelGT30'] = 1
print(student_transform['Mcollege'].value_counts())
print(student_transform['Fcollege'].value_counts())
print(student_transform['travelGT30'].value_counts())
student_transform=student_transform.drop(['Medu','Fedu', 'traveltime'], axis = 1)

0.0    264
1.0    131
Name: Mcollege, dtype: int64
0.0    299
1.0     96
Name: Fcollege, dtype: int64
0.0    364
1.0     31
Name: travelGT30, dtype: int64


In [35]:
num_vars = student[['age', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G3']]
sns.pairplot(num_vars)

NameError: name 'look_vars' is not defined

In [111]:
#Create initial x and y datasets of interest
x = student_transform.drop(['G3'], axis = 1)
y = student_transform[['G3']]

# Use one hot encoding to get dummy varriables for all categorical variables
categorical = x.select_dtypes(include='category')
cat_columns = categorical.columns
print(cat_columns)
x_encode = pd.get_dummies(x,columns = cat_columns)
x_encode.columns

Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
       'reason', 'guardian', 'studytime', 'failures', 'schoolsup', 'famsup',
       'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic'],
      dtype='object')


Index(['age', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
       'absences', 'Fcollege', 'Mcollege', 'travelGT30', 'school_GP',
       'school_MS', 'sex_F', 'sex_M', 'address_R', 'address_U', 'famsize_GT3',
       'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Mjob_at_home', 'Mjob_health',
       'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home',
       'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher',
       'reason_course', 'reason_home', 'reason_other', 'reason_reputation',
       'guardian_father', 'guardian_mother', 'guardian_other', 'studytime_1',
       'studytime_2', 'studytime_3', 'studytime_4', 'failures_0', 'failures_1',
       'failures_2', 'failures_3', 'schoolsup_no', 'schoolsup_yes',
       'famsup_no', 'famsup_yes', 'paid_no', 'paid_yes', 'activities_no',
       'activities_yes', 'nursery_no', 'nursery_yes', 'higher_no',
       'higher_yes', 'internet_no', 'internet_yes', 'romantic_no',
       'romantic_yes'],
      dtype='object')

In [113]:
imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imp.fit(x_encode)
transformed_X = imp.transform(x_encode)

#Need to label encode variables
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, transformed_X, y.values.ravel(), cv=5)



Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic'],
      dtype='object')


Unnamed: 0,age,famrel,freetime,goout,Dalc,Walc,health,absences,school_GP,school_MS,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
0,18,4,3,4,1,1,3,6,1,0,...,1,0,0,1,0,1,1,0,1,0
1,17,5,3,3,1,1,3,4,1,0,...,1,0,1,0,0,1,0,1,1,0
2,15,4,3,2,2,3,3,10,1,0,...,1,0,0,1,0,1,0,1,1,0
3,15,3,2,2,1,1,5,2,1,0,...,0,1,0,1,0,1,0,1,0,1
4,16,4,3,2,1,2,5,4,1,0,...,1,0,0,1,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,20,5,5,4,4,5,4,11,0,1,...,1,0,0,1,0,1,1,0,1,0
391,17,2,4,5,3,4,2,3,0,1,...,1,0,1,0,0,1,0,1,1,0
392,21,5,5,3,3,3,3,3,0,1,...,1,0,1,0,0,1,1,0,1,0
393,18,4,4,1,3,4,5,0,0,1,...,1,0,1,0,0,1,0,1,1,0


In [17]:
x.dtypes

school        category
sex           category
age               int8
address       category
famsize       category
Pstatus       category
Medu          category
Fedu          category
Mjob          category
Fjob          category
reason        category
guardian      category
traveltime    category
studytime     category
failures      category
schoolsup     category
famsup        category
paid          category
activities    category
nursery       category
higher        category
internet      category
romantic      category
famrel            int8
freetime          int8
goout             int8
Dalc              int8
Walc              int8
health            int8
absences          int8
dtype: object