In [1]:
#modules and functions needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler 
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import *
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier

# Data importing and cleaning

In [2]:
#merge two data into one
mat = pd.read_csv('student-mat.csv')
por = pd.read_csv('student-por.csv')
alco = pd.concat([mat, por])
alco.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [7]:
#Check for missing value
alco.isnull().values.any()

False

In [3]:
#create dummy variable for categorical features
alcoNew = pd.get_dummies(alco, drop_first=True)

#combine G1-G3 into a new feature called "Grade"
alco['Grade'] = alco.G1 + alco.G2 + alco.G3
alcoNew = alcoNew.drop(columns=['G1', 'G2', 'G3'])

alcoNew.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,1,0,1,0,0,0,1,1,0,0
1,17,1,1,1,2,0,5,3,3,1,...,0,0,0,1,0,0,0,1,1,0
2,15,1,1,1,2,3,4,3,2,2,...,1,0,1,0,1,0,1,1,1,0
3,15,4,2,1,3,0,3,2,2,1,...,1,0,0,1,1,1,1,1,1,1
4,16,3,3,1,2,0,4,3,2,1,...,0,0,0,1,1,0,1,1,0,0


In [4]:
#Reduce the level number of dependent variable Walc, instead of 1 through 5, use 1, 3, 5
for i in range( alcoNew.shape[0]):
    if alcoNew.iloc[i, 10] == 2:
        alcoNew.iloc[i, 10] = 1
    elif alcoNew.iloc[i, 10] == 4:
        alcoNew.iloc[i, 10] = 5

In [5]:
alcoNew.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
count,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,...,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0
mean,16.726054,2.603448,2.387931,1.522989,1.970307,0.264368,3.935824,3.201149,3.15613,1.494253,...,0.697318,0.069923,0.113985,0.613027,0.210728,0.494253,0.799808,0.914751,0.792146,0.355364
std,1.239975,1.124907,1.099938,0.731727,0.834353,0.656142,0.933401,1.031507,1.152575,0.911714,...,0.459639,0.25514,0.317945,0.487291,0.408021,0.500207,0.400335,0.279386,0.405967,0.478853
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
#split data into 70 30
X = alcoNew.drop(columns=['Dalc', 'Walc'])
y = alcoNew.Walc
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=21)

## Support Vectors Classifier

In [10]:
#create pipeline
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]
pipeline = Pipeline(steps)

#create hyperparameter space for grid search
parameters = {'SVM__C':[1, 10, 20, 50, 100],
              'SVM__gamma':[0.1, 0.01, 0.001]}

#tune, fit and predict
cv = GridSearchCV(pipeline, parameters, cv=5)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))

Accuracy: 0.7898089171974523
             precision    recall  f1-score   support

          1       0.75      0.98      0.85       188
          3       0.92      0.54      0.68        61
          5       0.97      0.46      0.62        65

avg / total       0.83      0.79      0.77       314

Tuned Model Parameters: {'SVM__C': 10, 'SVM__gamma': 0.1}


## Tree model

In [15]:
#create parameter space
param = {"max_depth": randint(1,36),
         "max_features": np.arange(1,18),
         "min_samples_leaf": np.arange(1,18),
         "criterion": ["gini", "entropy"]}
tree = DecisionTreeClassifier()

tree_cv = RandomizedSearchCV(tree, param, cv=5)
tree_cv.fit(X_train,y_train)
tree_pred = tree_cv.predict(X_test)


# Compute and print metrics
print("Accuracy: {}".format(tree_cv.score(X_test, y_test)))
print(classification_report(y_test, tree_pred))
print("Tuned Model Parameters: {}".format(tree_cv.best_params_))

Accuracy: 0.6369426751592356
             precision    recall  f1-score   support

          1       0.73      0.74      0.73       188
          3       0.35      0.36      0.35        61
          5       0.66      0.58      0.62        65

avg / total       0.64      0.64      0.64       314

Tuned Model Parameters: {'criterion': 'entropy', 'max_depth': 25, 'max_features': 17, 'min_samples_leaf': 7}


## Random Forest Model

In [18]:

param_dist = {"n_estimators": np.arange(5,15),
              "max_depth": np.arange(1,36),
              "max_features": np.arange(1,18),
              "min_samples_split": np.arange(2,10),
              "criterion": ["gini", "entropy"]}

rf = RandomForestClassifier()
rf_cv = RandomizedSearchCV(rf, param_dist, cv=5)

rf_cv.fit(X_train,y_train)
rf_pred = rf_cv.predict(X_test)

print("Accuracy: {}".format(rf_cv.score(X_test, y_test)))
print(classification_report(y_test, rf_pred))
print("Tuned Model Parameters: {}".format(rf_cv.best_params_))

Accuracy: 0.7993630573248408
             precision    recall  f1-score   support

          1       0.80      0.95      0.87       188
          3       0.77      0.44      0.56        61
          5       0.82      0.71      0.76        65

avg / total       0.80      0.80      0.79       314

Tuned Model Parameters: {'n_estimators': 14, 'min_samples_split': 2, 'max_features': 17, 'max_depth': 16, 'criterion': 'gini'}


## Use Cross Validation for Models

In [20]:
#get models
svc_best = cv.best_estimator_
tree_best = tree_cv.best_estimator_
rf_best = rf_cv.best_estimator_

In [21]:
#train models with the whole dataset
svc_best.fit(X,y)
tree_best.fit(X,y)
rf_best.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features=17, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=14, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [39]:
def find_accuracy(model, fold):
    scores = cross_val_score(model, X, y, cv=fold)
    print("accuracy based on cross validation is: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    return

In [40]:
print("For SVC model,")
find_accuracy(svc_best, 5)

For SVC model,
accuracy based on cross validation is: 0.870 (+/- 0.194)


In [41]:
print("For tree model,")
find_accuracy(tree_best, 5)

For tree model,
accuracy based on cross validation is: 0.659 (+/- 0.083)


In [42]:
print("For random forest model,")
find_accuracy(tree_best, 5)

For random forest model,
accuracy based on cross validation is: 0.652 (+/- 0.128)
