In [26]:
#modules and functions needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler 
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import *
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier

# Data importing and cleaning

In [56]:
#merge two data into one
mat = pd.read_csv('student-mat.csv')
por = pd.read_csv('student-por.csv')

In [57]:
#Check for missing value
print(mat.isnull().values.any())
print(por.isnull().values.any())

False
False


In [58]:
#create dummy variable for categorical features, change G3 into binary categorical Pass/Fail
def data_reform(df):
    df = pd.get_dummies(df, drop_first=True)
    for i in range(df.shape[0]):
        if df.iloc[i, 15] < 10:
            df.iloc[i, 15] = "Fail"
        else:
            df.iloc[i, 15] = "Pass"
    return df

mat = data_reform(mat)
por = data_reform(por)

#print(mat.head())
#print(por.head())

In [93]:
#split math data into 70 30
ymat = mat.G3
Xmat = mat.drop(columns=['G1', 'G2', 'G3'])
print(sum(ymat=="Pass"), sum(ymat=="Fail"))
#Xmat_train, Xmat_test, ymat_train, ymat_test = train_test_split(Xmat,ymat,test_size=0.3, random_state=21)

265 130


In [71]:
#split por data into 70 30
Xpor = por.drop(columns=['G1', 'G2', 'G3'])
ypor = por.G3
Xpor_train, Xpor_test, ypor_train, ypor_test = train_test_split(Xpor,ypor,test_size=0.3, random_state=22)

In [130]:
def find_accuracy(model, fold, X, y):
    scores = cross_val_score(model, X, y, cv=fold)
    
    print("accuracy based on cross validation is: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    return

## Support Vectors Classifier

In [194]:
def SVC_Grid_Search(X, y, c_vals, gamma_vals, rs=22, testSize = 0.3):
    #create pipeline
    steps = [('scaler', StandardScaler()),
             ('SVM', SVC())]
    pipeline = Pipeline(steps)
    #create hyperparameter space for grid search
    parameters = {'SVM__C':c_vals,
                  'SVM__gamma':gamma_vals}
    
    #Split data into 70 30
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=testSize, random_state=rs)
    
    #Grid Search
    cv = GridSearchCV(pipeline, parameters, cv=10)
    cv.fit(X_train, y_train)
    y_pred = cv.predict(X_test)

    # Compute and print metrics
    print("Accuracy: {}".format(cv.score(X_test, y_test)))
    print(classification_report(y_test, y_pred))
    print("Tuned Model Parameters: {}".format(cv.best_params_))
    best = cv.best_estimator_
    best.fit(X,y)
    return best

### Math case

In [195]:
#preliminary C and gama
cm1 = [1, 10, 100]
gm1 = [1, 0.1, 0.01, 0.001]
svm1 = SVC_Grid_Search(Xmat, ymat, cm1, gm1)

Accuracy: 0.7310924369747899
             precision    recall  f1-score   support

       Fail       0.60      0.18      0.27        34
       Pass       0.74      0.95      0.84        85

avg / total       0.70      0.73      0.67       119

Tuned Model Parameters: {'SVM__C': 10, 'SVM__gamma': 0.001}


Look further into the hyper parameters c and gamma around given value.

In [196]:
#Closer look at C and gama
cm2 = [10,15,20]
gm2 = [0.0001,0.0008,0.0009,0.001, 0.0012, 0.0015]
svm2 = SVC_Grid_Search(Xmat, ymat, cm2, gm2)

Accuracy: 0.7310924369747899
             precision    recall  f1-score   support

       Fail       0.60      0.18      0.27        34
       Pass       0.74      0.95      0.84        85

avg / total       0.70      0.73      0.67       119

Tuned Model Parameters: {'SVM__C': 10, 'SVM__gamma': 0.0009}


In [197]:
find_accuracy(svm1, 5, Xmat, ymat)
find_accuracy(svm2, 5, Xmat, ymat)

accuracy based on cross validation is: 0.706 (+/- 0.063)
accuracy based on cross validation is: 0.709 (+/- 0.058)


### Portuguese case

In [198]:
#preliminary C and gama
cm1 = [1, 10, 100]
gm1 = [1, 0.1, 0.01, 0.001]
svm3 = SVC_Grid_Search(Xpor, ypor, cm1, gm1)

Accuracy: 0.8358974358974359
             precision    recall  f1-score   support

       Fail       0.39      0.25      0.30        28
       Pass       0.88      0.93      0.91       167

avg / total       0.81      0.84      0.82       195

Tuned Model Parameters: {'SVM__C': 10, 'SVM__gamma': 0.01}


Look further into the hyper parameters c and gamma around given value.

In [199]:
#Closer look at C and gama
cm3 = [4, 5,10,15,20]
gm3 = [0.007,0.008,0.009,0.01, 0.012, 0.015]
svm4 = SVC_Grid_Search(Xpor, ypor, cm3, gm3)

Accuracy: 0.841025641025641
             precision    recall  f1-score   support

       Fail       0.41      0.25      0.31        28
       Pass       0.88      0.94      0.91       167

avg / total       0.81      0.84      0.82       195

Tuned Model Parameters: {'SVM__C': 5, 'SVM__gamma': 0.01}


In [200]:
find_accuracy(svm3, 5, Xpor, ypor)
find_accuracy(svm4, 5, Xpor, ypor)

accuracy based on cross validation is: 0.809 (+/- 0.166)
accuracy based on cross validation is: 0.809 (+/- 0.155)


## Tree model

In [272]:
def Tree_Grid_Search(X, y, depth, feature, minleaf, rs=22, testSize = 0.3):
    #create parameter space
    param = {"max_depth": depth,
             "max_features": feature,
             "min_samples_leaf": minleaf}
    tree = DecisionTreeClassifier()
    
    #Split data into 70 30
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=testSize, random_state=rs)

    tree_cv = RandomizedSearchCV(tree, param, cv=5)
    tree_cv.fit(X_train,y_train)
    tree_pred = tree_cv.predict(X_test)


    # Compute and print metrics
    print("Accuracy: {}".format(tree_cv.score(X_test, y_test)))
    print(classification_report(y_test, tree_pred))
    print("Tuned Model Parameters: {}".format(tree_cv.best_params_))
    best = tree_cv.best_estimator_
    best.fit(X,y)
    return best

### Math case

In [273]:
dt1 = np.arange(1,42)
ft1 = np.arange(1,21)
mt1 = np.arange(1,21)
tree1 = Tree_Grid_Search(Xmat, ymat, dt1, ft1, mt1)
find_accuracy(tree1, 5, Xmat, ymat)

Accuracy: 0.6554621848739496
             precision    recall  f1-score   support

       Fail       0.37      0.29      0.33        34
       Pass       0.74      0.80      0.77        85

avg / total       0.63      0.66      0.64       119

Tuned Model Parameters: {'min_samples_leaf': 19, 'max_features': 20, 'max_depth': 41}
accuracy based on cross validation is: 0.656 (+/- 0.074)


### Portuguese case

In [274]:
tree2 = Tree_Grid_Search(Xpor, ypor, dt1, ft1, mt1)
find_accuracy(tree1, 5, Xpor, ypor)

Accuracy: 0.8205128205128205
             precision    recall  f1-score   support

       Fail       0.18      0.07      0.10        28
       Pass       0.86      0.95      0.90       167

avg / total       0.76      0.82      0.79       195

Tuned Model Parameters: {'min_samples_leaf': 16, 'max_features': 15, 'max_depth': 33}
accuracy based on cross validation is: 0.815 (+/- 0.124)


## Random Forest Model

In [283]:
def RF_Grid_Search(X, y, ne, depth, feature, ms, rs=22, testSize = 0.3):
    param_dist = {"n_estimators": ne,
                  "max_depth": depth,
                  "max_features": feature,
                  "min_samples_split": ms,
                  "bootstrap": [True, False]}

    rf = RandomForestClassifier()
    rf_cv = RandomizedSearchCV(rf, param_dist, cv=5)

    #Split data into 70 30
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=testSize, random_state=rs)
    
    rf_cv.fit(X_train,y_train)
    rf_pred = rf_cv.predict(X_test)

    print("Accuracy: {}".format(rf_cv.score(X_test, y_test)))
    print(classification_report(y_test, rf_pred))
    print("Tuned Model Parameters: {}".format(rf_cv.best_params_))
    best = rf_cv.best_estimator_
    best.fit(X,y)
    return best

### Math case

In [286]:
nr1 = [10, 100, 120, 150, 200]
dr1 = np.arange(1,42)
fr1 = np.arange(1,21)
mr1 = np.arange(2,21)
rf1 = RF_Grid_Search(Xmat, ymat, nr1, dr1, fr1, mr1)
find_accuracy(rf1, 5, Xmat, ymat)

Accuracy: 0.7226890756302521
             precision    recall  f1-score   support

       Fail       0.53      0.29      0.38        34
       Pass       0.76      0.89      0.82        85

avg / total       0.69      0.72      0.69       119

Tuned Model Parameters: {'n_estimators': 100, 'min_samples_split': 11, 'max_features': 20, 'max_depth': 39, 'bootstrap': True}
accuracy based on cross validation is: 0.711 (+/- 0.090)


In [287]:
nr2 = [10, 100, 200, 250, 300, 400]
rf2 = RF_Grid_Search(Xmat, ymat, nr2, dr1, fr1, mr1)
find_accuracy(rf2, 5, Xmat, ymat)

Accuracy: 0.7226890756302521
             precision    recall  f1-score   support

       Fail       0.53      0.29      0.38        34
       Pass       0.76      0.89      0.82        85

avg / total       0.69      0.72      0.69       119

Tuned Model Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'max_features': 20, 'max_depth': 30, 'bootstrap': True}
accuracy based on cross validation is: 0.701 (+/- 0.067)


### Portuguese case

In [288]:
rf3 = RF_Grid_Search(Xpor, ypor, nr1, dr1, fr1, mr1)
find_accuracy(rf3, 5, Xpor, ypor)

Accuracy: 0.841025641025641
             precision    recall  f1-score   support

       Fail       0.41      0.25      0.31        28
       Pass       0.88      0.94      0.91       167

avg / total       0.81      0.84      0.82       195

Tuned Model Parameters: {'n_estimators': 150, 'min_samples_split': 18, 'max_features': 17, 'max_depth': 20, 'bootstrap': True}
accuracy based on cross validation is: 0.812 (+/- 0.115)
