# Classification and Regression Trees (CART)
Classifying student success data by means of the [Decision Tree Classifier](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) from the sklearn module.

## Import Data
Import the data into a pandas dataframe. Get dummy variables for each categorical predictor in the data set and return the design matrix. Create a normalized and a standardized design matrix as well to compare model performance. Convert the response variable to three classes: *0*, *1*, and *2*.

In [None]:
import time
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.extmath import cartesian
from sklearn import metrics
from sklearn import preprocessing

# Read the raw CSV, then one-hot encode its categorical columns.
# Every dummy level is kept (no reference level is dropped).
raw_df = pd.read_csv('student-por2.csv')
df = pd.get_dummies(raw_df)

def response_conv(arr):
    """Convert each grade in ``arr`` to a class label.

    Labels:
        0 -- student failed (grade strictly between 0 and 10)
        1 -- student passed (grade of 10 or more)
        2 -- student received an incomplete (any other value, e.g. 0)

    Returns a new 1-dimensional list of labels in the same order as ``arr``.
    """
    def label_for(grade):
        if grade >= 10:
            return 1
        if grade > 0:
            return 0
        return 2

    return [label_for(grade) for grade in arr]

# Design matrix: every column except the final grade G3.
# `axis` is given by keyword because pandas >= 2.0 no longer accepts it
# positionally (df.drop('G3', 1) raises TypeError there).
X = df.drop('G3', axis=1)
y = list(df.G3)                           # This is the discrete response vector
y_new = response_conv(y)                  # This is the multinomial response vector

# Fit a tree on the full design matrix and let SelectFromModel keep only the
# columns it judges important from the fitted tree.
# NOTE(review): the selector is fit on the raw grades `y`, not on `y_new`,
# and on all rows -- confirm this is intended.
clf = DecisionTreeClassifier()
clf.fit(X, y)

model = SelectFromModel(clf, prefit=True)
newX = model.transform(X)

# Two rescaled copies of the selected features, used to compare model performance.
X_scale = preprocessing.scale(newX)       # standardized design matrix
X_norm = preprocessing.normalize(newX)    # normalized design matrix



## Optimal Parameters for Decision Tree Classifier Algorithm
We choose the combination of parameters that minimizes the cross-validated log loss (scored with scikit-learn's `neg_log_loss`, for which higher values are better). Return the optimal parameters and the total run time of the cross-validation process.

In [None]:
# Seed both generators: scikit-learn estimators created without `random_state`
# draw from NumPy's global RNG, which `random.seed` alone does not touch.
random.seed(1)
np.random.seed(1)

# Identical split (same random_state) for the raw, standardized and normalized matrices.
X1_train, X1_test, y1_train, y1_test = train_test_split(newX, y_new, test_size=0.33, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_scale, y_new, test_size=0.33, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X_norm, y_new, test_size=0.33, random_state=42)

start_time = time.time()

# Hyper-parameter grid: criterion x splitter x max_features x max_depth.
# 'sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features), and
# the 'auto' alias was removed in scikit-learn 1.3.
combos = cartesian([['gini','entropy'],['best','random'],['sqrt','log2'],np.arange(1,(X1_train.shape[0]-1))])

def opt(X,y):
    """Return the row of ``combos`` with the best cross-validated log loss.

    Every row of the module-level ``combos`` grid is unpacked as
    (criterion, splitter, max_features, max_depth), used to build a
    DecisionTreeClassifier, and scored with 10-fold cross-validation using
    the ``neg_log_loss`` scorer.

    Parameters:
        X: training design matrix.
        y: training response vector.

    Returns:
        The first row of ``combos`` that attains the highest mean score.

    Raises:
        ValueError: if ``combos`` contains no rows.
    """
    best_score = None
    best_combo = None

    for combo in combos:
        c, s, mf, md = combo
        dt = DecisionTreeClassifier(criterion=c, splitter=s, max_features=mf, max_depth=int(md))
        mean_score = cross_val_score(dt, X, y, cv=10, scoring='neg_log_loss').mean()

        # neg_log_loss is the *negated* log loss, so a larger value means a
        # smaller loss: keep the maximum, not the minimum. The strict '>'
        # keeps the first combination on ties.
        if best_score is None or mean_score > best_score:
            best_score = mean_score
            best_combo = combo

    if best_combo is None:
        raise ValueError("combos is empty: no parameter combinations to evaluate")
    return best_combo

# Tune each design matrix independently; the unpacked names are reused when
# the final models are fit.
c1, s1, mf1, md1 = opt(X1_train, y1_train)
c2, s2, mf2, md2 = opt(X2_train, y2_train)
c3, s3, mf3, md3 = opt(X3_train, y3_train)

# One (message template, tuned parameters) pair per design matrix.
tuning_reports = (
    ("The optimal criterion, splitter, max_features and max_depth are %s, %s, %s, and %r respectively for Non-standardized design matrix.",
     (c1, s1, mf1, md1)),
    ("The optimal criterion, splitter, max_features and max_depth are %s, %s, %s, and %r respectively for Standardized design matrix.",
     (c2, s2, mf2, md2)),
    ("The optimal criterion, splitter, max_features and max_depth are %s, %s, %s, and %r respectively for Normalized design matrix.",
     (c3, s3, mf3, md3)),
)
for template, (crit, split, feats, depth) in tuning_reports:
    print(template % (str(crit), str(split), str(feats), int(depth)))

# Elapsed time is truncated to whole seconds before converting to minutes.
elapsed_whole_seconds = int(time.time() - start_time)
print("Run time: %r minutes" % (elapsed_whole_seconds / 60))

## Fit and Predict
After tuning the model parameters, we fit the optimal model for each design matrix. Predictions are made and returned in a data frame for comparison.

In [None]:
# Build one tree per design matrix from its tuned parameters, then fit it on
# the matching training split (fit() returns the estimator itself).
dt1 = DecisionTreeClassifier(criterion=c1, splitter=s1, max_features=mf1, max_depth=int(md1))
dt1 = dt1.fit(X1_train, y1_train)
dt2 = DecisionTreeClassifier(criterion=c2, splitter=s2, max_features=mf2, max_depth=int(md2))
dt2 = dt2.fit(X2_train, y2_train)
dt3 = DecisionTreeClassifier(criterion=c3, splitter=s3, max_features=mf3, max_depth=int(md3))
dt3 = dt3.fit(X3_train, y3_train)

# Predict on the held-out split of each design matrix.
dt_pred1 = dt1.predict(X1_test)
dt_pred2 = dt2.predict(X2_test)
dt_pred3 = dt3.predict(X3_test)

# Side-by-side table: actual class followed by each model's prediction.
pred_columns = ['y_act','y_dt','y_dt_stan','y_dt_norm']
pred_rows = list(zip(y1_test, dt_pred1, dt_pred2, dt_pred3))
pred = pd.DataFrame(pred_rows, columns=pred_columns)
pred.index.name = 'Obs'

# Uncomment the next line to save the predictions to a csv file and view the
# full data frame in excel.
#pred.to_csv("preds.csv")
pred

## Results
Accuracy, a confusion matrix, and a classification report are returned for each design matrix.

In [None]:
# Class codes produced by response_conv and their display names.
class_labels = [0, 1, 2]
class_names = ['Fail(0)','Pass(1)','Inc(2)']

# Pin `labels` so every confusion matrix is 3x3 and lines up with
# `class_names`, even when a class is absent from both the test responses and
# the predictions (without it the matrix shrinks and the DataFrame
# construction fails on a shape mismatch).
cm_dt1 = pd.DataFrame(metrics.confusion_matrix(y1_test, dt_pred1, labels=class_labels), index=class_names, columns=class_names)
cm_dt2 = pd.DataFrame(metrics.confusion_matrix(y2_test, dt_pred2, labels=class_labels), index=class_names, columns=class_names)
cm_dt3 = pd.DataFrame(metrics.confusion_matrix(y3_test, dt_pred3, labels=class_labels), index=class_names, columns=class_names)

# Class proportions in the training response; anything that is neither 0 nor 1
# is counted as class 2.
n_train = len(y1_train)
zero = sum(1 for label in y1_train if label == 0)
one = sum(1 for label in y1_train if label == 1)
two = n_train - zero - one
num1 = round(zero/n_train,2)
num2 = round(one/n_train,2)
num3 = round(two/n_train,2)
print("The response vector has the following distribution: \nzeros: %r \nones: %r \ntwos: %r" % (num1,num2,num3))
print("\n")

# Test-set accuracy of each fitted model.
print ("The accuracy of the Non-standardized Decision Tree model is: ", dt1.score(X1_test,y1_test))
print ("\n")
print ("The accuracy of the Standardized Decision Tree model is: ", dt2.score(X2_test,y2_test))
print ("\n")
print ("The accuracy of the Normalized Decision Tree model is: ", dt3.score(X3_test,y3_test))
print ("\n")

print("Non-standardized Decision Tree Confusion Matrix: \n", cm_dt1)
print ("\n")
print("Standardized Decision Tree Confusion Matrix: \n", cm_dt2)
print ("\n")
print("Normalized Decision Tree Confusion Matrix: \n", cm_dt3)
print ("\n")

print("Classification report for Non-standardized design matrix:\n", metrics.classification_report(y1_test,dt_pred1))
print("\n")
print("Classification report for standardized design matrix:\n", metrics.classification_report(y2_test,dt_pred2))
print("\n")
print("Classification report for Normalized design matrix:\n", metrics.classification_report(y3_test,dt_pred3))