In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import preprocessing
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Load test scores csv
ts = pd.read_csv('test_scores.csv')

In [2]:
ts.describe()

Unnamed: 0,n_student,pretest,posttest
count,2133.0,2133.0,2133.0
mean,22.796531,54.955931,67.102203
std,4.228893,13.563101,13.986789
min,14.0,22.0,32.0
25%,20.0,44.0,56.0
50%,22.0,56.0,68.0
75%,27.0,65.0,77.0
max,31.0,93.0,100.0


In [3]:
## Pre-Processing Data

# replace school_setting values to Integers
ts['school_setting'] = ts['school_setting'].map({'Urban': 1, 'Suburban': 2, 'Rural': 3})

# replace school_type values to Integers
ts['school_type'] = ts['school_type'].map({'Public': 1, 'Non-public': 0})

# replace teaching_method values to Integers
ts['teaching_method'] = ts['teaching_method'].map({'Standard': 1, 'Experimental': 0})

# replace gender values to Integers
ts['gender'] = ts['gender'].map({'Male': 1, 'Female': 0})

# replace lunch values to Integers
ts['lunch'] = ts['lunch'].map({'Does not qualify': 1, 'Qualifies for reduced/free lunch': 0})
ts

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,student_id,gender,lunch,n_student,pretest,posttest
0,ANKYI,1,0,6OL,1,2FHT3,0,1,20,62,72
1,ANKYI,1,0,6OL,1,3JIVH,0,1,20,66,79
2,ANKYI,1,0,6OL,1,3XOWE,1,1,20,64,76
3,ANKYI,1,0,6OL,1,556O0,0,1,20,61,77
4,ANKYI,1,0,6OL,1,74LOE,1,1,20,64,76
...,...,...,...,...,...,...,...,...,...,...,...
2128,ZOWMK,1,1,ZBH,1,T8LSK,0,1,30,39,55
2129,ZOWMK,1,1,ZBH,1,VNP26,0,0,30,38,46
2130,ZOWMK,1,1,ZBH,1,YDR1Z,0,0,30,45,51
2131,ZOWMK,1,1,ZBH,1,YUEIH,1,0,30,46,53


In [49]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def evaluate(y_act,y_pred):
    MAE = mean_absolute_error(y_act, y_pred)
    MSE = mean_squared_error(y_act, y_pred)
    r2 = r2_score(y_act, y_pred)
    return {'MAE': MAE, 'MSE' : MSE, 'r2' : r2}

 <h3>Train test split<h3>

In [50]:
from sklearn.model_selection import train_test_split

X = ts.drop(columns=['pretest', 'school', 'classroom', 'student_id'])
y = ts['pretest']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=17)

X_train

Unnamed: 0,school_setting,school_type,teaching_method,gender,lunch,n_student,posttest
651,3,1,0,1,0,23,66
2044,1,1,0,1,0,30,53
154,1,0,1,1,1,19,68
1649,2,0,1,0,1,20,76
2127,1,1,1,1,0,30,50
...,...,...,...,...,...,...,...
278,1,1,1,0,1,27,64
1965,2,1,0,1,0,24,81
1337,1,1,0,1,0,25,78
406,2,1,1,0,0,20,69


In [51]:
scores = []

In [52]:
from sklearn.preprocessing import StandardScaler

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [53]:
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV()
ridge_cv.fit(X_train_scaled, y_train)
ridge_cv_predict = ridge_cv.predict(X_valid_scaled)

# Evaluate Ridge 
evaluation = evaluate(y_valid, ridge_cv_predict)
evaluation['name'] = "Ridge"
scores.append(evaluation)

In [54]:
from sklearn.tree import DecisionTreeRegressor

DT = DecisionTreeRegressor()
DT.fit(X_train_scaled, y_train)
DT_pred = DT.predict(X_valid_scaled)

evaluation = evaluate(y_valid, DT_pred)
evaluation['name'] = "DecisionTreeRegressor"

scores.append(evaluation)
type(evaluation)

dict

In [55]:
from sklearn.ensemble import RandomForestRegressor

RF = RandomForestRegressor()
RF.fit(X_train_scaled, y_train)
RF_pred = RF.predict(X_valid_scaled)

evaluation = evaluate(y_valid, RF_pred)
evaluation['name'] = "RandomForestRegressor"
scores.append(evaluation)

In [56]:
from sklearn.svm import SVR

SVR = SVR()
SVR.fit(X_train_scaled, y_train)
SVR_pred = SVR.predict(X_valid_scaled)

evaluation = evaluate(y_valid, SVR_pred)
evaluation['name'] = "SVR"
scores.append(evaluation)

In [57]:
scores_df = pd.DataFrame(scores,columns=['name','MAE','MSE','r2'])
scores_df.sort_values('r2',ascending=False)

Unnamed: 0,name,MAE,MSE,r2
0,Ridge,2.591766,10.46689,0.942987
2,RandomForestRegressor,2.731153,11.884882,0.935263
3,SVR,2.791593,12.674642,0.930962
1,DecisionTreeRegressor,3.45929,18.771844,0.89775
