In [99]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score
math_data = pd.read_csv("student-mat.csv")

In [100]:
math_data.drop(['address', 'school', 'age', 'nursery', 'famsize', 'failures', 'Mjob', 'romantic', 'Fjob', 'reason', 'goout', 'guardian', 'famsup', 'Dalc', 'Walc', 'G2'], axis = 1, inplace=True)
#math_data.drop(['address', 'G2'], axis=1, inplace=True)

In [101]:
data_dum = pd.get_dummies(math_data)

In [102]:
data_dum.head()

Unnamed: 0,Medu,Fedu,traveltime,studytime,famrel,freetime,health,absences,G1,G3,...,schoolsup_no,schoolsup_yes,paid_no,paid_yes,activities_no,activities_yes,higher_no,higher_yes,internet_no,internet_yes
0,4,4,2,2,4,3,3,6,5,6,...,0,1,1,0,1,0,0,1,1,0
1,1,1,1,2,5,3,3,4,5,6,...,1,0,1,0,1,0,0,1,0,1
2,1,1,1,2,4,3,3,10,7,10,...,0,1,0,1,1,0,0,1,0,1
3,4,2,1,3,3,2,5,2,15,15,...,1,0,0,1,0,1,0,1,0,1
4,3,3,1,2,4,3,5,4,6,10,...,1,0,0,1,1,0,0,1,1,0


In [103]:
data_dum.drop(['Pstatus_A', 'schoolsup_no', 'paid_no', 'activities_no', 'internet_no', 'higher_no'], axis=1, inplace=True)

In [104]:
data_dum.head(10)

Unnamed: 0,Medu,Fedu,traveltime,studytime,famrel,freetime,health,absences,G1,G3,sex_F,sex_M,Pstatus_T,schoolsup_yes,paid_yes,activities_yes,higher_yes,internet_yes
0,4,4,2,2,4,3,3,6,5,6,1,0,0,1,0,0,1,0
1,1,1,1,2,5,3,3,4,5,6,1,0,1,0,0,0,1,1
2,1,1,1,2,4,3,3,10,7,10,1,0,1,1,1,0,1,1
3,4,2,1,3,3,2,5,2,15,15,1,0,1,0,1,1,1,1
4,3,3,1,2,4,3,5,4,6,10,1,0,1,0,1,0,1,0
5,4,3,1,2,5,4,5,10,15,15,0,1,1,0,1,1,1,1
6,2,2,1,2,4,4,3,0,12,11,0,1,1,0,0,0,1,1
7,4,4,2,2,4,1,1,6,6,6,1,0,0,1,0,0,1,0
8,3,2,1,2,4,2,1,0,16,19,0,1,0,0,1,0,1,1
9,3,4,1,2,5,5,5,0,14,15,0,1,1,0,1,1,1,1


In [105]:
data_dum.columns

Index(['Medu', 'Fedu', 'traveltime', 'studytime', 'famrel', 'freetime',
       'health', 'absences', 'G1', 'G3', 'sex_F', 'sex_M', 'Pstatus_T',
       'schoolsup_yes', 'paid_yes', 'activities_yes', 'higher_yes',
       'internet_yes'],
      dtype='object')

In [106]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data_dum, 
                        test_size=0.2, random_state=123)
print('Train size: ', len(train_set), 'Test size: ', len(test_set))

Train size:  316 Test size:  79


In [107]:
X = data_dum.drop('G3', axis=1)
y = data_dum.G3.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)


In [108]:
pipeline_linreg=Pipeline([('scalar1', StandardScaler()),('Linreg', LinearRegression())]) 

In [109]:
pipeline_lass=Pipeline([('scalar2', StandardScaler()),('Lasso', Lasso(alpha=.73))]) 

In [110]:
pipeline_ranfor=Pipeline([('scalar3', StandardScaler()),('ranfor', RandomForestRegressor(n_estimators=100))]) 

In [111]:
pipeline_nn=Pipeline([('scalar4', StandardScaler()), ('NN', MLPRegressor())])

In [112]:
pipelines = [pipeline_linreg, pipeline_lass, pipeline_ranfor, pipeline_nn]

In [113]:
pip_dict = {0: 'Linear Regression', 1: 'Lasso Regression', 2: 'Random Forest Regressor', 3: 'NN'}

In [114]:
for pipe in pipelines:
    pipe.fit(X_train, y_train)



In [115]:
for i,model in enumerate(pipelines):
    print("{} train accuracy {}".format(pip_dict[i], model.score(X_train,y_train)), ",{} test accuracy {}".format(pip_dict[i], model.score(X_test,y_test)))
    


Linear Regression train accuracy 0.6562681262265416 ,Linear Regression test accuracy 0.7084566481016079
Lasso Regression train accuracy 0.5933856610897665 ,Lasso Regression test accuracy 0.6739290589440456
Random Forest Regressor train accuracy 0.9556319642287939 ,Random Forest Regressor test accuracy 0.8319284301939098
NN train accuracy 0.6634451179292444 ,NN test accuracy 0.652790198025547


In [116]:
np.mean(cross_val_score(pipeline_linreg,X_train,y_train,scoring  = 'neg_mean_absolute_error'))

-2.060425250031434

In [118]:
np.mean(cross_val_score(pipeline_lass,X_train,y_train,scoring  = 'neg_mean_absolute_error'))

-1.9300015925604086

In [121]:
np.mean(cross_val_score(pipeline_ranfor,X_train,y_train,scoring  = 'neg_mean_absolute_error'))

-1.889109623015873

In [123]:
np.mean(cross_val_score(pipeline_nn,X_train,y_train,scoring  = 'neg_mean_absolute_error'))



-2.348797243913731

In [124]:
y_pred = pipeline_linreg.predict(X_train)
print('R^2 ', r2_score(y_train,y_pred))
print('MAE is ', mean_absolute_error(y_train, y_pred))


R^2  0.6562681262265416
MAE is  1.9248930907105402


In [125]:
y_pred = pipeline_lass.predict(X_train)
print('R^2 ', r2_score(y_train,y_pred))
print('MAE is ', mean_absolute_error(y_train, y_pred))

R^2  0.5933856610897665
MAE is  1.9131764469432753


In [126]:
y_pred = pipeline_ranfor.predict(X_train)
print('R^2 ', r2_score(y_train,y_pred))
print('MAE is ', mean_absolute_error(y_train, y_pred))

R^2  0.9556319642287939
MAE is  0.6945886075949368


In [127]:
y_pred = pipeline_nn.predict(X_train)
print('R^2 ', r2_score(y_train,y_pred))
print('MAE is ', mean_absolute_error(y_train, y_pred))

R^2  0.6634451179292444
MAE is  1.9173958742213506


In [128]:
y_test_pred = pipeline_linreg.predict(X_test)
print('R^2 ', r2_score(y_test,y_test_pred))
print('MAE is ', mean_absolute_error(y_test, y_test_pred))

R^2  0.7084566481016079
MAE is  2.046475419552742


In [129]:
y_test_pred = pipeline_lass.predict(X_test)
print('R^2 ', r2_score(y_test,y_test_pred))
print('MAE is ', mean_absolute_error(y_test, y_test_pred))

R^2  0.6739290589440456
MAE is  1.9559023043244776


In [130]:
y_test_pred = pipeline_ranfor.predict(X_test)
print('R^2 ', r2_score(y_test,y_test_pred))
print('MAE is ', mean_absolute_error(y_test, y_test_pred))

R^2  0.8319284301939098
MAE is  1.4359493670886074


In [131]:
y_test_pred = pipeline_nn.predict(X_test)
print('R^2 ', r2_score(y_test,y_test_pred))
print('MAE is ', mean_absolute_error(y_test, y_test_pred))

R^2  0.652790198025547
MAE is  2.2050079089127803
