In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split, GridSearchCV
%matplotlib inline

In [23]:
# read in data
df = pd.read_csv("../data/cleaned_data/cleaned.csv")
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,1,18,1,1,0,4,4,4,0,...,4,3,4,1,1,3,6,5,6,6
1,0,1,17,1,1,1,1,1,4,1,...,5,3,3,1,1,3,4,5,5,6
2,0,1,15,1,0,1,1,1,4,1,...,4,3,2,2,3,3,10,7,8,10
3,0,1,15,1,1,1,4,2,3,2,...,3,2,2,1,1,5,2,15,14,15
4,0,1,16,1,1,1,3,3,1,1,...,4,3,2,1,2,5,4,6,10,10


In [24]:
# set up training, validation, and test sets
X = df.iloc[:,:32]
Y = df.iloc[:,32]

In [25]:
Xtrain, Xtest, Ytrain, ytest = train_test_split(X, Y, test_size = 0.26, shuffle = True, random_state = 13)

In [26]:
xtrain, xval, ytrain, yval = train_test_split(Xtrain, Ytrain, test_size = 0.5, shuffle = True, random_state = 13)

In [41]:
# fit unregularized linear regression model
lm = LinearRegression()
lm.fit(xtrain, ytrain)

## create MSE function
mean_squared_err = lambda y, yhat: np.mean((y-yhat)**2)

print("===Performance on training set===")
print("Training error:", mean_squared_err(ytrain, lm.predict(xtrain)))
print("R-squared:", round(lm.score(xtrain, ytrain), 3))
print()
print("===Performance on validation set===")
print("Validation error:", mean_squared_err(yval, lm.predict(xval)))
print("R-squared:", round(lm.score(xval, yval), 3))

===Performance on training set===
Training error: 2.481157998795698
R-squared: 0.895

===Performance on validation set===
Validation error: 5.572302870116987
R-squared: 0.781


In [34]:
# fit L-1 regularized (Lasso) linear regression model
## apply cross validation to choose best alpha
parameters = {'alpha':(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20)}
l1_lm = Lasso()
cv = GridSearchCV(l1_lm, parameters)
cv.fit(xtrain, ytrain)
print("Best alpha is:", cv.best_params_)

Best alpha is: {'alpha': 1}


In [40]:
l1_best = Lasso(alpha=1)
l1_best.fit(xtrain, ytrain)

print("===Performance on training set===")
print("Training error:", mean_squared_err(ytrain, l1_best.predict(xtrain)))
print("R-squared:", round(l1_best.score(xtrain, ytrain), 3))
print()
print("===Performance on validation set===")
print("Validation error:", mean_squared_err(yval, l1_best.predict(xval)))
print("R-squared:", round(l1_best.score(xval, yval), 3))

===Performance on training set===
Training error: 3.692358862991513
R-squared: 0.844

===Performance on validation set===
Validation error: 4.666219463984615
R-squared: 0.816


In [42]:
# fit L-2 regularized (Ridge) linear regression model
## apply cross validation to choose best alpha
l2 = Ridge()
cv = GridSearchCV(l2, parameters)
cv.fit(xtrain, ytrain)
print("Best alpha is:", cv.best_params_)

Best alpha is: {'alpha': 20}


In [43]:
l2_best = Ridge(alpha=20)
l2_best.fit(xtrain, ytrain)

print("===Performance on training set===")
print("Training error:", mean_squared_err(ytrain, l2_best.predict(xtrain)))
print("R-squared:", round(l2_best.score(xtrain, ytrain), 3))
print()
print("===Performance on validation set===")
print("Validation error:", mean_squared_err(yval, l2_best.predict(xval)))
print("R-squared:", round(l2_best.score(xval, yval), 3))

===Performance on training set===
Training error: 2.586508130199553
R-squared: 0.891

===Performance on validation set===
Validation error: 5.066755288157446
R-squared: 0.8
