In [0]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error

In [3]:
"""
FOR COLAB USERS ONLY

Run the following code to upload and unzip the data into the Colab environment.

Please comment out *everything* in this cell (including the import) when submitting your .py file.
"""
from google.colab import files
uploaded = files.upload()

Saving one_hot.csv to one_hot.csv


In [43]:
import io
df = pd.read_csv(io.BytesIO(uploaded['one_hot.csv']))
data = df.as_matrix()

  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
newY = df["Trip Total"].values
df_ = df.copy()
df_ = df_.drop(columns="Trip Total")
newX = df_.values

In [0]:
def RidgeRegression(train_data, train_labels, lambda_):
    """
    Runs linear regression with L2 regularization (ridge) on the given data.
    
    Args:
        train_data ((n,p) np.array): n is the number of training points and p the number of features
        train_labels ((n,1) np.array): training labels for the training data    
        lambda_  (float): scalar weighting the L2 penalty

    Returns
        tuple: (w, b) where w is a (p,1) weight vector, and b the bias term  
    """

    regressor = Ridge(alpha=lambda_)
    regressor.fit(train_data, train_labels)
    b = regressor.intercept_
    w = regressor.coef_
    
    
    return (w,b)

In [0]:
def SimpleRegression(train_data, train_labels):
    """
    Runs OLS on the given data.
    
    Args:
        train_data ((n,p) np.array): n is the number of training points and p the number of features
        train_labels ((n,1) np.array): training labels for the training data    
    
    Returns
        tuple: (w, b) where w is a (p,1) weight vector, and b the bias term     
    """
    regressor = LinearRegression()  
    regressor.fit(train_data, train_labels)
    b = regressor.intercept_
    w = regressor.coef_
    
    return (w,b)

In [0]:
X_train,X_test,y_train,y_test=train_test_split(newX,newY)

In [88]:
# run cv on linear regression(i.e. lambda = 0)
clf = LinearRegression()  
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("CV score when lambda=0: " + str(np.mean(scores)))

CV score when lambda=0: 0.4157703775761757


In [89]:
# run ridge to determine best alpha; use the alpha that gives the best cv score
clf = RidgeCV(alphas=[0.1,1,10,100,500,1000, 5000, 100000, 10000000], cv=5).fit(X_train, y_train)
print("Best lambda to use: " + str(clf.alpha_))
clf = Ridge(clf.alpha_)  
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("CV score at best lambda: " + str(np.mean(scores)))

Best lambda to use: 5000.0
CV score at best lambda: 0.41577890906003623


Now compute coefficients for linear and ridge regression

In [90]:
w,b = SimpleRegression(X_train, y_train)
y_test_pred = np.matmul(X_test, w) + b
mse = mean_squared_error(y_test, y_test_pred, multioutput='raw_values')
print("coefficient: "+str(w))
print("intercept: "+str(b))
print("MSE error: "+str(mse))

coefficient: [ 1.91301423e-03  1.62313895e+00  1.73443710e+00  1.01032102e+00
 -1.59595651e-01  1.99797200e+00 -1.93502800e-01 -4.12206706e-01
 -5.72694777e-01  3.19255591e-01 -4.79433831e-01 -9.78198271e-01
  3.18808796e-01]
intercept: 4.905299190640349
MSE error: [235.14854152]


In [91]:
w,b = RidgeRegression(X_train, y_train, 10)
y_test_pred = np.matmul(X_test, w) + b
mse = mean_squared_error(y_test, y_test_pred, multioutput='raw_values')
print("coefficient: "+str(w))
print("intercept: "+str(b))
print("MSE error: "+str(mse))

coefficient: [ 1.91302091e-03  1.62313705e+00  1.73443519e+00  1.01032100e+00
 -1.59561916e-01  1.99776539e+00 -1.93521120e-01 -4.12187292e-01
 -5.72624167e-01  3.19156843e-01 -4.79371474e-01 -9.77964091e-01
  3.18745914e-01]
intercept: 4.90533704064312
MSE error: [235.14856428]
