In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [69]:
df = pd.read_csv("notebooks_final/DATA/Advertising.csv")
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [70]:
X = df.drop("sales", axis=1)
y = df["sales"]

In [71]:
# Split the data into training and other sets
from sklearn.model_selection import train_test_split
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.3, random_state=101)
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=101)

In [72]:
len(df), len(X_train), len(X_eval), len(X_test)

(200, 140, 30, 30)

In [73]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_eval_scaled = scaler.transform(X_eval)

In [74]:
from sklearn.linear_model import Ridge
model1 = Ridge(alpha=100)
model1.fit(X_train_scaled, y_train)

Ridge(alpha=100)

In [75]:
y_eval_pred = model1.predict(X_eval_scaled)

In [76]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_eval, y_eval_pred)
mse

7.320101458823871

In [77]:
model2 = Ridge(alpha=1)
model2.fit(X_train_scaled, y_train)

Ridge(alpha=1)

In [78]:
new_pred_eval = model2.predict(X_eval_scaled)
mse = mean_squared_error(y_eval, new_pred_eval)
mse

2.3837830750569853

In [79]:
# Finally, use the test set to evaluate the model
final_pred = model2.predict(X_test_scaled)
mse = mean_squared_error(y_test, final_pred)
mse

2.2542600838005176

# Cross Validate

In [80]:
X = df.drop("sales", axis=1)
y = df["sales"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [81]:
from sklearn.model_selection import cross_validate
model = Ridge(alpha=100)
scores = cross_validate(model, X_train_scaled, y_train, cv=10, scoring=["neg_mean_squared_error","neg_mean_absolute_error"])
scores

{'fit_time': array([0.00156403, 0.00063014, 0.00088382, 0.00059628, 0.00061488,
        0.00053477, 0.00036407, 0.00032187, 0.0003159 , 0.00031471]),
 'score_time': array([0.00041103, 0.00033212, 0.00040412, 0.00034475, 0.00057387,
        0.00030422, 0.00026608, 0.00025797, 0.00026131, 0.00025606]),
 'test_neg_mean_squared_error': array([ -6.06067062, -10.62703078,  -3.99342608,  -5.00949402,
         -9.14179955, -13.08625636,  -3.83940454,  -9.05878567,
         -9.05545685,  -5.77888211]),
 'test_neg_mean_absolute_error': array([-1.8102116 , -2.54195751, -1.46959386, -1.86276886, -2.52069737,
        -2.45999491, -1.45197069, -2.37739501, -2.44334397, -1.89979708])}

In [82]:
scores = pd.DataFrame(scores)
scores.sort_values(by="test_neg_mean_squared_error", ascending=False)

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
6,0.000364,0.000266,-3.839405,-1.451971
2,0.000884,0.000404,-3.993426,-1.469594
3,0.000596,0.000345,-5.009494,-1.862769
9,0.000315,0.000256,-5.778882,-1.899797
0,0.001564,0.000411,-6.060671,-1.810212
8,0.000316,0.000261,-9.055457,-2.443344
7,0.000322,0.000258,-9.058786,-2.377395
4,0.000615,0.000574,-9.1418,-2.520697
1,0.00063,0.000332,-10.627031,-2.541958
5,0.000535,0.000304,-13.086256,-2.459995


In [83]:
scores.mean()

fit_time                        0.000614
score_time                      0.000341
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [84]:
model = Ridge(alpha=1)
scores = cross_validate(model, X_train_scaled, y_train, cv=10, scoring=["neg_mean_squared_error","neg_mean_absolute_error"])
scores = pd.DataFrame(scores)
scores.sort_values(by="test_neg_mean_squared_error", ascending=False)

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
3,0.000435,0.000665,-0.833034,-0.768938
6,0.000418,0.000298,-1.905864,-1.081362
2,0.000564,0.000364,-2.17374,-1.23877
7,0.000363,0.000279,-2.765048,-1.250011
9,0.00034,0.000269,-2.846438,-1.223326
0,0.000858,0.000408,-2.962508,-1.457174
1,0.000744,0.000354,-3.057378,-1.555308
4,0.000447,0.000364,-3.464018,-1.434489
8,0.000344,0.000273,-4.989505,-1.580971
5,0.000413,0.000397,-8.232647,-1.494316


In [85]:
scores.mean()


fit_time                        0.000493
score_time                      0.000367
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [86]:
model.fit(X_train_scaled, y_train)

Ridge(alpha=1)

In [87]:
y_final_pred = model.predict(X_test_scaled)


In [89]:
mse = mean_squared_error(y_test, y_final_pred)
mse

2.3190215794287514