In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('Advertising.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [7]:
###Train and test split
X = df.drop('Sales',axis = 1)
y = df['Sales']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

In [9]:
## scale data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
## create a model

from sklearn.linear_model import Ridge

In [11]:
model = Ridge(alpha=100) ### poor alpha chosen

In [12]:
model.fit(X_train, y_train)

Ridge(alpha=100)

In [13]:
y_pred = model.predict(X_test)

In [14]:
## Evaluation
from sklearn.metrics import mean_squared_error

In [15]:
mean_squared_error(y_test, y_pred)

11.925501113060864

In [16]:
## Adjust parameter and re-evaluate
model = Ridge(alpha=1)

In [17]:
model.fit(X_train, y_train)

Ridge(alpha=1)

In [18]:
y_pred = model.predict(X_test)

In [19]:
mean_squared_error(y_test, y_pred)

5.415006934581305

### Train - validation - test split

This is often also called a "hold-out" set, since you should not adjust parameters based on the final test set, but instead use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Eval Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Evaluation Data (by creating predictions and comparing to Y_eval)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)


In [21]:
### Create X and y
X = df.drop('Sales',axis = 1)
y = df['Sales']

In [22]:
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.30, random_state=10)

In [23]:
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size=0.50, random_state=10)

In [24]:
## Scale DATA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_eval = scaler.transform(X_eval)
X_test = scaler.transform(X_test)

In [25]:
# create a model
from sklearn.linear_model import Ridge

In [26]:
model = Ridge(alpha=100) ### poor alpha chosen

In [27]:
model.fit(X_train, y_train)

Ridge(alpha=100)

In [28]:
y_pred_eval = model.predict(X_eval)

In [29]:
mean_squared_error(y_eval, y_pred_eval)

8.747496133417815

In [30]:
## Adjust the parameters
model = Ridge(alpha=1)

In [31]:
model.fit(X_train, y_train)

Ridge(alpha=1)

In [32]:
y_pred_eval = model.predict(X_eval)

In [33]:
mean_squared_error(y_eval, y_pred_eval)

2.988583079572906

In [34]:
## Final evaluation
y_test_pred = model.predict(X_test)

In [35]:
mean_squared_error(y_test, y_test_pred)

7.841430789589703

In [36]:
### use cross_val_score

In [37]:
from sklearn.model_selection import cross_val_score

In [40]:
X = df.drop('Sales',axis = 1)
y = df['Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)
## scale data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [41]:
model = Ridge(alpha = 100)

In [42]:
##scoring
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)

In [43]:
scores

array([-4.06627125, -6.24693932, -9.07912275, -6.77483825, -6.31389372])

In [44]:
abs(scores.mean())

6.496213058160796

In [45]:
## adjust the model based on metrics
model = Ridge(alpha = 1)

In [46]:
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)

In [47]:
abs(scores.mean())

1.926030769981343

In [48]:
## final evaluation 
model.fit(X_train, y_train)

Ridge(alpha=1)

In [49]:
y_test_pred = model.predict(X_test)

In [50]:
mean_squared_error(y_test, y_test_pred)

5.415006934581305