# Assignment - Model testing and validation
**Objectives:**
- Split the data into test and training sets
- Use cross validation to fit the model and report each validation score
- Fit the model on all of the training data and score it on the held-out test dataset

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Read the computers dataset from the course data folder (path is relative
# to the notebook's working directory) and preview the first rows.
computers = pd.read_csv(
    filepath_or_buffer='../Course Materials/Data/Computers.csv',
)
computers.head()

In [14]:
# Show the first five rows again as a reference for the column names used below.
computers.head(n=5)

Unnamed: 0,price,speed,hd,ram,screen,cd,multi,premium,ads,trend
0,1499,25,80,4,14,no,no,yes,94,1
1,1795,33,85,2,14,no,no,yes,94,1
2,1595,25,170,4,15,no,no,yes,94,1
3,1849,25,170,8,14,no,no,no,94,1
4,3295,33,340,16,14,no,no,yes,94,1


In [3]:
# Predictor columns used in every model in this notebook.
features = [
    'speed',
    'hd',
    'ram',
    'screen',
    'ads',
    'trend',
]

In [4]:
# Target: the `price` column.
y = computers['price']
# Design matrix: the selected feature columns with an explicit intercept
# column in front, since sm.OLS does not add a constant by itself.
X = sm.add_constant(computers[features], prepend=True)

In [6]:
# Baseline OLS fit on the full X / y as they stand at this point, to inspect
# the coefficients and overall fit before any splitting.
model = sm.OLS(endog=y, exog=X).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.712
Model:,OLS,Adj. R-squared:,0.712
Method:,Least Squares,F-statistic:,2580.0
Date:,"Sat, 20 Dec 2025",Prob (F-statistic):,0.0
Time:,19:12:53,Log-Likelihood:,-44817.0
No. Observations:,6259,AIC:,89650.0
Df Residuals:,6252,BIC:,89690.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-246.6755,66.371,-3.717,0.000,-376.785,-116.566
speed,8.8939,0.209,42.590,0.000,8.485,9.303
hd,0.7088,0.031,22.932,0.000,0.648,0.769
ram,47.3870,1.188,39.899,0.000,45.059,49.715
screen,126.7024,4.521,28.022,0.000,117.839,135.566
ads,0.9697,0.057,17.099,0.000,0.859,1.081
trend,-47.0820,0.676,-69.660,0.000,-48.407,-45.757

0,1,2,3
Omnibus:,1407.073,Durbin-Watson:,1.948
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3902.981
Skew:,1.187,Prob(JB):,0.0
Kurtosis:,6.054,Cond. No.,8890.0


In [7]:
# Hold out 20% of the rows as the final test set; `X` / `y` become the
# remaining 80% used for training and validation below.
#
# The inputs are rebuilt from `computers` instead of being read from `X` / `y`,
# because this cell rebinds `X` and `y`: splitting them directly would cut the
# already-reduced data again every time the cell is re-run. With the fixed
# random_state, each run now yields the same partition.
X, X_test, y, y_test = train_test_split(
    sm.add_constant(computers[features]),
    computers['price'],
    test_size=.2,
    random_state=42,
)

In [8]:
# Split the non-test rows once more: 25% of them become the validation set,
# the rest is used for fitting. The seed fixes the assignment.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=21,
)

In [9]:
# Fit OLS on the training rows only, leaving the validation rows unseen,
# then display the regression summary.
model_2 = sm.OLS(endog=y_train, exog=X_train).fit()
model_2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.711
Model:,OLS,Adj. R-squared:,0.71
Method:,Least Squares,F-statistic:,1534.0
Date:,"Sat, 20 Dec 2025",Prob (F-statistic):,0.0
Time:,19:19:17,Log-Likelihood:,-26892.0
No. Observations:,3755,AIC:,53800.0
Df Residuals:,3748,BIC:,53840.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-265.2996,85.465,-3.104,0.002,-432.862,-97.737
speed,8.8904,0.268,33.134,0.000,8.364,9.416
hd,0.6877,0.041,16.634,0.000,0.607,0.769
ram,47.4566,1.556,30.502,0.000,44.406,50.507
screen,127.2993,5.823,21.862,0.000,115.883,138.716
ads,0.9904,0.074,13.423,0.000,0.846,1.135
trend,-46.3241,0.892,-51.914,0.000,-48.074,-44.575

0,1,2,3
Omnibus:,888.257,Durbin-Watson:,2.054
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2604.703
Skew:,1.22,Prob(JB):,0.0
Kurtosis:,6.27,Cond. No.,8850.0


In [10]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

In [13]:
# Score model_2 on the rows it was fitted on and on the validation rows.
# Each set is predicted once and the predictions feed both metrics.
train_pred = model_2.predict(X_train)
valid_pred = model_2.predict(X_valid)

print(f'MAE: {mae(y_train, train_pred)}')
print(f'R2: {r2(y_train, train_pred)}')
print(f'Validation MAE: {mae(y_valid, valid_pred)}')
print(f'Validation r2: {r2(y_valid, valid_pred)}')

MAE: 224.51531622456815
R2: 0.7105764167176969
Validation MAE: 229.56652547754703
Validation r2: 0.7136709451996717


### Cross validation

In [15]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score as r2

In [16]:
# Five shuffled folds; the fixed seed makes the fold assignment repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=21,
)
# Per-fold validation scores, filled by the cross-validation loop.
cv_lm_r2s, cv_lm_mae = [], []


In [17]:
# Start from empty score lists inside this cell, so that re-running it does
# not append a second set of fold scores and distort the mean / std below.
cv_lm_r2s = []
cv_lm_mae = []

# Loop through each fold in X and y
for train_ind, val_ind in kf.split(X, y):
    # Subset data based on CV folds. Fold-specific names are used so the
    # notebook-level X_train / y_train / model from earlier cells are not
    # overwritten by the last fold.
    X_fold_train, y_fold_train = X.iloc[train_ind], y.iloc[train_ind]
    X_val, y_val = X.iloc[val_ind], y.iloc[val_ind]
    # Fit the model on this fold's training data
    fold_model = sm.OLS(y_fold_train, X_fold_train).fit()
    # Predict the validation rows once and reuse the result for both metrics
    val_pred = fold_model.predict(X_val)
    # Append validation scores to the lists
    cv_lm_r2s.append(r2(y_val, val_pred))
    cv_lm_mae.append(mae(y_val, val_pred))

print("All Validation R2s: ", [round(x, 3) for x in cv_lm_r2s])
print(f"Cross Val R2s: {round(np.mean(cv_lm_r2s), 3)} +- {round(np.std(cv_lm_r2s), 3)}")

print("All Validation MAEs: ", [round(x, 3) for x in cv_lm_mae])
print(f"Cross Val MAEs: {round(np.mean(cv_lm_mae), 3)} +- {round(np.std(cv_lm_mae), 3)}")

All Validation R2s:  [0.716, 0.71, 0.705, 0.734, 0.689]
Cross Val R2s: 0.711 +- 0.015
All Validation MAEs:  [229.198, 227.43, 224.511, 219.912, 233.739]
Cross Val MAEs: 226.958 +- 4.623
