In assignment 1, you created a linear regression model for the cereal dataset. Create another linear regression model after removing two variables (sodium and sugars). Compare the AIC, BIC, and adjusted R-squared values of the full model and the model with two fewer variables. Which one is the better model based on those goodness-of-fit measures? (Split the data 20%/80% into test/train sets.)

In [1]:

import numpy as np
import warnings
import sklearn.linear_model as lm
import sklearn.model_selection as ms

from sklearn.linear_model import LinearRegression
import pandas as pd
import statsmodels.api as sm
warnings.filterwarnings('ignore')


In [2]:
# Read the semicolon-delimited cereal file and discard the row labelled 0.
# NOTE(review): row 0 is presumably a non-data (type descriptor) row -- confirm against cereal.csv.
# NOTE(review): potass holds -1 for at least one cereal, which looks like a
# missing-value sentinel -- confirm before relying on that column.
cereal = pd.read_csv("cereal.csv", sep=";").drop(0)
cereal.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
1,100% Bran,N,C,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
2,100% Natural Bran,Q,C,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
3,All-Bran,K,C,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505
4,All-Bran with Extra Fiber,K,C,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912
5,Almond Delight,R,C,110,2,2,200,1,14,8,-1,25,3,1,0.75,34.384843


In [3]:
# Column(s) whose string codes are converted to 0/1.
new_col = ['type']


def binary_map(a):
    """Encode a Series of type codes as numbers: 'H' -> 1, 'C' -> 0.

    Any value that is neither 'H' nor 'C' has no entry in the lookup and
    therefore comes back as NaN.
    """
    type_codes = {'H': 1, "C": 0}
    return a.map(type_codes)
# Swap the string codes in the selected column(s) for their 0/1 encoding.
encoded_type = cereal[new_col].apply(binary_map)
cereal[new_col] = encoded_type
cereal

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
1,100% Bran,N,0,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
2,100% Natural Bran,Q,0,120,3,5,15,2,8,8,135,0,3,1,1,33.983679
3,All-Bran,K,0,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505
4,All-Bran with Extra Fiber,K,0,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912
5,Almond Delight,R,0,110,2,2,200,1,14,8,-1,25,3,1,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,Triples,G,0,110,2,1,250,0,21,3,60,25,3,1,0.75,39.106174
74,Trix,G,0,110,1,1,140,0,13,12,25,25,2,1,1,27.753301
75,Wheat Chex,R,0,100,3,1,230,3,17,3,115,25,1,1,0.67,49.787445
76,Wheaties,G,0,100,3,1,200,3,17,3,110,25,1,1,1,51.592193


In [4]:
# One-hot encode the manufacturer code, append the indicator columns, and
# remove the two text columns ('mfr' and 'name').
dumm_data = pd.get_dummies(cereal['mfr'])
cereal = pd.concat([cereal, dumm_data], axis=1).drop(columns=['mfr', 'name'])
cereal

Unnamed: 0,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,...,weight,cups,rating,A,G,K,N,P,Q,R
1,0,70,4,1,130,10,5,6,280,25,...,1,0.33,68.402973,0,0,0,1,0,0,0
2,0,120,3,5,15,2,8,8,135,0,...,1,1,33.983679,0,0,0,0,0,1,0
3,0,70,4,1,260,9,7,5,320,25,...,1,0.33,59.425505,0,0,1,0,0,0,0
4,0,50,4,0,140,14,8,0,330,25,...,1,0.5,93.704912,0,0,1,0,0,0,0
5,0,110,2,2,200,1,14,8,-1,25,...,1,0.75,34.384843,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,0,110,2,1,250,0,21,3,60,25,...,1,0.75,39.106174,0,1,0,0,0,0,0
74,0,110,1,1,140,0,13,12,25,25,...,1,1,27.753301,0,1,0,0,0,0,0
75,0,100,3,1,230,3,17,3,115,25,...,1,0.67,49.787445,0,0,0,0,0,0,1
76,0,100,3,1,200,3,17,3,110,25,...,1,1,51.592193,0,1,0,0,0,0,0


In [5]:
# Split the rows into an 80% training frame and a 20% test frame.
from sklearn.model_selection import train_test_split

np.random.seed(0)  # seeds NumPy's global RNG; the split itself is pinned by random_state
d_train, d_test = train_test_split(
    cereal,
    test_size=0.2,
    train_size=0.8,
    random_state=200,
)
cereal

Unnamed: 0,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,...,weight,cups,rating,A,G,K,N,P,Q,R
1,0,70,4,1,130,10,5,6,280,25,...,1,0.33,68.402973,0,0,0,1,0,0,0
2,0,120,3,5,15,2,8,8,135,0,...,1,1,33.983679,0,0,0,0,0,1,0
3,0,70,4,1,260,9,7,5,320,25,...,1,0.33,59.425505,0,0,1,0,0,0,0
4,0,50,4,0,140,14,8,0,330,25,...,1,0.5,93.704912,0,0,1,0,0,0,0
5,0,110,2,2,200,1,14,8,-1,25,...,1,0.75,34.384843,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,0,110,2,1,250,0,21,3,60,25,...,1,0.75,39.106174,0,1,0,0,0,0,0
74,0,110,1,1,140,0,13,12,25,25,...,1,1,27.753301,0,1,0,0,0,0,0
75,0,100,3,1,230,3,17,3,115,25,...,1,0.67,49.787445,0,0,0,0,0,0,1
76,0,100,3,1,200,3,17,3,110,25,...,1,1,51.592193,0,1,0,0,0,0,0


In [6]:
# Standardise the training frame with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler

# Columns to standardise. The list covers the target ('rating') and the
# manufacturer indicator columns ('A' .. 'R') as well as the other features.
col = [
    'type', 'calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo',
    'sugars', 'potass', 'vitamins', 'shelf', 'weight', 'cups', 'rating',
    'A', 'G', 'K', 'N', 'P', 'Q', 'R',
]

# Learn the scaling parameters from the training rows and overwrite those
# columns of d_train with the scaled values.
scaler = StandardScaler()
d_train[col] = scaler.fit_transform(d_train[col])

In [7]:
# Separate the target from the predictors without mutating d_train.
# The previous form aliased X_train to d_train and then popped 'rating' out of
# it, so running this cell (or the scaling cell) a second time raised KeyError
# because the column was already gone.
y_train = d_train['rating']
X_train = d_train.drop(columns=['rating'])

In [8]:
# Fit scikit-learn's ordinary least squares model (the variable is called
# log_Regression, but it is a LinearRegression) on every column of d_train
# except the target.
#
# The predictor frame is built from d_train instead of being read from X_train,
# because X_train is rebound to a 10-column subset at the end of this cell.
# Reading X_train here made a second run of the cell fit a different model
# from the first run. errors='ignore' keeps this working whether or not
# 'rating' is still present in d_train.
all_predictors = d_train.drop(columns=['rating'], errors='ignore')
log_Regression = LinearRegression()
log_Regression.fit(all_predictors, y_train)

# Predictors used by the statsmodels "full" model.
X_train = d_train[['calories','protein','fat','sodium','fiber','carbo','sugars','potass','vitamins','cups']]

In [9]:
# Full model: OLS with an intercept on the predictors held in X_train.
# The printed summary reports AIC, BIC and adjusted R-squared.
import statsmodels.api as sm

X_train_lm = sm.add_constant(X_train)
lr_1 = sm.OLS(
    y_train.astype(float),
    X_train_lm.astype(float),
).fit()
print(lr_1.summary())


                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.384e+16
Date:                Fri, 28 Oct 2022   Prob (F-statistic):               0.00
Time:                        16:53:52   Log-Likelihood:                 997.93
No. Observations:                  61   AIC:                            -1974.
Df Residuals:                      50   BIC:                            -1951.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.851e-16   2.69e-09   1.43e-07      1.0

In [10]:
# Information criteria of the full model: AIC on the first line, BIC on the second.
print(lr_1.aic, lr_1.bic, sep="\n")

-1973.8619508426495
-1950.642338336743


In [11]:
# Reduced model: the full model's predictors with sodium and sugars removed.
X_updated = d_train[['calories','protein','fat','fiber','carbo','potass','vitamins','cups']]

# Fit OLS with an intercept; the printed summary reports AIC, BIC and
# adjusted R-squared.
# Both sides are cast to float, as is done for lr_1, so the two models being
# compared are constructed the same way and object-dtype columns cannot make
# only this fit fail.
import statsmodels.api as sm
X_modified = sm.add_constant(X_updated)
lr_2 = sm.OLS(y_train.astype(float), X_modified.astype(float)).fit()
print(lr_2.summary())


                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.907
Method:                 Least Squares   F-statistic:                     74.30
Date:                Fri, 28 Oct 2022   Prob (F-statistic):           1.00e-25
Time:                        16:53:52   Log-Likelihood:                -9.6882
No. Observations:                  61   AIC:                             37.38
Df Residuals:                      52   BIC:                             56.37
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.851e-16      0.039   9.79e-15      1.0

In [12]:
# Information criteria of the reduced model: AIC on the first line, BIC on the second.
print(lr_2.aic, lr_2.bic, sep="\n")

37.37630066004064
56.374165437600446
