In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [2]:
# Read the Carseats data set (stored one directory above the notebook) and
# preview the first three rows.
df = pd.read_csv(filepath_or_buffer="../Carseats.csv")
df.head(n=3)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes


In [3]:
# Replace the Yes/No string columns with integer codes.
# pd.factorize numbers categories in order of first appearance; row 0 holds
# "Yes" in both columns, so "Yes" -> 0 and the remaining category
# (presumably "No") -> 1. Keep this in mind when reading coefficient signs.
for binary_col in ("Urban", "US"):
    df[binary_col] = pd.factorize(df[binary_col])[0]
df.head(n=3)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,0,0
1,11.22,111,48,16,260,83,Good,65,10,0,0
2,10.06,113,35,10,269,80,Medium,59,12,0,0


In [4]:
# Design matrix (predictors) and response for the first model.
y = df.loc[:, "Sales"]
X = df.loc[:, ["Price", "Urban", "US"]]

In [5]:
# statsmodels' OLS does not add an intercept on its own, so prepend a
# "const" column of ones to the design matrix.
X = sm.add_constant(X, prepend=True, has_constant="skip")

In [6]:
# Fit Sales ~ const + Price + Urban + US by ordinary least squares.
model = sm.OLS(endog=y, exog=X).fit()

In [7]:
# Show the coefficient table and fit statistics for the full model
# (Price, Urban, US).
model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.239
Model:,OLS,Adj. R-squared:,0.234
Method:,Least Squares,F-statistic:,41.52
Date:,"Mon, 02 Jan 2023",Prob (F-statistic):,2.39e-23
Time:,21:15:19,Log-Likelihood:,-927.66
No. Observations:,400,AIC:,1863.0
Df Residuals:,396,BIC:,1879.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,14.2221,0.639,22.253,0.000,12.966,15.479
Price,-0.0545,0.005,-10.389,0.000,-0.065,-0.044
Urban,0.0219,0.272,0.081,0.936,-0.512,0.556
US,-1.2006,0.259,-4.635,0.000,-1.710,-0.691

0,1,2,3
Omnibus:,0.676,Durbin-Watson:,1.912
Prob(Omnibus):,0.713,Jarque-Bera (JB):,0.758
Skew:,0.093,Prob(JB):,0.684
Kurtosis:,2.897,Cond. No.,615.0


In [8]:
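# b.i. Price: each one-unit increase in Price is associated with a 0.0545 decrease in Sales, holding Urban and US fixed.
# NOTE on the dummies: pd.factorize coded "Yes" as 0 for both Urban and US (compare the two head() outputs above), so a value of 1 means the observation is NOT "Yes".
# US: observations with US = 1 (not "Yes") have Sales about 1.2 lower than US = "Yes" observations, i.e. Sales are HIGHER for US = "Yes", holding Price and Urban fixed.
# Urban: the coefficient (0.0219 for Urban = 1, i.e. not "Yes") is not statistically significant (p = 0.936), so there is no evidence that Sales differ by Urban.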

# c. Sales = 14.2221 - 0.0545*Price + 0.0219*Urban - 1.2006*US, where Urban and US are the factorized dummies (0 = "Yes", 1 = otherwise)

# d. We can reject the null hypothesis H0: beta_j = 0 for Price and US (p < 0.001 for both). We cannot reject it for Urban (p = 0.936).

In [9]:
# Reduced predictor set: keep only the predictors that were significant in
# the first fit (Urban had p = 0.936 and is dropped).
X_2 = df.loc[:, ["Price", "US"]]

In [10]:
# Prepend the intercept column of ones to the reduced design matrix.
X_2 = sm.add_constant(X_2, prepend=True, has_constant="skip")

In [11]:
# Refit by ordinary least squares using the reduced design matrix
# (const + Price + US).
model_2 = sm.OLS(endog=y, exog=X_2).fit()

In [12]:
# Show the coefficient table and fit statistics for the reduced model
# (Price and US only), for comparison with the first summary.
model_2.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.239
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,62.43
Date:,"Mon, 02 Jan 2023",Prob (F-statistic):,2.6599999999999998e-24
Time:,21:16:22,Log-Likelihood:,-927.66
No. Observations:,400,AIC:,1861.0
Df Residuals:,397,BIC:,1873.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,14.2304,0.630,22.589,0.000,12.992,15.469
Price,-0.0545,0.005,-10.416,0.000,-0.065,-0.044
US,-1.1996,0.258,-4.641,0.000,-1.708,-0.692

0,1,2,3
Omnibus:,0.666,Durbin-Watson:,1.912
Prob(Omnibus):,0.717,Jarque-Bera (JB):,0.749
Skew:,0.092,Prob(JB):,0.688
Kurtosis:,2.895,Cond. No.,605.0


In [None]:
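# e. R-squared is unchanged (0.239 in both models) after dropping Urban, so the smaller model explains the same share of the variability in Sales with one fewer predictor. Adjusted R-squared rises slightly (0.234 -> 0.235) and AIC/BIC fall (1863 -> 1861, 1879 -> 1873), so the reduced model fits marginally better once model size is taken into account.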