In [1]:
import pandas as pd
import statsmodels.api as sm

In [2]:
# read csv into dataframe
df = pd.read_csv("../Carseats.csv")
df.head(3)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes


In [3]:
# check data to ensure every column is a number
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [4]:
# change qualitative predictors into quantitave forms
df["ShelveLoc"] = pd.factorize(df.ShelveLoc)[0]
df["Urban"] = pd.factorize(df.Urban)[0]
df["US"] = pd.factorize(df.US)[0]
df.head(3)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,0,42,17,0,0
1,11.22,111,48,16,260,83,1,65,10,0,0
2,10.06,113,35,10,269,80,2,59,12,0,0


In [5]:
# recheck all the columns are all quantitative
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    int64  
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    int64  
 10  US           400 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 34.5 KB


In [6]:
# add the interactive columns
df["income:advert"] = df["Income"] * df["Advertising"]
df["price:age"] = df["Price"] * df["Age"]
df.head(3)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,income:advert,price:age
0,9.5,138,73,11,276,120,0,42,17,0,0,803,5040
1,11.22,111,48,16,260,83,1,65,10,0,0,768,5395
2,10.06,113,35,10,269,80,2,59,12,0,0,350,4720


In [7]:
# assign the columns to be used for regression to predictor/response variables
X = df.drop("Sales", axis=1)
y = df["Sales"]

In [8]:
X = sm.add_constant(X)

In [9]:
model = sm.OLS(y, X).fit()

In [10]:
# print summary 
model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.578
Model:,OLS,Adj. R-squared:,0.565
Method:,Least Squares,F-statistic:,44.24
Date:,"Sat, 31 Dec 2022",Prob (F-statistic):,4.26e-65
Time:,20:26:15,Log-Likelihood:,-809.62
No. Observations:,400,AIC:,1645.0
Df Residuals:,387,BIC:,1697.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.7990,1.872,5.235,0.000,6.119,13.479
CompPrice,0.0931,0.008,12.276,0.000,0.078,0.108
Income,0.0112,0.005,2.339,0.020,0.002,0.021
Advertising,0.0964,0.042,2.315,0.021,0.015,0.178
Population,-3.667e-05,0.001,-0.054,0.957,-0.001,0.001
Price,-0.1152,0.014,-8.422,0.000,-0.142,-0.088
ShelveLoc,0.5880,0.113,5.183,0.000,0.365,0.811
Age,-0.0972,0.029,-3.317,0.001,-0.155,-0.040
Education,-0.0350,0.036,-0.969,0.333,-0.106,0.036

0,1,2,3
Omnibus:,31.937,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37.994
Skew:,0.748,Prob(JB):,5.62e-09
Kurtosis:,2.79,Cond. No.,131000.0
