In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Part 1 - Regress the Model

In [2]:
# Load the Kelley Blue Book data from a CSV located in the working directory.
kelleydata = pd.read_csv('KelleyBlueBookData.csv')
# Preview five random rows; the fixed random_state keeps this preview
# identical across Restart & Run All instead of changing on every execution.
kelleydata.sample(5, random_state=42)

Unnamed: 0,Price,Mileage,Make,Model,Trim,Type,Cylinder,Liter,Doors,Cruise,Sound,Leather
383,24809.04232,16111,Chevrolet,Impala,SS Sedan 4D,Sedan,6,3.8,4,1,0,0
596,12465.50852,23931,Pontiac,Sunfire,Coupe 2D,Coupe,4,2.2,2,0,1,1
636,33287.4096,21661,SAAB,9_3,Linear Conv 2D,Convertible,4,2.0,2,1,0,1
571,19567.25929,2189,Pontiac,Grand Prix,Sedan 4D,Sedan,6,3.8,4,1,1,1
25,21460.01395,19467,Buick,Lacrosse,CXL Sedan 4D,Sedan,6,3.6,4,1,0,1


In [3]:
# Baseline OLS fit of Price on the listed predictors.
# Cylinder is entered without C(...), i.e. as a plain numeric term.
reg = smf.ols(
    formula='Price ~ Mileage + Type + Cylinder + Liter + Cruise + Sound + Leather',
    data=kelleydata,
).fit()
reg.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.686
Model:,OLS,Adj. R-squared:,0.682
Method:,Least Squares,F-statistic:,173.3
Date:,"Tue, 05 Oct 2021",Prob (F-statistic):,7.03e-192
Time:,20:22:33,Log-Likelihood:,-8070.3
No. Observations:,804,AIC:,16160.0
Df Residuals:,793,BIC:,16210.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.861e+04,1525.610,12.201,0.000,1.56e+04,2.16e+04
Type[T.Coupe],-2.056e+04,939.771,-21.880,0.000,-2.24e+04,-1.87e+04
Type[T.Hatchback],-2.245e+04,1129.875,-19.869,0.000,-2.47e+04,-2.02e+04
Type[T.Sedan],-1.844e+04,835.136,-22.083,0.000,-2.01e+04,-1.68e+04
Type[T.Wagon],-1.098e+04,1085.229,-10.119,0.000,-1.31e+04,-8851.704
Mileage,-0.1857,0.024,-7.722,0.000,-0.233,-0.139
Cylinder,3455.4084,535.458,6.453,0.000,2404.326,4506.491
Liter,213.1471,674.671,0.316,0.752,-1111.206,1537.500
Cruise,3952.0116,514.256,7.685,0.000,2942.548,4961.476

0,1,2,3
Omnibus:,73.056,Durbin-Watson:,0.308
Prob(Omnibus):,0.0,Jarque-Bera (JB):,91.006
Skew:,0.804,Prob(JB):,1.73e-20
Kurtosis:,3.358,Cond. No.,224000.0


## Part 2 - Interpret Regression Coefficient using T-test

#### The regression model assumes that the dependent variable Y has a linear relationship with the independent variable X.

#### If there is a significant linear relationship between the independent variable X and the dependent variable Y, the slope will not equal zero.

#### $H_0: \beta_1 = 0$

#### $H_a: \beta_1 \neq 0$

## Part 3 - Categorical Variable

In [4]:
# Print each column's non-null count and dtype, to see which predictors
# are stored as numbers and which as strings (object).
kelleydata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Price     804 non-null    float64
 1   Mileage   804 non-null    int64  
 2   Make      804 non-null    object 
 3   Model     804 non-null    object 
 4   Trim      804 non-null    object 
 5   Type      804 non-null    object 
 6   Cylinder  804 non-null    int64  
 7   Liter     804 non-null    float64
 8   Doors     804 non-null    int64  
 9   Cruise    804 non-null    int64  
 10  Sound     804 non-null    int64  
 11  Leather   804 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 75.5+ KB


In [5]:
# Tally how many rows fall under each distinct Cylinder value.
kelleydata.loc[:, 'Cylinder'].value_counts()

4    394
6    310
8    100
Name: Cylinder, dtype: int64

In [6]:
# Same predictors as the first model, but Cylinder is wrapped in C(...)
# so the formula treats it as a categorical factor instead of a numeric term.
reg2 = smf.ols(
    formula='Price ~ Mileage + Type + C(Cylinder) + Liter + Cruise + Sound + Leather',
    data=kelleydata,
).fit()
reg2.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.738
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,202.6
Date:,"Tue, 05 Oct 2021",Prob (F-statistic):,1.51e-221
Time:,20:22:33,Log-Likelihood:,-7998.0
No. Observations:,804,AIC:,16020.0
Df Residuals:,792,BIC:,16080.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.943e+04,1608.357,18.300,0.000,2.63e+04,3.26e+04
Type[T.Coupe],-1.857e+04,874.134,-21.244,0.000,-2.03e+04,-1.69e+04
Type[T.Hatchback],-1.831e+04,1085.146,-16.873,0.000,-2.04e+04,-1.62e+04
Type[T.Sedan],-1.547e+04,799.994,-19.336,0.000,-1.7e+04,-1.39e+04
Type[T.Wagon],-9452.1225,1000.020,-9.452,0.000,-1.14e+04,-7489.120
C(Cylinder)[T.6],1360.1311,1075.453,1.265,0.206,-750.943,3471.205
C(Cylinder)[T.8],1.416e+04,1959.004,7.231,0.000,1.03e+04,1.8e+04
Mileage,-0.1871,0.022,-8.505,0.000,-0.230,-0.144
Liter,1115.8414,621.236,1.796,0.073,-103.622,2335.305

0,1,2,3
Omnibus:,50.373,Durbin-Watson:,0.341
Prob(Omnibus):,0.0,Jarque-Bera (JB):,68.521
Skew:,0.531,Prob(JB):,1.32e-15
Kurtosis:,3.958,Cond. No.,317000.0


## Part 4 - Partial ANOVA Test

In [7]:
# The partial ANOVA table is computed with the statsmodels.api package (imported as sm).

In [8]:
# ANOVA table for the fitted reg2 model; typ=2 requests the Type 2 table.
sm.stats.anova_lm(
    reg2,
    typ=2,
)

Unnamed: 0,sum_sq,df,F,PR(>F)
Type,13813330000.0,4.0,132.939757,7.297205e-87
C(Cylinder),5348559000.0,2.0,102.949252,1.807044e-40
Mileage,1879060000.0,1.0,72.336414,8.996777e-17
Liter,83806060.0,1.0,3.226204,0.07284949
Cruise,2504757000.0,1.0,96.423312,1.485766e-21
Sound,34766.17,1.0,0.001338,0.9708262
Leather,389684400.0,1.0,15.001317,0.0001163062
Residual,20573530000.0,792.0,,
