## Encoding 

In [17]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [18]:
# Load the raw file: fields are separated by runs of whitespace and there is no
# header row, so the columns come back numbered 0-8.
# The separator is a regex, so it is written as a raw string; in a normal string
# "\s" is an invalid escape sequence and triggers a warning on recent Python versions.
data = pd.read_csv("./auto-mpg.data", sep=r"\s+", header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino


In [19]:
# Give the nine positional columns descriptive names, then peek at the last rows.
new_columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'car name',
]
data.columns = new_columns
data.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
401,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,1.0,ford mustang gl
402,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,2.0,vw pickup
403,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,1.0,dodge rampage
404,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,1.0,ford ranger
405,31.0,4.0,119.0,82.0,2720.0,19.4,82.0,1.0,chevy s-10


In [20]:
# Summarise dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     406 non-null    float64
 2   displacement  406 non-null    float64
 3   horsepower    400 non-null    float64
 4   weight        406 non-null    float64
 5   acceleration  406 non-null    float64
 6   model year    406 non-null    float64
 7   origin        406 non-null    float64
 8   car name      406 non-null    object 
dtypes: float64(8), object(1)
memory usage: 28.7+ KB


In [21]:
# Count rows per origin code to see the categories that will be dummy-encoded.
data['origin'].value_counts()

1.0    254
3.0     79
2.0     73
Name: origin, dtype: int64

In [22]:
# One-hot encode `origin` with pandas, keeping every category so that all
# three dummy columns are visible for comparison.
data_dummies = pd.get_dummies(
    data,
    columns=['origin'],
    prefix='origin',
    dtype=int,
)
data_dummies

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,car name,origin_1.0,origin_2.0,origin_3.0
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,chevrolet chevelle malibu,1,0,0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,buick skylark 320,1,0,0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,plymouth satellite,1,0,0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,amc rebel sst,1,0,0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,ford torino,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
401,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,ford mustang gl,1,0,0
402,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,vw pickup,0,1,0
403,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,dodge rampage,1,0,0
404,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,ford ranger,1,0,0


In [23]:
# Drop the first origin level so it acts as the baseline/reference category;
# keeping all three dummies would make them perfectly collinear with a
# regression constant, leaving the model nothing new to learn from the third.
# The guard keeps this cell re-runnable: once `origin` has been encoded the
# column no longer exists, and a second get_dummies call on it raises a KeyError.
if 'origin' in data.columns:
    data = pd.get_dummies(data, columns=['origin'], drop_first=True, dtype=int)


In [24]:
# Dropping one dummy breaks the perfect collinearity: the remaining origin
# dummies no longer sum to 1 in every row.
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,car name,origin_2.0,origin_3.0
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,chevrolet chevelle malibu,0,0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,buick skylark 320,0,0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,plymouth satellite,0,0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,amc rebel sst,0,0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,ford torino,0,0
...,...,...,...,...,...,...,...,...,...,...
401,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,ford mustang gl,0,0
402,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,vw pickup,1,0
403,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,dodge rampage,0,0
404,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,ford ranger,0,0


In [None]:
# The dropped category (origin 1.0) is the baseline: the origin_2.0 and origin_3.0 coefficients are read relative to it

In [25]:
# Count missing values per column before fitting any model.
data.isna().sum()

mpg             8
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
car name        0
origin_2.0      0
origin_3.0      0
dtype: int64

In [26]:
# Discard every row that has a missing value in any column, then show the result.
data = data.dropna(axis=0, how='any')
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,car name,origin_2.0,origin_3.0
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,chevrolet chevelle malibu,0,0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,buick skylark 320,0,0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,plymouth satellite,0,0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,amc rebel sst,0,0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,ford torino,0,0
...,...,...,...,...,...,...,...,...,...,...
401,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,ford mustang gl,0,0
402,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,vw pickup,1,0
403,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,dodge rampage,0,0
404,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,ford ranger,0,0


In [27]:
# Model 1: simple linear regression of mpg on weight.
y = data['mpg']
# add_constant supplies the intercept column, which OLS does not add on its own.
x1 = sm.add_constant(data[['weight']])
model1 = sm.OLS(endog=y, exog=x1).fit()
model1.summary()


0,1,2,3
Dep. Variable:,mpg,R-squared:,0.693
Model:,OLS,Adj. R-squared:,0.692
Method:,Least Squares,F-statistic:,878.8
Date:,"Wed, 26 Nov 2025",Prob (F-statistic):,6.02e-102
Time:,12:58:37,Log-Likelihood:,-1130.0
No. Observations:,392,AIC:,2264.0
Df Residuals:,390,BIC:,2272.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,46.2165,0.799,57.867,0.000,44.646,47.787
weight,-0.0076,0.000,-29.645,0.000,-0.008,-0.007

0,1,2,3
Omnibus:,41.682,Durbin-Watson:,0.808
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60.039
Skew:,0.727,Prob(JB):,9.18e-14
Kurtosis:,4.251,Cond. No.,11300.0


Weight alone explains about 69% of the variance in mpg (R-squared = 0.693).
The p-value for weight is below 0.05, so its effect is statistically significant.
The coefficient is negative (-0.0076): the heavier the vehicle, the lower its mpg.

In [31]:
# Model 2: multiple regression of mpg on weight, model year and the origin
# dummies (the dropped origin level is the reference category).
y = data['mpg']
x2 = sm.add_constant(
    data[['weight', 'model year', 'origin_2.0', 'origin_3.0']]
)
model2 = sm.OLS(endog=y, exog=x2).fit()
model2.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.817
Method:,Least Squares,F-statistic:,437.9
Date:,"Wed, 26 Nov 2025",Prob (F-statistic):,3.53e-142
Time:,13:05:06,Log-Likelihood:,-1026.1
No. Observations:,392,AIC:,2062.0
Df Residuals:,387,BIC:,2082.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-18.3069,4.017,-4.557,0.000,-26.205,-10.409
weight,-0.0059,0.000,-22.647,0.000,-0.006,-0.005
model year,0.7698,0.049,15.818,0.000,0.674,0.866
origin_2.0,1.9763,0.518,3.815,0.000,0.958,2.995
origin_3.0,2.2145,0.519,4.268,0.000,1.194,3.235

0,1,2,3
Omnibus:,32.293,Durbin-Watson:,1.251
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58.234
Skew:,0.507,Prob(JB):,2.26e-13
Kurtosis:,4.593,Cond. No.,73900.0
