AUTO MPG MODEL USING LINEAR REGRESSION

In [54]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


Importing data

In [55]:
cars = pd.read_csv("auto-mpg.csv")

In [56]:
cars.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


In [57]:
cars.tail(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
388,26.0,4,156.0,92,2585,14.5,82,1,chrysler lebaron medallion
389,22.0,6,232.0,112,2835,14.7,82,1,ford granada l
390,32.0,4,144.0,96,2665,13.9,82,3,toyota celica gt
391,36.0,4,135.0,84,2370,13.0,82,1,dodge charger 2.2
392,27.0,4,151.0,90,2950,17.3,82,1,chevrolet camaro
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger
397,31.0,4,119.0,82,2720,19.4,82,1,chevy s-10


In [58]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


Preprocessing

converting car name into maker name 'make' and fixing the typo's

In [59]:
def preprocess_inputs(df):
    df = df.copy()
    
    #creating feature : make
    df['make'] = df['car name'].apply(lambda x: re.search(r'^\w+',x).group(0))
    df = df.drop('car name',axis=1)
    
    #fixing typo in make names
    make_typo_correction = {
        'vw' :         'volkswagen',
        'chevroelt ' : 'chevrolet'  ,
        'chevy':       'chevrolet' ,
        'maxda' :      'mazda',
        'vokswagen' :  'volkswagen' ,
        'toyouta' :    'toyota' ,
        
       
    }
    
    df['make'] =  df['make'].replace(make_typo_correction)
    
    
    
    
    return df

In [60]:
car = preprocess_inputs(cars)


In [61]:
car

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,make
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet
1,15.0,8,350.0,165,3693,11.5,70,1,buick
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth
3,16.0,8,304.0,150,3433,12.0,70,1,amc
4,17.0,8,302.0,140,3449,10.5,70,1,ford
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford
394,44.0,4,97.0,52,2130,24.6,82,2,volkswagen
395,32.0,4,135.0,84,2295,11.6,82,1,dodge
396,28.0,4,120.0,79,2625,18.6,82,1,ford


In [62]:
car['make'].value_counts()

make
ford          51
chevrolet     46
plymouth      31
dodge         28
amc           28
toyota        26
datsun        23
volkswagen    22
buick         17
pontiac       16
honda         13
mazda         12
mercury       11
oldsmobile    10
peugeot        8
fiat           8
audi           7
chrysler       6
volvo          6
renault        5
opel           4
saab           4
subaru         4
mercedes       3
bmw            2
cadillac       2
hi             1
capri          1
chevroelt      1
triumph        1
nissan         1
Name: count, dtype: int64

One hot encoding

In [63]:
cars_dummy = pd.get_dummies(car['make'])
cars_dummy

Unnamed: 0,amc,audi,bmw,buick,cadillac,capri,chevroelt,chevrolet,chrysler,datsun,...,peugeot,plymouth,pontiac,renault,saab,subaru,toyota,triumph,volkswagen,volvo
0,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
394,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
395,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
396,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [64]:
car = pd.concat([car,cars_dummy],axis =1)
car

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,make,amc,...,peugeot,plymouth,pontiac,renault,saab,subaru,toyota,triumph,volkswagen,volvo
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet,False,...,False,False,False,False,False,False,False,False,False,False
1,15.0,8,350.0,165,3693,11.5,70,1,buick,False,...,False,False,False,False,False,False,False,False,False,False
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth,False,...,False,True,False,False,False,False,False,False,False,False
3,16.0,8,304.0,150,3433,12.0,70,1,amc,True,...,False,False,False,False,False,False,False,False,False,False
4,17.0,8,302.0,140,3449,10.5,70,1,ford,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford,False,...,False,False,False,False,False,False,False,False,False,False
394,44.0,4,97.0,52,2130,24.6,82,2,volkswagen,False,...,False,False,False,False,False,False,False,False,True,False
395,32.0,4,135.0,84,2295,11.6,82,1,dodge,False,...,False,False,False,False,False,False,False,False,False,False
396,28.0,4,120.0,79,2625,18.6,82,1,ford,False,...,False,False,False,False,False,False,False,False,False,False


In [65]:
car.drop('make',axis = 1,inplace=True)
car

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,amc,audi,...,peugeot,plymouth,pontiac,renault,saab,subaru,toyota,triumph,volkswagen,volvo
0,18.0,8,307.0,130,3504,12.0,70,1,False,False,...,False,False,False,False,False,False,False,False,False,False
1,15.0,8,350.0,165,3693,11.5,70,1,False,False,...,False,False,False,False,False,False,False,False,False,False
2,18.0,8,318.0,150,3436,11.0,70,1,False,False,...,False,True,False,False,False,False,False,False,False,False
3,16.0,8,304.0,150,3433,12.0,70,1,True,False,...,False,False,False,False,False,False,False,False,False,False
4,17.0,8,302.0,140,3449,10.5,70,1,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,False,False,...,False,False,False,False,False,False,False,False,False,False
394,44.0,4,97.0,52,2130,24.6,82,2,False,False,...,False,False,False,False,False,False,False,False,True,False
395,32.0,4,135.0,84,2295,11.6,82,1,False,False,...,False,False,False,False,False,False,False,False,False,False
396,28.0,4,120.0,79,2625,18.6,82,1,False,False,...,False,False,False,False,False,False,False,False,False,False


Converting horsepower data type 

In [66]:
car['horsepower'] = pd.to_numeric(car['horsepower'], errors='coerce')

# filling empty values with mean value
mean_hp = car['horsepower'].mean()
car['horsepower'].fillna(mean_hp, inplace=True)

car.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 39 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   amc           398 non-null    bool   
 9   audi          398 non-null    bool   
 10  bmw           398 non-null    bool   
 11  buick         398 non-null    bool   
 12  cadillac      398 non-null    bool   
 13  capri         398 non-null    bool   
 14  chevroelt     398 non-null    bool   
 15  chevrolet     398 non-null    bool   
 16  chrysler      398 non-null    bool   
 17  datsun        398 non-null    bool   
 18  dodge         398 non-null    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car['horsepower'].fillna(mean_hp, inplace=True)


Spliting the data into features and target variable

In [67]:
X = car.drop('mpg', axis=1) 
y = car['mpg']  


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Fitting model

In [68]:
# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 9.68255722000267
R-squared (R2): 0.8199144941587723
