In [1]:
# import required libraries

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load the Auto dataset from the CSV file into a dataframe
# and preview its first 5 rows.

df = pd.read_csv(filepath_or_buffer="../Auto.csv")

df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
# Inspect each column's dtype and non-null count before modelling, so that any
# column loaded with an unexpected type is caught early.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    object 
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.0+ KB


In [4]:
# horsepower is loaded as an object (string) column, which means at least one
# entry is not a valid number (presumably a placeholder for missing data -- TODO confirm).
# Convert the whole column in a single vectorized call instead of element by element;
# errors="coerce" turns every unparseable entry into NaN so it can be dealt with explicitly.

df["horsepower"] = pd.to_numeric(df["horsepower"], errors="coerce")

In [5]:
# Re-run info() after the conversion to confirm horsepower is now numeric
# and to see whether the coercion introduced any missing values.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 28.0+ KB


In [6]:
# The numeric conversion left 5 missing values in horsepower. Drop exactly those
# rows: subset= restricts the null check to horsepower, so a null in some
# unrelated column can never silently remove additional rows.

df = df.dropna(subset=["horsepower"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   year          392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 30.6+ KB


In [7]:
# Single-predictor setup: horsepower is the only feature, mpg is the response.
# X is selected with a list so it stays two-dimensional (a one-column DataFrame);
# y is selected with a plain label so it is one-dimensional (a Series).

y = df.loc[:, "mpg"]
X = df.loc[:, ["horsepower"]]

In [8]:
# Validation-set approach: hold out half of the rows for testing
# (the Python counterpart of `train <- sample(392, 196)` in R).
# random_state plays the role of R's seed. It is fixed (5 here; 42 is another popular
# choice) so that the split, and every result derived from it, is the same on each run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=5,
)

In [9]:
# Baseline model: linear regression of mpg on horsepower,
# fitted on the training half only.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [10]:
# Score the linear model on the held-out half:
# predict mpg for the test rows, then report the test mean squared error.

prediction = lr.predict(X_test)

mean_squared_error(y_true=y_test, y_pred=prediction)

23.442643969985735

In [11]:
# Build degree-2 polynomial features for a quadratic regression.
# The transformer is fitted on the training data only and then merely applied
# (transform, not fit_transform) to the test data, so the test set never takes
# part in fitting. The resulting features are the same for PolynomialFeatures,
# but this is the pattern that stays correct for transformers that learn
# statistics from the data they are fitted on.

poly_2 = PolynomialFeatures(degree=2)
X_train_2 = poly_2.fit_transform(X_train)
X_test_2 = poly_2.transform(X_test)

In [12]:
# Fit the quadratic model on the degree-2 training features,
# then report its mean squared error on the held-out test half.
lr_poly_2 = LinearRegression().fit(X_train_2, y_train)

prediction_poly_2 = lr_poly_2.predict(X_test_2)
mean_squared_error(y_true=y_test, y_pred=prediction_poly_2)

18.550198801910284

In [13]:
# Build degree-3 polynomial features for a cubic regression.
# The transformer is fitted on the training data only and then merely applied
# (transform, not fit_transform) to the test data, so the test set never takes
# part in fitting. The resulting features are the same for PolynomialFeatures,
# but this is the pattern that stays correct for transformers that learn
# statistics from the data they are fitted on.

poly_3 = PolynomialFeatures(degree=3)
X_train_3 = poly_3.fit_transform(X_train)
X_test_3 = poly_3.transform(X_test)

In [14]:
# Fit the cubic model on the degree-3 training features,
# then report its mean squared error on the held-out test half.
lr_poly_3 = LinearRegression().fit(X_train_3, y_train)

prediction_poly_3 = lr_poly_3.predict(X_test_3)
mean_squared_error(y_true=y_test, y_pred=prediction_poly_3)

18.59522229454283