In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the mpg data set from the working directory and summarise its columns
MPG_CSV = "mpg.csv"
mpg = pd.read_csv(MPG_CSV)
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 11 columns):
manufacturer    234 non-null object
model           234 non-null object
displ           234 non-null float64
year            234 non-null int64
cyl             234 non-null int64
trans           234 non-null object
drv             234 non-null object
cty             234 non-null int64
hwy             234 non-null int64
fl              234 non-null object
class           234 non-null object
dtypes: float64(1), int64(4), object(6)
memory usage: 20.2+ KB


In [3]:
# Preview the first three rows of the loaded frame
mpg.head(3)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact


In [4]:
# Create the X, independent, variable set.
# The predictors are named explicitly rather than obtained by dropping every
# other column, so a column added to the CSV later cannot silently become a
# model input.
FEATURE_COLUMNS = ['displ', 'year', 'cyl']
X = mpg[FEATURE_COLUMNS]
print(X.head(3))

# Create the Y, dependent, variable set.
# The double brackets keep y as a one-column DataFrame (not a Series); the
# coefficient and intercept cells index the fitted arrays with [0] accordingly.
y = mpg[['hwy']]
print(y.head(3))


   displ  year  cyl
0    1.8  1999    4
1    1.8  1999    4
2    2.0  2008    4
   hwy
0   29
1   29
2   31


In [5]:
# Hold out a quarter of the rows for testing; the fixed seed makes the
# partition repeatable from run to run

TEST_FRACTION = 0.25
SPLIT_SEED = 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [6]:
# Summarise the training features: row count, dtypes and non-null counts
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175 entries, 110 to 37
Data columns (total 3 columns):
displ    175 non-null float64
year     175 non-null int64
cyl      175 non-null int64
dtypes: float64(1), int64(2)
memory usage: 5.5 KB


In [7]:
# Summarise the held-out test features: row count, dtypes and non-null counts
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59 entries, 199 to 47
Data columns (total 3 columns):
displ    59 non-null float64
year     59 non-null int64
cyl      59 non-null int64
dtypes: float64(1), int64(2)
memory usage: 1.8 KB


In [8]:
# Fit a linear regression on the training split
from sklearn.linear_model import LinearRegression

# fit() hands back the estimator itself, so construction and training chain
# into a single expression
regression_model = LinearRegression().fit(X_train, y_train)
regression_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
# Show the fitted coefficient for each feature.
# coef_ is indexed as 2-D here; row 0 holds one weight per column of X_train.
feature_weights = regression_model.coef_[0]
for col_name, weight in zip(X_train.columns, feature_weights):
    print("The coefficient for {} is {}".format(col_name, weight))

The coefficient for displ is -1.9165592568078726
The coefficient for year is 0.12656967512127304
The coefficient for cyl is -1.417714403935386


In [10]:
# Show the fitted intercept.
# intercept_ is array-like here; element 0 is the constant term for the
# single target column.
intercept = regression_model.intercept_[0]
template = "The intercept for our model is {}"
print(template.format(intercept))

The intercept for our model is -215.202820113819


In [11]:
# View the R-squared value of the model on the held-out test split.
# 0.718 with year, displ, and cyl: the model explains roughly 72% of the
# variance in hwy on the test rows. R-squared is a goodness-of-fit measure,
# not an accuracy percentage.

regression_model.score(X_test, y_test)

0.71804984077352574

In [12]:
# What is RMSE of the predictions on the test split?
# RMSE is in the same units as hwy, so it reads as a typical prediction error.
# Previously observed: 2.96 with year, displ, cyl

import math

from sklearn.metrics import mean_squared_error

y_predict = regression_model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE happens to be
# symmetric, but the documented order keeps the call correct if the metric is
# ever swapped for an asymmetric one.
regression_model_mse = mean_squared_error(y_test, y_predict)
print("Mean Squared Error =", regression_model_mse)
rmse = math.sqrt(regression_model_mse)
print("RMSE =", rmse)

Mean Squared Error = 8.78783811417
RMSE = 2.964428800658368


In [15]:
# Predict mileage using (displ, year, cyl) for Legacy.
# The model was fitted on a DataFrame, so the query row is wrapped in a frame
# with the same column labels. This ties each value to its feature by name
# and avoids the feature-name mismatch warning a bare nested list triggers
# on current scikit-learn.
legacy_features = pd.DataFrame([[2.5, 2014, 4]], columns=X_train.columns)
regression_model.predict(legacy_features)

array([[ 29.24624982]])