In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
# Load the mpg dataset from a CSV file in the current working directory
mpg = pd.read_csv("mpg.csv")
# Print the column names, non-null counts and dtypes as a quick sanity check
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 11 columns):
manufacturer    234 non-null object
model           234 non-null object
displ           234 non-null float64
year            234 non-null int64
cyl             234 non-null int64
trans           234 non-null object
drv             234 non-null object
cty             234 non-null int64
hwy             234 non-null int64
fl              234 non-null object
class           234 non-null object
dtypes: float64(1), int64(4), object(6)
memory usage: 20.2+ KB


In [3]:
# Preview the first three rows of the raw data
mpg.head(3)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact


In [15]:
# Create the X, independent, variable set.
# Select the predictors by name instead of dropping every other column, so an
# extra column in the CSV can never slip into the model unnoticed.
X = mpg[['displ', 'cyl']]
print(X.head(3))

# Create the Y, dependent, variable set.
# The double brackets keep y as a one-column DataFrame; later cells index
# coef_[0] and intercept_[0], which relies on this 2-D target.
y = mpg[['hwy']]
print(y.head(3))


   displ  cyl
0    1.8    4
1    1.8    4
2    2.0    4
   hwy
0   29
1   29
2   31


In [16]:
# Hold out 25% of the rows as a test sample; random_state is pinned so the
# same split is produced on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1
)

In [17]:
# Check the size and dtypes of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175 entries, 110 to 37
Data columns (total 2 columns):
displ    175 non-null float64
cyl      175 non-null int64
dtypes: float64(1), int64(1)
memory usage: 4.1 KB


In [18]:
# Check the size and dtypes of the held-out test features
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59 entries, 199 to 47
Data columns (total 2 columns):
displ    59 non-null float64
cyl      59 non-null int64
dtypes: float64(1), int64(1)
memory usage: 1.4 KB


In [19]:
# Fit a linear regression of hwy on the training features
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()
# Left as a bare last expression so the fitted estimator is displayed
regression_model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
# Refit the same regression with statsmodels to get a summary table that
# includes p-values for each term.
import statsmodels.api as sm

p_X, p_y = X_train, y_train

# add_constant supplies the column that appears as the `const` row in the summary
X2 = sm.add_constant(p_X)
est = sm.OLS(p_y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                    hwy   R-squared:                       0.578
Model:                            OLS   Adj. R-squared:                  0.573
Method:                 Least Squares   F-statistic:                     117.8
Date:                Fri, 15 Dec 2017   Prob (F-statistic):           5.85e-33
Time:                        14:59:41   Log-Likelihood:                -487.85
No. Observations:                 175   AIC:                             981.7
Df Residuals:                     172   BIC:                             991.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         38.1714      1.243     30.706      0.0

In [21]:
# Report the fitted coefficient for each predictor.
# coef_[0] is the coefficient row for the single target column (hwy).
for col_name, coefficient in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coefficient))

The coefficient for displ is -1.80856954094
The coefficient for cyl is -1.45133809781


In [22]:
# View the intercept
# intercept_ is indexed with [0] because the model was fit on a one-column
# DataFrame target (y = mpg[['hwy']]), so the fitted intercept is array-like.
intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 38.1714157909


In [23]:
# View the R-squared value of the model on the held-out test set.
# R-squared is the share of the variance in hwy that the model explains; it is
# not an "accuracy" percentage.
# Note from an earlier run that also used `year` as a feature: 0.718.
# The current feature set is displ and cyl only.

regression_model.score(X_test, y_test)

0.69317499296128426

In [24]:
# What is the RMSE of the predictions on the held-out test set?
# (An earlier run that also used `year` as a feature gave 2.96.)

from sklearn.metrics import mean_squared_error
import math

y_predict = regression_model.predict(X_test)
# scikit-learn's signature is mean_squared_error(y_true, y_pred): pass the
# observed values first, then the predictions.
regression_model_mse = mean_squared_error(y_test, y_predict)
# Build one string per line: print("label", value) with two arguments shows a
# tuple when the kernel runs Python 2, as the recorded output of this cell did.
print("Mean Squared Error = {}".format(regression_model_mse))
# RMSE is the square root of the MSE, expressed in the same units as hwy
rmse = math.sqrt(regression_model_mse)
print("RMSE = {}".format(rmse))

('Mean Squared Error =', 9.5631387427958927)
('RMSE =', 3.0924324960774636)


In [26]:
# Predict highway mileage for a Legacy from the two features the model was
# trained on: displ = 2.5 and cyl = 4 (year is not a model input).
# Naming the values and ordering them by X_train.columns keeps the row aligned
# with the columns used in fit(), instead of relying on list position.

legacy_features = pd.DataFrame({'displ': [2.5], 'cyl': [4]}, columns=X_train.columns)
regression_model.predict(legacy_features)

array([[ 27.84463955]])