In [10]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [11]:
# Load the mpg dataset; the path is relative to this notebook's location.
df = pd.read_csv(filepath_or_buffer='../../datasets/mpg.csv')

In [12]:
# Peek at the first five rows to check the columns that were read in.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [13]:
# Same preview, transposed so each column of the frame is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,1,2,3,4,5
manufacturer,audi,audi,audi,audi,audi
model,a4,a4,a4,a4,a4
displ,1.8,1.8,2,2,2.8
year,1999,1999,2008,2008,1999
cyl,4,4,4,4,6
trans,auto(l5),manual(m5),manual(m6),auto(av),auto(l5)
drv,f,f,f,f,f
cty,18,21,20,21,16
hwy,29,29,31,30,26


In [14]:
# Drop the leftover id column ('Unnamed: 0') that was read in from the CSV.
# `axis` is passed by keyword on purpose: the positional form
# `drop(label, 1)` was deprecated in pandas 1.3 and raises a TypeError
# from pandas 2.0 onwards, while the keyword form works on both.

df = df.drop('Unnamed: 0', axis=1)
df.head().transpose()

Unnamed: 0,0,1,2,3,4
manufacturer,audi,audi,audi,audi,audi
model,a4,a4,a4,a4,a4
displ,1.8,1.8,2,2,2.8
year,1999,1999,2008,2008,1999
cyl,4,4,4,4,6
trans,auto(l5),manual(m5),manual(m6),auto(av),auto(l5)
drv,f,f,f,f,f
cty,18,21,20,21,16
hwy,29,29,31,30,26
fl,p,p,p,p,p


In [16]:
# Build the training set: a reproducible random 70% sample of the rows
# (the fixed random_state makes the sample identical on every run).

train = df.sample(frac=0.7, random_state=1)

In [17]:
# The testing set is every row whose index label was not sampled into train.
test = df.drop(index=train.index)

In [18]:
# Non-null count per column of the training set.
train.count(axis=0)

manufacturer    164
model           164
displ           164
year            164
cyl             164
trans           164
drv             164
cty             164
hwy             164
fl              164
class           164
dtype: int64

In [19]:
# Non-null count per column of the testing set.
test.count(axis=0)

manufacturer    70
model           70
displ           70
year            70
cyl             70
trans           70
drv             70
cty             70
hwy             70
fl              70
class           70
dtype: int64

In [20]:
# Features (displacement and vehicle class) and target (highway mpg)
# for the training rows, converted to NumPy arrays.
train_X = np.asarray(train.loc[:, ['displ', 'class']])
train_y = np.asarray(train.loc[:, 'hwy'])

In [21]:
# Same feature/target selection for the held-out testing rows.
test_X = np.asarray(test.loc[:, ['displ', 'class']])
test_y = np.asarray(test.loc[:, 'hwy'])

In [22]:
# Instantiate an (unfitted) ordinary least squares regressor.
# fit_intercept=True is the library default, spelled out here for clarity.

rgr = linear_model.LinearRegression(fit_intercept=True)

The LinearRegression() estimator in sklearn needs numeric (float) inputs, but the 'class' column is full of strings. To get around this, we'll have to convert this column into dummy variables.

That means we'll have to create one column for each distinct value of 'class'. For a given observation, the column that matches its class is 1, and all of the other class columns are 0.

In [24]:
# One indicator column per distinct value of 'class'.
class_dummies = pd.get_dummies(data=df['class'])
class_dummies.head(n=5)

Unnamed: 0,2seater,compact,midsize,minivan,pickup,subcompact,suv
0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0


In [26]:
# Append the indicator columns to the original frame, side by side.
df2 = pd.concat([df, class_dummies], axis='columns')
df2.head().T

Unnamed: 0,0,1,2,3,4
manufacturer,audi,audi,audi,audi,audi
model,a4,a4,a4,a4,a4
displ,1.8,1.8,2,2,2.8
year,1999,1999,2008,2008,1999
cyl,4,4,4,4,6
trans,auto(l5),manual(m5),manual(m6),auto(av),auto(l5)
drv,f,f,f,f,f
cty,18,21,20,21,16
hwy,29,29,31,30,26
fl,p,p,p,p,p


In [27]:
# Redo the 70/30 train/test split, this time on the frame that carries
# the class indicator columns (same seed as the earlier split).

train = df2.sample(frac=0.7, random_state=1)
test = df2.drop(index=train.index)

In [29]:
# Training features: displacement plus the seven class indicator columns.
train_X = np.asarray(train.loc[:, [
    'displ', '2seater', 'compact', 'midsize',
    'minivan', 'pickup', 'subcompact', 'suv',
]])

# Training target: highway mpg.
train_y = np.asarray(train.loc[:, 'hwy'])

In [31]:
# Testing features: same columns, in the same order, as the training features.
test_X = np.asarray(test.loc[:, [
    'displ', '2seater', 'compact', 'midsize',
    'minivan', 'pickup', 'subcompact', 'suv',
]])

# Testing target: highway mpg.
test_y = np.asarray(test.loc[:, 'hwy'])

In [33]:
# Fit the regression on the training features and target.
rgr.fit(X=train_X, y=train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
# Intercept term of the fitted regression (populated by the rgr.fit call).
rgr.intercept_

32.995677220025996

In [35]:
# Show the fitted coefficients, in the same order as the train_X columns
# (displ first, then the seven class indicator columns).
# NOTE(review): the label is misspelled ("Coffefients"); it is left unchanged
# here so that it still matches the recorded cell output.
print('Coffefients: \n', rgr.coef_)

Coffefients: 
 [-2.42737226  7.99592858  0.75584649  1.17086032 -1.78384786 -5.48121739
  1.33358906 -3.99115918]


In [37]:
# Pair each feature name with its fitted coefficient so the numbers
# are easier to read; the names follow the column order of train_X.

z = zip(
    [
        'displ', '2seater', 'compact', 'midsize',
        'minivan', 'pickup', 'subcompact', 'suv',
    ],
    rgr.coef_,
)

list(z)

[('displ', -2.4273722565586482),
 ('2seater', 7.9959285758845358),
 ('compact', 0.75584648787547315),
 ('midsize', 1.1708603177117243),
 ('minivan', -1.7838478624497764),
 ('pickup', -5.4812173947250455),
 ('subcompact', 1.3335890606586249),
 ('suv', -3.991159184955535)]

In [38]:
# R-squared of the fitted model, evaluated on the training set.
# A score of 1 would mean the model predicts the target perfectly.

print('R Squared: {}'.format(rgr.score(X=train_X, y=train_y)))

R Squared: 0.8045617859827147


In [41]:
# Predict highway mpg for the held-out rows from their
# displacement and class indicator columns.

rgr.predict(X=test_X)

array([ 29.38225365,  28.89677919,  29.38225365,  29.38225365,
        28.89677919,  27.36989522,  16.13944508,  14.4402845 ,
        27.15558393,  27.15558393,  25.94189781,  25.94189781,
        15.16849617,  25.38613594,  23.20150091,  23.20150091,
        18.53318248,  18.04770802,  16.10581022,  19.53776623,
        17.59586843,  16.3821823 ,  14.68302172,  16.10581022,
        16.10581022,  14.89212409,  13.67843796,  15.89670785,
        19.29502901,  19.29502901,  16.34854745,  23.1633539 ,
        30.44547067,  29.95999622,  28.34084412,  28.34084412,
        29.47452177,  27.77536119,  17.59586843,  15.16849617,
        14.19754727,  18.80955456,  17.83860565,  15.89670785,
        15.89670785,  19.29502901,  17.83860565,  16.86765675,
        27.92583029,  27.92583029,  28.0981069 ,  25.67073464,
        25.67073464,  15.4112334 ,  24.94252296,  24.94252296,
        20.75145236,  17.59586843,  28.34084412,  25.74119526,
        29.38225365,  20.96055473,  20.96055473,  19.26