In [1]:
# Linear Regression 

import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

import sklearn

%precision 3

'%.3f'

In [2]:
import requests, zipfile
import io
# Source file: the "imports-85" automobile table hosted on the UCI archive.
auto_data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Fail fast on a network or HTTP error instead of handing an error page to the
# CSV parser; the timeout keeps the cell from hanging indefinitely.
response = requests.get(auto_data_url, timeout=30)
response.raise_for_status()
res = response.content

# The file ships without a header row, so the column names are assigned by hand.
# Missing entries appear as the literal string '?' and are left untouched here.
auto_data = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)
auto_data.columns = [
    "symboling", "normalized-losses", "make", "fuel-type",
    "aspiration", "num-of-doors", "body-style", "drive-wheels",
    "engine-location", "wheel-base", "length",
    "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system",
    "bore", "stroke", "compression-ratio", "horsepower",
    "peak-rpm", "city-mpg", "highway-mpg", "price",
]

In [3]:
# Dimensions of the raw table as (rows, columns).
auto_data.shape

(205, 26)

In [4]:
# Preview the first rows of the raw table; note the '?' entries in some columns.
auto_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [5]:
# Keep the target (price) plus three candidate predictors, then tally per
# column how many cells hold the '?' placeholder.
selected_columns = ['price', 'horsepower', 'width', 'height']
auto = auto_data[selected_columns]
is_placeholder = auto.isin(['?'])
is_placeholder.sum()


price         4
horsepower    2
width         0
height        0
dtype: int64

In [6]:
# Treat the '?' placeholder as missing, then discard every row that has one.
auto_with_nan = auto.replace('?', np.nan)
auto = auto_with_nan.dropna()
auto.shape

(199, 4)

In [7]:
# Check the column dtypes to see which columns still need numeric conversion.
auto.dtypes

price          object
horsepower     object
width         float64
height        float64
dtype: object

In [8]:
# price and horsepower are still stored as text; parse both into numbers
# in a single assign call.
auto = auto.assign(
    price=pd.to_numeric(auto['price']),
    horsepower=pd.to_numeric(auto['horsepower']),
)
auto.dtypes

price           int64
horsepower      int64
width         float64
height        float64
dtype: object

In [9]:
# Pairwise correlation matrix between price and the three candidate predictors.
auto.corr()

Unnamed: 0,price,horsepower,width,height
price,1.0,0.810533,0.753871,0.13499
horsepower,0.810533,1.0,0.615315,-0.087407
width,0.753871,0.615315,1.0,0.309223
height,0.13499,-0.087407,0.309223,1.0


In [10]:
# Linear regression: model price from horsepower, width and height.


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Target (dependent variable) is price; every other column is a predictor.
y = auto['price']
X = auto.drop(columns='price')

# Hold out half of the rows for testing; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.5, random_state=0)
X_train, X_test, y_train, y_test = split

# Fit a linear model on the training half (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# model.score reports the coefficient of determination (R^2) on the given data.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('r score(train): {:.3f}'.format(train_score))
print('r score(test): {:.3f}'.format(test_score))
print()

# Label each fitted coefficient with the predictor column it belongs to.
coefficients = pd.Series(model.coef_, index=X.columns)
print('regression coef:\n{}'.format(coefficients))
print('intercept: {:.3f}'.format(model.intercept_))


r score(train): 0.733
r score(test): 0.737

regression coef:
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
intercept: -128409.046


In [12]:
# R8-1
# Second feature set: price together with width and engine-size.
feature_frame = auto_data[['price', 'width', 'engine-size']]

# Treat '?' as missing and drop the affected rows.
auto = feature_frame.replace('?', np.nan).dropna()

# price held '?' entries, so it is still a column of strings after the rows
# are dropped. Convert it explicitly so the regression target is numeric
# rather than depending on the estimator to coerce text.
auto = auto.assign(price=pd.to_numeric(auto['price']))
auto.shape

(201, 3)

In [13]:
# Target is price; the remaining columns are the predictors.
y = auto['price']
X = auto.drop(columns='price')

# Same 50/50 split and seed as the earlier model, so the scores are comparable.
split = train_test_split(X, y, test_size=0.5, random_state=0)
X_train, X_test, y_train, y_test = split

model = LinearRegression().fit(X_train, y_train)

# model.score reports the coefficient of determination (R^2).
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('r score(train): {:.3f}'.format(train_score))
print('r score(test): {:.3f}'.format(test_score))

r score(train): 0.783
r score(test): 0.778


In [15]:
# Show the first rows of the raw table again.
auto_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [16]:
# List every column name available in the raw table.
auto_data.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [19]:
# Third feature set: price with cylinder count, horsepower, highway-mpg and stroke.
auto = auto_data[['price', 'num-of-cylinders', 'horsepower', 'highway-mpg', 'stroke']]

# Treat '?' as missing and drop the affected rows.
auto = auto.replace('?', np.nan).dropna()

# price and horsepower are stored as strings because of the '?' entries;
# stroke is converted as well in case it was read the same way (a no-op if it
# is already numeric). num-of-cylinders stays as text for now.
numeric_as_text = ['price', 'horsepower', 'stroke']
auto = auto.assign(**{col: pd.to_numeric(auto[col]) for col in numeric_as_text})
auto.shape

(195, 5)

In [24]:
# Separate the target (price) from the predictor columns.
target_name = 'price'
y = auto[target_name]
X = auto.drop(columns=[target_name])




In [25]:
# Frequency of each cylinder label; the values are words, not numbers.
X['num-of-cylinders'].value_counts()

four      155
six        24
five       10
eight       4
twelve      1
three       1
Name: num-of-cylinders, dtype: int64

In [23]:
# Lookup table from a spelled-out cylinder count to its integer value.
cylin_map = dict(
    two=2,
    three=3,
    four=4,
    five=5,
    six=6,
    eight=8,
    twelve=12,
)
cylin_map

{'two': 2,
 'three': 3,
 'four': 4,
 'five': 5,
 'six': 6,
 'eight': 8,
 'twelve': 12}

In [27]:
# Translate the word labels into integers via cylin_map and store them in a
# new column next to the original text column.
cylinder_numbers = X['num-of-cylinders'].map(cylin_map)
X['num_cylin'] = cylinder_numbers
X.head()

Unnamed: 0,num-of-cylinders,horsepower,highway-mpg,stroke,num_cylin
0,four,111,27,2.68,4
1,four,111,27,2.68,4
2,six,154,26,3.47,6
3,four,102,30,3.4,4
4,five,115,22,3.4,5


In [29]:
# num_cylin now carries the cylinder count, so remove the original text
# column from the predictors.
X = X.drop(columns=['num-of-cylinders'])
X.head()

Unnamed: 0,horsepower,highway-mpg,stroke,num_cylin
0,111,27,2.68,4
1,111,27,2.68,4
2,154,26,3.47,6
3,102,30,3.4,4
4,115,22,3.4,5


In [30]:
# Same 50/50 split and seed as the earlier models.
split = train_test_split(X, y, test_size=0.5, random_state=0)
X_train, X_test, y_train, y_test = split

model = LinearRegression().fit(X_train, y_train)

# model.score reports the coefficient of determination (R^2).
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('r score(train): {:.3f}'.format(train_score))
print('r score(test): {:.3f}'.format(test_score))

# Observed result:
#   r score(train): 0.801
#   r score(test): 0.555
# The gap between the train and test scores may indicate overfitting.

r score(train): 0.801
r score(test): 0.555
