# Auto MPG

# NJIT
John Morrison
Joseph Bennett

## Necessary Libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Read data

In [2]:
data = pd.read_csv('auto-mpg.data', sep='\s+')
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [3]:
data.shape

(398, 9)

## Data Preprocessing

In [4]:
for column in data:
    print("{:14}: {}".format(column, data[column].dtype))

mpg           : float64
cylinders     : int64
displacement  : float64
horsepower    : object
weight        : float64
acceleration  : float64
model_year    : int64
origin        : int64
car_name      : object


In [5]:
#Drops any data that is not available, is this a good idea?
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')
data = data.dropna(subset=['horsepower'])
for column in data:
    print("{:14}: {}".format(column, data[column].dtype))

mpg           : float64
cylinders     : int64
displacement  : float64
horsepower    : float64
weight        : float64
acceleration  : float64
model_year    : int64
origin        : int64
car_name      : object


In [6]:
data = data.drop(['car_name'], axis=1)

## Separate data into training set (80%) and test set (20%)

In [7]:
rows = data.shape[0]
train_num = round(rows * .8)
train_set = data[:train_num]
test_set = data[train_num:]

### Set the dependent (target) and independent sets

In [8]:
X_train = train_set.drop(['mpg'],axis=1)
y_train = pd.DataFrame(train_set, columns=['mpg'])
X_train.values.reshape(-1, 7)
y_train.values.reshape(-1, 1)
X_test = test_set.drop(['mpg'],axis=1)
y_test = pd.DataFrame(test_set, columns=['mpg'])

## Independent 

In [9]:
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504.0,12.0,70,1
1,8,350.0,165.0,3693.0,11.5,70,1
2,8,318.0,150.0,3436.0,11.0,70,1
3,8,304.0,150.0,3433.0,12.0,70,1
4,8,302.0,140.0,3449.0,10.5,70,1


## Target

In [10]:
y_train.head()

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0


## Select and fit the model

In [11]:
ols = LinearRegression()
model = ols.fit(X_train, y_train)

## Generate Predictions

In [12]:
mpg_y_pred = ols.predict(X_test)

In [13]:
ols.coef_

array([[-0.51555173,  0.01018991, -0.01789524, -0.00557231, -0.11394508,
         0.58120339,  0.85066081]])

## Evaluate the Model

In [14]:
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, mpg_y_pred)))

Mean Squared Error: 27.84


In [15]:
print("Variance Score: {:.2f}".format(r2_score(y_test, mpg_y_pred)))

Variance Score: 0.23
