<a href="https://colab.research.google.com/github/leemichaelwaters/ml-examples/blob/main/1_Predict_auto_mpg_(linear_regression).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary
- Import auto data
- EDA performed
  - Dataset has 8 features and mpg label
  - 1 feature dropped
  - Null horsepower replaced with mean
  - One-hot encoding for categorical variables
- Data split into training (80%) and testing (20%) sets
- Fit linear regression to training data
- Model used to generate predictions on test data
- Mean squared error of predictions on the test set is 8.3
- The root mean squared error is therefore about 2.9, i.e. a typical prediction is approximately +/- 3 mpg off target

# Import data

In [None]:
import pandas as pd

# Location of the raw Auto MPG data file (whitespace-delimited text).
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Column names are supplied explicitly via `names=` because none are read from the file.
# NOTE(review): 'model' presumably holds the model year (values such as 70) — confirm.
colNames = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
            'acceleration', 'model', 'origin', 'car_name']

# '?' entries are parsed as NaN. The separator is written as a raw string:
# '\s' inside a plain string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
auto = pd.read_csv(url, names=colNames, na_values='?', sep=r'\s+')

# Exploratory data analysis

In [None]:
# First look at the loaded frame: its dimensions, then the leading rows.
print('Shape:', auto.shape, sep='\n')
print()
print('Head:', auto.head(), sep='\n')

Shape:
(398, 9)

Head:
    mpg  cylinders  displacement  horsepower  weight  acceleration  model  \
0  18.0          8         307.0       130.0  3504.0          12.0     70   
1  15.0          8         350.0       165.0  3693.0          11.5     70   
2  18.0          8         318.0       150.0  3436.0          11.0     70   
3  16.0          8         304.0       150.0  3433.0          12.0     70   
4  17.0          8         302.0       140.0  3449.0          10.5     70   

   origin                   car_name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


In [None]:
# Drop car name because it is not used as a feature.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this
# cell re-runnable: executing it a second time no longer raises KeyError.
auto = auto.drop(columns='car_name', errors='ignore')

# Check for nulls
print('Info:')
# info() writes its report to stdout itself and returns None, so the wrapping
# print() also emits a trailing "None" line.
print(auto.info()) # 6 null values in horsepower

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model         398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB
None


In [None]:
# Replace null horsepower with the column mean.
# The {column: value} mapping restricts the fill to 'horsepower'; a bare
# fillna(value) would write the horsepower mean into NaNs of every column.
# NOTE(review): the mean is computed over all rows, before the train/test split.
auto = auto.fillna({'horsepower': auto['horsepower'].mean()})

# Re-check for nulls
print('Info:')
# info() prints its own report and returns None, hence the trailing "None".
print(auto.info()) # no null values

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model         398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB
None


In [None]:
# One-hot encoding of categorical variables.
# The guard makes the cell safe to re-run once 'origin' has been replaced.
if 'origin' in auto.columns:
    # dtype=int pins 0/1 indicator columns; without it newer pandas versions
    # produce boolean (True/False) columns instead.
    origin_oh = pd.get_dummies(auto['origin'], prefix='origin', dtype=int)
    # Append the indicator columns and remove the original categorical column.
    auto = pd.concat([auto, origin_oh], axis=1).drop(columns='origin')
    # NOTE(review): every indicator level is kept; together with a fitted
    # intercept these columns are perfectly collinear — consider
    # drop_first=True if the coefficients are to be interpreted.

print('Head:')
print(auto.head())

Head:
    mpg  cylinders  displacement  horsepower  weight  acceleration  model  \
0  18.0          8         307.0       130.0  3504.0          12.0     70   
1  15.0          8         350.0       165.0  3693.0          11.5     70   
2  18.0          8         318.0       150.0  3436.0          11.0     70   
3  16.0          8         304.0       150.0  3433.0          12.0     70   
4  17.0          8         302.0       140.0  3449.0          10.5     70   

   origin_1  origin_2  origin_3  
0         1         0         0  
1         1         0         0  
2         1         0         0  
3         1         0         0  
4         1         0         0  


# Train and test model

In [None]:
# Import packages
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Target is mpg; every remaining column is used as a feature
y = auto['mpg']
X = auto.drop(columns='mpg')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=42)
print('Xtrain, Xtest, ytrain, ytest:')
print(*(part.shape for part in (Xtrain, Xtest, ytrain, ytest)))
print()

# Linear regression with a fitted intercept, trained on the training split
model = LinearRegression(fit_intercept=True)
model.fit(Xtrain, ytrain)
print('Model coefficients:', model.coef_, sep='\n')
print()

# Predict on the held-out rows and report the mean squared error
y_model = model.predict(Xtest)
print('MSE:', mean_squared_error(ytest, y_model), sep='\n')

Xtrain, Xtest, ytrain, ytest:
(318, 9) (80, 9) (318,) (80,)

Model coefficients:
[-0.16373048  0.01958399 -0.01334457 -0.00707275  0.07335016  0.82739747
 -1.86404853  1.07519552  0.78885302]

MSE:
8.339142500255893
