## Import Statements

In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

## Loading Data

In [92]:
# Read the Auto MPG CSV into a DataFrame and preview its first five rows
auto_mpg_data = pd.read_csv('../../Datasets/Regression/auto_mpg.csv')
print('Data Loaded Successfully')
display(auto_mpg_data.head())
# Note: the origin column is coded as 1=USA, 2=Europe, 3=Japan

# Report the dimensions of the frame. shape is (rows, columns), so the
# "features" count here is simply the number of columns that were loaded.
num_samples, num_features = auto_mpg_data.shape
print('Number of Samples:', num_samples)
print('Number of Features:', num_features)
print('Data Shape:', (num_samples, num_features)) #(Samples, Features)

Data Loaded Successfully


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


Number of Samples: 398
Number of Features: 9
Data Shape: (398, 9)


## Preprocessing Data

In [93]:
#Drop car name and origin column
auto_mpg_data = auto_mpg_data.drop(columns=['car name','origin'])
print('Car name & origin column dropped\n')
display(auto_mpg_data.head())

#Check for and drop null values (the null values are represented as '?')
#Because of the '?' placeholders the horsepower column holds strings, not
#numbers. Mark the placeholders as NaN and then convert the column to a
#numeric dtype so that later numeric steps see real numbers.
#pd.to_numeric keeps its default errors='raise', so any other unexpected
#non-numeric entry fails loudly instead of silently becoming NaN.
auto_mpg_data['horsepower'] = pd.to_numeric(
    auto_mpg_data['horsepower'].replace('?', np.nan)
)

#Remove every row that has at least one missing value
auto_mpg_data = auto_mpg_data.dropna(axis=0, how='any')

#num_samples is the row count recorded right after loading the data
num_samples_after_drop = auto_mpg_data.shape[0]
print(f'\n{num_samples - num_samples_after_drop} null containing samples dropped')
print('Number of Samples after drop:', num_samples_after_drop)


Car name & origin column dropped



Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
0,18.0,8,307.0,130,3504,12.0,70
1,15.0,8,350.0,165,3693,11.5,70
2,18.0,8,318.0,150,3436,11.0,70
3,16.0,8,304.0,150,3433,12.0,70
4,17.0,8,302.0,140,3449,10.5,70



6 null containing samples dropped
Number of Samples after drop: 392


## Examining Data

In [None]:
#List the column labels currently present in the frame
print('Keys in the dataset:')
print(auto_mpg_data.keys(), end='\n\n')

#Compute the pairwise correlation matrix and render it as a heat map.
#vmin/vmax pin the colour scale to -1..1, and axis=None applies a single
#gradient across the whole table.
corr_mat = auto_mpg_data.corr().style.background_gradient(
    cmap='coolwarm', vmin=-1, vmax=1, axis=None
)
display(corr_mat)


Keys in the dataset:
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year'],
      dtype='object')



Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
mpg,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,0.580541
cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647
displacement,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361
weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912
acceleration,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316
model year,0.580541,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0


## Setting Target and Features

In [95]:
#Pull the mpg column out as the target variable
target_data = auto_mpg_data.loc[:, 'mpg']
print('Target variable set')

#Everything except mpg stays behind as the feature columns.
#drop returns a new frame, which is rebound to the same name.
auto_mpg_data = auto_mpg_data.drop('mpg', axis=1)
print('Target variable dropped from data')

Target variable set
Target variable dropped from data
