In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the auto-mpg dataset straight from DataCamp's asset server
AUTO_CSV_URL = 'https://assets.datacamp.com/production/repositories/628/datasets/3781d588cf7b04b1e376c7e9dda489b3e6c7465b/auto.csv'
df = pd.read_csv(AUTO_CSV_URL)

In [3]:
# Preview the first five rows to get a feel for the columns and their values
df.head()

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


In [4]:
# Print each column's dtype and non-null count to spot columns that need preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mpg     392 non-null    float64
 1   displ   392 non-null    float64
 2   hp      392 non-null    int64  
 3   weight  392 non-null    int64  
 4   accel   392 non-null    float64
 5   origin  392 non-null    object 
 6   size    392 non-null    float64
dtypes: float64(4), int64(2), object(1)
memory usage: 21.6+ KB


The `origin` column has dtype `object` (it stores category labels as strings), so it must be converted to numeric dummy variables as a pre-processing step before the data can be passed to a machine-learning model.

# Compare the distribution of fuel economy (mpg) across the origin categories
sns.set_theme()
fig, ax = plt.subplots(figsize=(10, 7))
# Draw on the axes created above explicitly rather than relying on pyplot's "current axes"
sns.boxplot(x='origin', y='mpg', data=df, ax=ax)
# A title and readable labels let the figure stand alone; the trailing `;` hides the repr
ax.set(title='Fuel economy by origin', xlabel='Origin', ylabel='Miles per gallon (mpg)');

In [5]:
# Encode the origin column into dummy variables.
# `columns` restricts the encoding to `origin` so no other column is ever expanded by accident;
# `dtype=int` keeps the indicators as 0/1 integers (pandas >= 2.0 would otherwise emit booleans).
df_origin = pd.get_dummies(df, columns=['origin'], dtype=int)
df_origin.head()

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Asia,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,0,1
1,9.0,304.0,193,4732,18.5,20.0,0,0,1
2,36.1,91.0,60,1800,16.4,10.0,1,0,0
3,18.5,250.0,98,3525,19.0,15.0,0,0,1
4,34.3,97.0,78,2188,15.8,10.0,0,1,0


In [6]:
# Exactly one of the three origin indicators is set on every row, so any one of them is
# implied by the other two; drop origin_Asia and treat it as the baseline category
df_origin = df_origin.drop(columns=['origin_Asia'])
df_origin.head()

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,1
1,9.0,304.0,193,4732,18.5,20.0,0,1
2,36.1,91.0,60,1800,16.4,10.0,0,0
3,18.5,250.0,98,3525,19.0,15.0,0,1
4,34.3,97.0,78,2188,15.8,10.0,1,0


# Linear regression with dummy variables

In [7]:
# Import dependencies
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [8]:
# Feature matrix: every column except the target, as a NumPy array
X = df_origin.drop(columns=['mpg']).to_numpy()
# Target vector: the mpg column
y = df_origin['mpg'].to_numpy()

In [9]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Instantiate and train model.
# `Ridge(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# feature scaling is done explicitly in a pipeline instead.
# The old option divided each centred column by its L2 norm (std * sqrt(n_samples)), whereas
# StandardScaler divides by the std only; multiplying alpha by n_samples compensates, so the
# fitted model matches the original alpha=0.5 setting. Centring is left to Ridge itself
# (fit_intercept=True), hence with_mean=False.
ridge = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.5 * len(X_train)),
).fit(X_train, y_train)

In [11]:
# Evaluate the fitted model on the held-out test split (for scikit-learn regressors, score is R^2)
ridge.score(X_test, y_test)

0.7190645190217895