In [18]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

In [19]:
# Load the raw MPG dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file carries an "Unnamed: 0" column that looks like a saved
# row index; it is dropped in a later cell rather than at read time.
df = pd.read_csv("../data/mpg_dataset.csv")

## Data Cleaning

In [20]:
# Show the rows whose horsepower value is missing.
df[df["horsepower"].isna()]

Unnamed: 0.1,Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin,mpg
32,32,98.0,4,,2046,19.0,71,1,25.0
126,126,200.0,6,,2875,17.0,74,1,21.0
330,330,85.0,4,,1835,17.3,80,2,40.9
336,336,140.0,4,,2905,14.3,80,1,23.6
354,354,100.0,4,,2320,15.8,81,2,34.5
374,374,151.0,4,,3035,20.5,82,1,23.0


In [21]:
# Bucket vehicle weight into six right-closed intervals labelled 1..6.
# The resulting categorical column is used later to stratify the train/test split.
df["weight_cat"] = pd.cut(
    x=df["weight"],
    bins=[0, 500, 2000, 3000, 4000, 4500, float("inf")],
    labels=list(range(1, 7)),
)


In [22]:
# List the current column labels (weight_cat has just been added).
df.columns

Index(['Unnamed: 0', 'displacement', 'cylinders', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'mpg', 'weight_cat'],
      dtype='object')

In [23]:
# Inspect dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Unnamed: 0    398 non-null    int64   
 1   displacement  398 non-null    float64 
 2   cylinders     398 non-null    int64   
 3   horsepower    392 non-null    float64 
 4   weight        398 non-null    int64   
 5   acceleration  398 non-null    float64 
 6   model_year    398 non-null    int64   
 7   origin        398 non-null    int64   
 8   mpg           398 non-null    float64 
 9   weight_cat    398 non-null    category
dtypes: category(1), float64(4), int64(5)
memory usage: 28.7 KB


In [24]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin,mpg
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0,398.0
mean,198.5,193.425879,5.454774,104.469388,2970.424623,15.56809,76.01005,1.572864,23.514573
std,115.036951,104.269838,1.701004,38.49116,846.841774,2.757689,3.697627,0.802055,7.815984
min,0.0,68.0,3.0,46.0,1613.0,8.0,70.0,1.0,9.0
25%,99.25,104.25,4.0,75.0,2223.75,13.825,73.0,1.0,17.5
50%,198.5,148.5,4.0,93.5,2803.5,15.5,76.0,1.0,23.0
75%,297.75,262.0,8.0,126.0,3608.0,17.175,79.0,2.0,29.0
max,397.0,455.0,8.0,230.0,5140.0,24.8,82.0,3.0,46.6


Simple (mean) imputation to fill the missing values in the `horsepower` column

In [25]:
from sklearn.impute import SimpleImputer

# Mean-impute missing values. The imputer is fitted on the whole frame (every
# column, not only horsepower), and the transformed values are wrapped back into
# a DataFrame with the original column labels and index.
# NOTE(review): after this step every column, including weight_cat, is float64.
# The later one-hot column names ("origin_1.0", ...) appear to depend on that
# cast, so restricting the imputer to horsepower would also require updating
# the rename cell.
# NOTE(review): the imputer is fitted before the train/test split, so the means
# are computed over rows that later end up in the test set.
imputer = SimpleImputer(strategy="mean")
df = pd.DataFrame(imputer.fit_transform(X=df), columns=df.columns, index=df.index)

In [26]:
# Re-check dtypes and non-null counts after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    398 non-null    float64
 1   displacement  398 non-null    float64
 2   cylinders     398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    float64
 7   origin        398 non-null    float64
 8   mpg           398 non-null    float64
 9   weight_cat    398 non-null    float64
dtypes: float64(10)
memory usage: 31.2 KB


In [27]:
# Remove the leftover "Unnamed: 0" column; it is not used as a feature.
to_drop_cols = ["Unnamed: 0"]
df = df.drop(columns=to_drop_cols)

In [28]:
# Preview the first two rows after dropping the unused column.
df.head(2)

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin,mpg,weight_cat
0,307.0,8.0,130.0,3504.0,12.0,70.0,1.0,18.0,4.0
1,350.0,8.0,165.0,3693.0,11.5,70.0,1.0,15.0,4.0


In [29]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Fixed seed for the train/test split. Without it every run produces a different
# split, and therefore a different fitted pipeline and different exported
# train/test sets, so the notebook would not be reproducible.
RANDOM_STATE = 42

# Feature groups routed to the individual transformers below.
continuous_features = ["weight", "acceleration"]
ordinal_features = ["cylinders"]
nominal_features = ["origin"]

# Ordinal and continuous features share the same treatment: standard scaling.
ordinal_continous_pipe = Pipeline([
    ("std_scaler_ord", StandardScaler()),
])

# Output column order follows the order of the transformers listed here:
# scaled ordinal features, scaled continuous features, one-hot nominal features.
full_pipeline = ColumnTransformer(
    [
        ("ordinal_pipe", ordinal_continous_pipe, ordinal_features),
        ("cont_pipe", ordinal_continous_pipe, continuous_features),
        ("nominal_pipe", OneHotEncoder(), nominal_features)
    ]
)

# Stratify on the weight bucket so both splits share the same weight distribution.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("mpg", axis=1),
    df["mpg"],
    stratify=df["weight_cat"],
    random_state=RANDOM_STATE,
)

# weight_cat is only a stratification helper, so it is removed before fitting.
# The pipeline is fitted on the training split only and then applied to the test split.
X_train_processed = full_pipeline.fit_transform(X_train.drop("weight_cat", axis=1))
X_test_processed = full_pipeline.transform(X_test.drop("weight_cat", axis=1))

# Rebuild readable column names in the same order as the transformers above;
# the one-hot names come from the fitted encoder.
ohe = full_pipeline.named_transformers_['nominal_pipe']
ohe_columns = ohe.get_feature_names_out(nominal_features)
column_names = ordinal_features + continuous_features + list(ohe_columns)

X_train_processed = pd.DataFrame(X_train_processed, columns=column_names)
X_test_processed = pd.DataFrame(X_test_processed, columns=column_names)

# The processed frames have a fresh 0..n-1 index, so the targets are re-indexed
# the same way before joining; otherwise the join would align on the original
# row labels of the shuffled split.
df_train_processed = X_train_processed.join(y_train.reset_index(drop=True))
df_test_processed = X_test_processed.join(y_test.reset_index(drop=True))

In [30]:
import joblib

# Persist the fitted ColumnTransformer so the same preprocessing can be reloaded elsewhere.
joblib.dump(value=full_pipeline, filename="../models/data_cleaning_pipeline.pkl")

['../models/data_cleaning_pipeline.pkl']

In [31]:
# Preview the first rows of the processed training set (features plus the mpg target).
df_train_processed.head()

Unnamed: 0,cylinders,weight,acceleration,origin_1.0,origin_2.0,origin_3.0,mpg
0,0.325522,1.107825,1.957373,1.0,0.0,0.0,17.0
1,0.325522,-0.245703,-0.014558,1.0,0.0,0.0,18.0
2,-0.857474,-0.367557,0.702508,0.0,1.0,0.0,25.0
3,1.508519,0.22618,-1.269423,1.0,0.0,0.0,13.0
4,-0.857474,-1.182302,0.523242,0.0,0.0,1.0,32.0


In [32]:
# Give the one-hot origin columns readable region names, using the same mapping
# for the training and the test frame.
df_train_processed, df_test_processed = (
    frame.rename(
        columns={"origin_1.0": "USA", "origin_2.0": "Europe", "origin_3.0": "Asia"}
    )
    for frame in (df_train_processed, df_test_processed)
)

In [33]:
# Export the processed test set. index=True is the pandas default, spelled out
# here because it writes the row index as an extra, unnamed first column.
df_test_processed.to_csv(path_or_buf="../data/test_set_cleaned.csv", index=True)

In [34]:
# Export the processed training set. index=True is the pandas default, spelled
# out here because it writes the row index as an extra, unnamed first column.
df_train_processed.to_csv(path_or_buf="../data/train_set_cleaned.csv", index=True)