In [1]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Notebook-wide display settings.
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices raised by the libraries used below.
warnings.simplefilter("ignore")

# White background with grid lines for all seaborn figures.
sns.set_style(style="whitegrid")

This notebook assumes a partially cleaned dataset: rows with missing Model Year values have been removed, the data has been sorted by year, and so on.

In [2]:
# Load the pre-cleaned dataset from the working directory and print a
# structural summary (row count, column names, non-null counts, dtypes).
# NOTE(review): the summary lists an "Unnamed: 0" column, which is presumably a
# row index saved into the CSV by an earlier export -- confirm before treating
# it as data.
df = pd.read_csv("cleaned_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40450 entries, 0 to 40449
Data columns (total 11 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   Unnamed: 0                                         40450 non-null  int64 
 1   Date                                               40450 non-null  int64 
 2   Vehicle Category                                   40450 non-null  object
 3   GVWR Class                                         40450 non-null  object
 4   Fuel Type                                          40450 non-null  object
 5   Model Year                                         40450 non-null  int64 
 6   Fuel Technology                                    40450 non-null  object
 7   Electric Mile Range                                40450 non-null  object
 8   Number of Vehicles Registered at the Same Address  40450 non-null  object
 9   Region           

In [3]:
# Remove the "Region" column from the working frame.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
# errors="ignore" keeps the cell re-runnable: running it a second time (when
# "Region" is already gone) is a no-op instead of a KeyError.
df = df.drop(columns="Region", errors="ignore")

## Data Wrangling

In [4]:
# Label-valued columns that are stored with the pandas `category` dtype.
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]

# Convert all of them in a single astype call (one dtype entry per column).
df = df.astype({name: "category" for name in categorical_cols})

In [5]:
# Display the dtype of every column to confirm the categorical conversion above.
df.dtypes

Unnamed: 0                                              int64
Date                                                    int64
Vehicle Category                                     category
GVWR Class                                             object
Fuel Type                                            category
Model Year                                              int64
Fuel Technology                                      category
Electric Mile Range                                  category
Number of Vehicles Registered at the Same Address      object
Vehicle Population                                      int64
dtype: object

In [6]:
# List the distinct raw labels through repr() so that non-ASCII characters and
# stray whitespace in the strings are visible exactly as stored.
df["Number of Vehicles Registered at the Same Address"].map(repr).unique()


array(["'≥4'", "'1'", "'2'", "'3'", "'Unknown'"], dtype=object)

In [7]:
# NOTE(review): abandoned draft, kept commented out. It could not have worked:
# unicodedata.name("\u03C0") returns the character *name* "GREEK SMALL LETTER PI",
# which never matches the '≥4' label shown in the previous cell's output, so that
# label would have been left unmapped. The next cell performs the mapping with
# the literal "\u22654" instead. Consider deleting this cell.

# import unicodedata

# ordinal_mapping = {'1': int(1), '2': int(2), '3': int(3), unicodedata.name("\u03C0"): int(4), "Unknown": int(-1)}

# df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].astype(str).map(ordinal_mapping)

In [8]:
# Encode the vehicles-per-address labels as ordinal integers; "Unknown" becomes
# the sentinel -1. "\u22654" is the '≥4' label (U+2265 followed by the digit 4).
# The explicit astype pins the column to int64. Without it the integer dtype
# only appears through replace()'s implicit down-casting of object columns,
# which pandas has deprecated, so the column could silently stay `object`.
df["Number of Vehicles Registered at the Same Address"] = (
    df["Number of Vehicles Registered at the Same Address"]
    .replace({'1': 1, '2': 2, '3': 3, "\u22654": 4, "Unknown": -1})
    .astype("int64")
)

# Map the two non-class labels of "GVWR Class" to the sentinel -1; every other
# label is left untouched.
# NOTE(review): the later df.info() output still reports this column as
# `object`, i.e. it now mixes the integer -1 with the remaining labels --
# confirm that this mix is what the downstream encoder should receive.
df["GVWR Class"] = df["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})

In [9]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0                                           0
Date                                                 0
Vehicle Category                                     0
GVWR Class                                           0
Fuel Type                                            0
Model Year                                           0
Fuel Technology                                      0
Electric Mile Range                                  0
Number of Vehicles Registered at the Same Address    0
Vehicle Population                                   0
dtype: int64

In [10]:
# Print the structural summary again to check the dtypes after the wrangling steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40450 entries, 0 to 40449
Data columns (total 10 columns):
 #   Column                                             Non-Null Count  Dtype   
---  ------                                             --------------  -----   
 0   Unnamed: 0                                         40450 non-null  int64   
 1   Date                                               40450 non-null  int64   
 2   Vehicle Category                                   40450 non-null  category
 3   GVWR Class                                         40450 non-null  object  
 4   Fuel Type                                          40450 non-null  category
 5   Model Year                                         40450 non-null  int64   
 6   Fuel Technology                                    40450 non-null  category
 7   Electric Mile Range                                40450 non-null  category
 8   Number of Vehicles Registered at the Same Address  40450 non-null  int64   


In [14]:
from sklearn.model_selection import train_test_split

# Separate the features from the target.
# "Unnamed: 0" appears to be a row index that was saved into the CSV (it is the
# first, header-less column of the file). It is a row identifier rather than a
# property of the vehicles, so it is kept out of the feature matrix.
# errors="ignore" applies only to that artifact column, so this cell still
# works if the column has already been removed.
X = df.drop(columns="Vehicle Population").drop(columns="Unnamed: 0", errors="ignore")
y = df["Vehicle Population"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a training pipeline

In [17]:
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier

In [19]:
# Pipeline requires `steps` to be a list of (name, estimator) tuples. A flat
# list that alternates names and objects cannot be unpacked into steps.
# The target "Vehicle Population" is a numeric count (int64), so the final step
# is a regressor rather than a classifier. The step name 'clf' is kept so that
# any code addressing the step by name (e.g. `clf__<param>`) is unaffected.
estimators = [
    ('encoder', TargetEncoder()),
    ('clf', XGBRegressor(random_state=42)),
]

pipe = Pipeline(steps=estimators)