In [1]:
sys.path.append(os.path.abspath(os.path.join('..','./data/', './clean-data/')))
sys.path.append(os.path.abspath(os.path.join('..','./scripts/')))


import pandas as pd
from pathlib import Path
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression

import sys, os

import matplotlib.pyplot as plt

# Accuracy score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

from sklearn.neighbors import KNeighborsRegressor


from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay

from sklearn.metrics import ConfusionMatrixDisplay

In [2]:

def remove_missing_values(fuel_df, drop_smog=True, rating_column='co2_rating', drop_column='smog_rating'):

    # Drop smog_rating from non_na_rating
    if drop_smog:
        fuel_df.drop(columns=[drop_column], inplace=True)
    else:
        pass
    fuel_df['number_of_gears'].fillna(0, inplace=True)

    # Set up data pipeline - goal is to predict co2_rating 
    na_rating = fuel_df[fuel_df[rating_column].isna()]
    non_na_rating = fuel_df[~fuel_df[rating_column].isna()]

    non_na_rating_class = non_na_rating.copy()
    na_rating_class = na_rating.copy()

    non_na_rating_class[rating_column] = non_na_rating_class[rating_column].astype(int)

    return non_na_rating_class, na_rating_class

def read_data(path):
    """
    This function reads data from csv files

    Parameters:
    ----------
        path: str
            path to data files

    Returns:
    -------
        fuel_df: pandas.DataFrame
            dataframe containing fuel cars data
        electric_df: pandas.DataFrame
            dataframe containing electric cars data
        hybrid_df: pandas.DataFrame
            dataframe containing hybrid cars data

    """
    
    # Fuel based cars
    file_name_2022_1995 = "1995_today_vehicle_fuel_consumption.csv"
    
    # Electric cars
    pure_electric = "Batteryelectric_vehicles__.csv"
    hybric_vehicle = "Plugin_hybrid_electric_vehicles__.csv"

    # Read data files
    fuel_df = pd.read_csv(Path(path ,f'{file_name_2022_1995}'))
    electric_df = pd.read_csv(Path(path ,f'{pure_electric}'))
    hybrid_df = pd.read_csv(Path(path ,f'{hybric_vehicle}'))

    return fuel_df, electric_df, hybrid_df



def train_and_evaluate_model(X_train, y_train, X_test, y_test, model_pipeline, model_name):
    
    model_pipeline.fit(X_train, y_train.values.ravel())


    # Predict
    y_pred = model_pipeline.predict(X_test)
    
    # Obtain accuracy score
    acc = accuracy_score(y_test, y_pred)
    print('accuracy is',accuracy_score(y_pred,y_test))
    
    score_train = model_pipeline.score(X_train, y_train)
    score_test = model_pipeline.score(X_test, y_test)
    print('score for training set', score_train, 'score for testing set', score_test)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    print("Balanced accuracy score", balanced_accuracy)
    
    report = classification_report(y_test, y_pred)
    
    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    _ = ax.set_title(
        f"Confusion Matrix for {model_name}"
    )
    
    plt.show()
    
    print(report, sep=',')

def classify_grid_search_cv_tuning(model, parameters, X_train, X_test, y_train, y_test, n_folds = 5, scoring='accuracy'):
    """
    This function tunes GridSearchCV model
    
    Parameters:
    ----------
        model
        parameters
        X_train
        X_test
        y_train
        y_test
        n_folds
        scoring
        
    Returns:
    --------
        best_model
        best_score
    """
    # Set up and fit model
    tune_model = GridSearchCV(model, param_grid=parameters, cv=n_folds, scoring=scoring)
    tune_model.fit(X_train, y_train)
    
    best_model = tune_model.best_estimator_
    best_score = tune_model.best_score_
    y_pred = best_model.predict(X_test)
    
    # Printing results
    print("Best parameters:", tune_model.best_params_)
    print("Cross-validated accuracy score on training data: {:0.4f}".format(tune_model.best_score_))
    print()

    print(classification_report(y_test, y_pred))
    
    return best_model, best_score

In [3]:
fuel_df, electric_df, hybrid_df = read_data('../data/clean-data/')

Hybrid vehicles

In [4]:
non_na_rating_class, na_rating_class = remove_missing_values(hybrid_df)

In [34]:
fuel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26961 entries, 0 to 26960
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   model_year                     26961 non-null  int64  
 1   make_                          26961 non-null  object 
 2   model.1_                       26961 non-null  object 
 3   vehicleclass_                  26961 non-null  object 
 4   enginesize_(l)                 26961 non-null  float64
 5   cylinders_                     26961 non-null  float64
 6   transmission_                  26961 non-null  object 
 7   fuel_type                      26961 non-null  object 
 8   fuelconsumption_city(l/100km)  26961 non-null  float64
 9   fuelconsumption_hwy(l/100km)   26961 non-null  float64
 10  fuelconsumption_comb(l/100km)  26961 non-null  float64
 11  fuelconsumption_comb(mpg)      26961 non-null  int64  
 12  co2emissions_(g/km)            26961 non-null 

In [8]:
# Var list
hybrid_var_list = ['vehicleclass_','make_',
                    'model.1_','model_year',
                    'cylinders_','fuelconsumption_city(l/100km)',
                    'fuelconsumption_hwy(l/100km)',
                    'fuelconsumption_comb(l/100km)',
                    'fuel_type1',
                    'fuel_type2',
                    'co2emissions_(g/km)',
                    'number_of_gears']

# Set up parameters for the model - numerical and categorical
numeric_features =  ['model_year','cylinders_',
                    'fuelconsumption_city(l/100km)',
                    'fuelconsumption_hwy(l/100km)',
                    'fuelconsumption_comb(l/100km)',
                    'co2emissions_(g/km)','number_of_gears']
categorical_features = ['vehicleclass_']

# Set up numerical and categorical transformers
numeric_transformer = Pipeline(
                            steps=[("scaler", StandardScaler())]
                        )

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Set up preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        #("cat", categorical_transformer, categorical_features),
    ]
)


In [31]:
from sklearn.impute import KNNImputer

# Independent variables
X = hybrid_df[numeric_features + ['co2_rating']]

missing_values_index = hybrid_df[hybrid_df.isnull().any(axis=1)].index

imputer = KNNImputer(n_neighbors=1)
X_imputed = imputer.fit_transform(X)

X_imputed[missing_values_index, -1]

hybrid_df.loc[missing_values_index, 'co2_rating'] =  X_imputed[missing_values_index, -1]

# save the imputed data
hybrid_df.to_csv('../data/clean-data/hybrid_imputed.csv', index=False)


In [35]:
hybrid_df

Unnamed: 0,model_year,make_,model.1_,vehicleclass_,motor_(kw),enginesize_(l),cylinders_,transmission_,fuel_type1,consumption_combinedle/100km,...,fuelconsumption_comb(l/100km),range2_(km),co2emissions_(g/km),co2_rating,transmission_type,number_of_gears,mapped_fuel_type,hybrid_fuels,id,vehicle_type
0,2012,chevrolet,volt,compact,111,1.4,4.0,AV,B,2.5 (22.3 kWh/100 km),...,6.4,550,54,10.0,continuously variable,0.0,premium gasoline,electricity,1,hybrid
1,2013,chevrolet,volt,compact,111,1.4,4.0,AV,B,2.4 (21.4 kWh/100 km),...,6.4,550,45,10.0,continuously variable,0.0,premium gasoline,electricity,2,hybrid
2,2013,ford,c-max energi,mid-size,35,2.0,4.0,AV,B/X,2.7 ([23.2 kWh + 0.1 L]/100 km),...,6.1,856,80,10.0,continuously variable,0.0,regular gasoline,electricity & regular gasoline,3,hybrid
3,2013,ford,fusion energi,mid-size,35,2.0,4.0,AV,B/X,2.7 ([23.2 kWh + 0.1 L]/100 km),...,6.1,856,80,10.0,continuously variable,0.0,regular gasoline,electricity & regular gasoline,4,hybrid
4,2013,toyota,prius plug-in hybrid,mid-size,60,1.8,4.0,AV,B/X,2.5 ([18.0 kWh + 0.4 L]/100 km),...,4.7,845,101,10.0,continuously variable,0.0,regular gasoline,electricity & regular gasoline,5,hybrid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,2023,volvo,s60 t8 awd recharge,compact,107,2.0,4.0,AS8,B/Z*,3.0 ([27.2 kWh + 0.0 L]/100 km),...,7.6,792,58,10.0,automatic with select Shift,8.0,premium gasoline,electricity & premium gasoline,240,hybrid
240,2023,volvo,s90 t8 awd recharge,mid-size,107,2.0,4.0,AS8,B/Z*,3.4 ([30.0 kWh + 0.0 L]/100 km),...,8.1,748,65,10.0,automatic with select Shift,8.0,premium gasoline,electricity & premium gasoline,241,hybrid
241,2023,volvo,v60 t8 awd recharge,station wagon - small,107,2.0,4.0,AS8,B/Z*,3.0 ([27.2 kWh + 0.0 L]/100 km),...,7.6,792,58,10.0,automatic with select Shift,8.0,premium gasoline,electricity & premium gasoline,242,hybrid
242,2023,volvo,xc60 t8 awd recharge,suv - small,107,2.0,4.0,AS8,B/Z*,3.5 ([31.2 kWh + 0.0 L]/100 km),...,8.5,838,72,10.0,automatic with select Shift,8.0,premium gasoline,electricity & premium gasoline,243,hybrid


In [32]:
num_e = ['model_year','consumption_city(kwh/100km)',
            'fuelconsumption_hwy(kwh/100km)', 'fuelconsumption_comb(kwh/100km)',
            'fuelconsumption_city(le/100km)', 'fuelconsumption_hwy(le/100km)',
            'fuelconsumption_comb(le/100km)','recharge_time(h)',
            'co2emissions_(g/km)']

X_e = electric_df[num_e + ['co2_rating']]

missing_values_index_e = electric_df[electric_df.isnull().any(axis=1)].index
imputer_e = KNNImputer(n_neighbors=1)
X_imputed_e = imputer.fit_transform(X)

electric_df.loc[missing_values_index_e, 'co2_rating'] =  X_imputed_e[missing_values_index_e,-1]

In [30]:
imputer_e = KNNImputer(n_neighbors=3)
X_imputed_e = imputer.fit_transform(X)

X_imputed_e[missing_values_index_e,-1]

array([10., 10., 10., 10., 10., 10., 10., 10., 10.,  9., 10., 10.,  9.,
       10., 10., 10.,  7.,  8.,  9., 10., 10., 10., 10., 10.,  9.,  8.,
       10., 10., 10., 10., 10., 10.,  9.,  8.,  9.,  8., 10., 10.,  9.,
       10., 10.,  8., 10., 10., 10., 10., 10., 10., 10., 10.,  7.,  9.,
        8., 10.,  8., 10., 10., 10., 10.,  9., 10.])

In [None]:
hybrid_df

plt.hist(x=hybrid_df['co2_rating'])

In [None]:
import seaborn as sns
plt.hist(x=non_na_rating_class['co2_rating'])

In [None]:
from xgboost import XGBClassifier
names = [
    "Random Forest",
]

classifiers = [
    RandomForestClassifier(max_depth=100, n_estimators=10, max_features=1, random_state=42),
]
params={}

for name, clf in zip(names, classifiers):
    print(name)
    
    model = Pipeline(
        steps=[("preprocessor", preprocessor), (name, clf, )] #colsample  by tree, n estimators, max depth
                                                                    )
    train_and_evaluate_model(X_train, y_train, X_test, y_test, model,name)
    classify_grid_search_cv_tuning(model, params, X_train, X_test, y_train, y_test, n_folds=5, scoring='f1_weighted')
    
    print("----------------")

In [None]:
model = Pipeline(
        steps=[("preprocessor", preprocessor), ("XG", XGBClassifier(use_label_encoder=False, eval_metric='logloss'), )] #colsample  by tree, n estimators, max depth
)
params={}
best_xgb, xgb_score = classify_grid_search_cv_tuning(model, params, X_train, X_test, y_train, y_test, n_folds=5, scoring='f1_weighted');
