In [2]:
import sys, os
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from pathlib import Path

from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

from sklearn import metrics
import xgboost as xgb
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Working directory of the running kernel/process, as a plain string
current_working_directory = os.getcwd()

# Same location as a Path object.
# NOTE: despite its name, `script_dir` is the working directory, not the
# folder that contains this file.
script_dir = Path(current_working_directory)

def merge_vehicle_id(df):
    """
    Build a ``vehicle_id`` column as ``<vehicle_type>_<id>`` and drop ``id``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``vehicle_type`` column and an ``id`` column.

    Returns
    -------
    df : DataFrame
        The input frame, now carrying ``vehicle_id`` instead of ``id``.
    """
    id_as_text = df['id'].astype(str)
    df['vehicle_id'] = df['vehicle_type'] + '_' + id_as_text

    # The raw id is now encoded in vehicle_id, so remove it from the caller's frame
    df.drop(columns=['id'], inplace=True)
    return df

def read_and_clean_vehicle_data(clean_data_path):
    """
    Load the three predicted CO2-rating CSV files and harmonise their columns.

    Files read from ``clean_data_path``:
    ``predicted_co2_rating.csv`` (fuel), ``predicted_co2_rating_hybrid.csv``
    (hybrid) and ``predicted_co2_rating_electric.csv`` (electric).

    Parameters
    ----------
    clean_data_path : str or Path
        Folder that contains the three CSV files.

    Returns
    -------
    fuel_df : DataFrame
        Fuel vehicles, with ``original_co2r`` removed and ``fuel_type1``
        renamed to ``fuel_type``.
    hybrid_df : DataFrame
        Hybrid vehicles, with ``co2_rating`` renamed to
        ``predicted_co2_rating`` and ``fuel_type1`` renamed to ``fuel_type``.
    electric_df : DataFrame
        Electric vehicles, renamed the same way as ``hybrid_df``.
    """
    base_dir = Path(clean_data_path)

    # Read all three files before touching any of them
    fuel_df = pd.read_csv(base_dir / "predicted_co2_rating.csv")
    hybrid_df = pd.read_csv(base_dir / "predicted_co2_rating_hybrid.csv")
    electric_df = pd.read_csv(base_dir / "predicted_co2_rating_electric.csv")

    # Only the fuel table carries the original rating; it is not kept
    fuel_df = fuel_df.drop(columns=['original_co2r'])
    fuel_df = fuel_df.rename(columns={'fuel_type1': 'fuel_type'})

    # Hybrid and electric tables use co2_rating; align both names with the fuel table
    aligned_names = {
        'co2_rating': 'predicted_co2_rating',
        'fuel_type1': 'fuel_type',
    }
    hybrid_df = hybrid_df.rename(columns=aligned_names)
    electric_df = electric_df.rename(columns=aligned_names)

    return fuel_df, hybrid_df, electric_df

def concat_vehicle_data(fuel_df, hybrid_df, electric_df):
    """
    Stack the fuel, hybrid and electric frames and put ``vehicle_id`` first.

    Parameters
    ----------
    fuel_df, hybrid_df, electric_df : DataFrame
        Frames to stack row-wise, in this order.

    Returns
    -------
    df : DataFrame
        The stacked rows with a fresh 0..n-1 index and ``vehicle_id`` as the
        first column; the remaining columns keep their relative order.
    """
    stacked = pd.concat([fuel_df, hybrid_df, electric_df], axis=0, ignore_index=True)

    # Pull vehicle_id out of the column list and re-insert it at the front
    ordered_cols = stacked.columns.tolist()
    id_position = ordered_cols.index('vehicle_id')
    ordered_cols.insert(0, ordered_cols.pop(id_position))

    return stacked[ordered_cols]

def prepare_data_feature_importances(df):
    """
    Build the model-ready frame: numeric columns plus one-hot encoded categories.

    Parameters
    ----------
    df : DataFrame
        Frame containing ``vehicle_type`` and ``mapped_fuel_type`` columns
        alongside the numeric columns.

    Returns
    -------
    df_processed : DataFrame
        All numeric columns of ``df`` followed by the dummy columns generated
        from ``vehicle_type`` and ``mapped_fuel_type``. Other non-numeric
        columns are left out.
    """
    # Numeric columns are passed through unchanged
    numeric_names = df.select_dtypes(include=np.number).columns
    numeric_part = df[numeric_names]

    # Only these two categorical columns are one-hot encoded
    encoded_part = pd.get_dummies(df[['vehicle_type', 'mapped_fuel_type']])

    return pd.concat([numeric_part, encoded_part], axis=1)

def generate_feature_importances(df_processed, reports_path, top_n=15):
    """
    Rank the features of ``df_processed`` for predicting ``predicted_co2_rating``.

    Runs recursive feature elimination with cross-validation (RFECV) around a
    decision tree and prints the accuracy of the fitted selector together with
    the features it kept. A decision tree is then fitted on all features, its
    ``top_n`` largest importances are plotted as a horizontal bar chart, and
    the chart is saved as a PNG in ``reports_path``.

    Parameters
    ----------
    df_processed : DataFrame
        Model-ready frame containing the target column
        ``predicted_co2_rating``; every other column is used as a feature.
    reports_path : str or Path
        Folder in which ``feature_importances_predicted_co2_rating.png`` is
        written.
    top_n : int, default 15
        Number of most important features to plot and report.

    Returns
    -------
    top_features : list
        Names of the ``top_n`` most important features, most important first.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    target_col = 'predicted_co2_rating'
    X = df_processed.drop(columns=[target_col])
    y = df_processed[target_col]

    # Decision tree classifier wrapped in cross-validated recursive feature elimination
    dt = DecisionTreeClassifier(random_state=42)
    rfecv = RFECV(estimator=dt, cv=10)
    rfecv.fit(X, y)

    # NOTE: predictions are made on the same rows the selector was fitted on,
    # so this is a training accuracy, not a held-out estimate.
    y_pred = rfecv.predict(X)
    print("Accuracy:", metrics.accuracy_score(y, y_pred))

    # Features retained by the recursive elimination
    print("Selected features:", X.columns[rfecv.support_])

    # Fit the tree itself on all features to read its importances
    # (RFECV fits clones of the estimator, so `dt` still has to be fitted here).
    dt.fit(X, y)
    feat_importances = pd.Series(dt.feature_importances_, index=X.columns)

    # Take exactly the top_n entries. Filtering with `>= min(top_n)` would pull
    # in every feature tied at the cut-off -- e.g. all zero-importance features
    # when fewer than top_n features have a non-zero importance.
    top_importances = feat_importances.nlargest(top_n)

    # Plot on an explicit figure/axes so the chart does not depend on pyplot state
    fig, ax = plt.subplots(figsize=(10, 6))
    top_importances.plot(kind='barh', ax=ax)
    ax.set_title(f'Decision Tree Feature Importances - variable: {target_col}')

    # Save the plot
    fig.savefig(f'{reports_path}/feature_importances_{target_col.split("(")[0]}.png')

    top_features = top_importances.index.tolist()
    print("Top features:", top_features)

    return top_features


if __name__ == "__main__":

    # Put the project's data, scripts and reports folders (relative to the
    # parent of the working directory) on the import path
    for sub_path in (('./data/', './clean-data/'), ('./scripts/',), ('./reports/',)):
        sys.path.append(os.path.abspath(os.path.join('..', *sub_path)))

    # Folder holding the predicted CO2-rating CSV files: <cwd>/../data/predicted-data
    predicted_data_path = os.path.abspath(
        os.path.join(os.getcwd(), "..", 'data', 'predicted-data')
    )

    # Load the fuel, hybrid and electric tables
    fuel_df, hybrid_df, electric_df = read_and_clean_vehicle_data(predicted_data_path)

    # Replace vehicle_type + id with a single vehicle_id column in every table
    fuel_df = merge_vehicle_id(fuel_df)
    hybrid_df = merge_vehicle_id(hybrid_df)
    electric_df = merge_vehicle_id(electric_df)

    # Keep only fuel vehicles from model year 2012 onwards
    fuel_df = fuel_df[fuel_df['model_year'] >= 2012]

    # Stack the three tables into one frame
    df = concat_vehicle_data(fuel_df, hybrid_df, electric_df)

    # Replace every missing value, in any column, with 0
    df = df.fillna(0)

    # Split the column names by dtype and store the non-numeric columns as categoricals
    num_cols = df.select_dtypes(include=np.number).columns
    cat_cols = df.select_dtypes(exclude=np.number).columns
    for col in cat_cols:
        df[col] = df[col].astype('category')


In [8]:
# Replace any NaN left in the numeric columns with 0.
# Relies on `df` and `num_cols` created by an earlier cell, so that cell must
# have been run first.
for col in num_cols:
    df[col] = df[col].fillna(0)