In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import joblib
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [7]:
# Load the saved CatBoost model from disk
catboost_model = CatBoostClassifier()
catboost_model.load_model('../model/catboost_model')

# Build the importance table. The prettified result uses the column labels
# 'Feature Id' and 'Importances'; rename them to the 'Feature' / 'Importance'
# labels that the plotting cells expect.
cb_df = pd.DataFrame(
    catboost_model.get_feature_importance(prettified=True)
).rename(columns={'Feature Id': 'Feature', 'Importances': 'Importance'})

# Show user
cb_df

Unnamed: 0,Feature,Importance
0,AIRLINE,20.807725
1,ORIGIN_AIRPORT,19.695691
2,Scheduled Arrival Time,17.331703
3,DAY_OF_WEEK,16.350858
4,DESTINATION_AIRPORT,14.846843
5,DISTANCE,10.401902
6,Scheduled Hour,0.565278
7,MONTH,0.0


In [None]:
# Bar chart of the CatBoost importances, one bar per feature
cb_fig = px.bar(
    data_frame=cb_df,
    x='Feature',
    y='Importance',
    title='CatBoost Model Feature Importance',
)

# Render the figure
cb_fig.show()

In [None]:
# Load the persisted Random Forest model.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
random_forest_model = joblib.load('../model/random_forest_model.joblib')

# Feature names recorded on the fitted model, and their importance values
rfm_feature = random_forest_model.feature_names_in_
rfm_importance = random_forest_model.feature_importances_

# One row per model input column, most important first
rfm_df = pd.DataFrame(
    {'Feature': rfm_feature, 'Importance': rfm_importance}
).sort_values(by='Importance', ascending=False)

# Show user
rfm_df



Trying to unpickle estimator DecisionTreeClassifier from version 1.6.1 when using version 1.7.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator RandomForestClassifier from version 1.6.1 when using version 1.7.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



Unnamed: 0,Feature,Importance
4,Scheduled Arrival Time,0.250286
1,DAY_OF_WEEK,0.228204
2,DISTANCE,0.081183
3,Scheduled Hour,0.016098
632,AIRLINE_DL,0.010110
...,...,...
434,DESTINATION_AIRPORT_GFK,0.000007
13,ORIGIN_AIRPORT_ADK,0.000006
14,ORIGIN_AIRPORT_ADQ,0.000004
325,DESTINATION_AIRPORT_ADK,0.000003


In [None]:
# Bar chart of every Random Forest input column and its importance
rfm_fig = px.bar(
    data_frame=rfm_df,
    x='Feature',
    y='Importance',
    title='Random Forest Model Feature Importance',
)

# Render the figure
rfm_fig.show()

In [None]:
# Number of highest-ranked features to plot
TOP_N = 10

# rfm_df is sorted by importance (descending) when it is built, so head()
# yields the TOP_N most important columns. The title states the cut-off so
# this chart is not mistaken for the all-features chart.
rfm_fig = px.bar(rfm_df.head(TOP_N), x='Feature', y='Importance',
                 title=f'Random Forest Model Feature Importance (Top {TOP_N})')

# Show user
rfm_fig.show()

In [None]:
# Base features whose per-value columns (named '<BASE>_<value>', e.g.
# 'AIRLINE_DL') are folded back into a single entry.
# NOTE(review): these look like one-hot/dummy columns - confirm against the
# training notebook.
CATEGORY_PREFIXES = ('AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT')


def base_feature(feature):
    """Map a Random Forest column name to the feature it should be summed under.

    Args:
        feature: Column name as stored in the 'Feature' column of rfm_df.

    Returns:
        The matching entry of CATEGORY_PREFIXES when the name starts with
        '<prefix>_'; otherwise the name itself, unchanged.
    """
    for prefix in CATEGORY_PREFIXES:
        if feature.startswith(prefix + '_'):
            return prefix
    return feature


# Sum the importances per base feature. sort=False keeps the groups in order
# of first appearance, so the integer index produced by reset_index() matches
# what a row-by-row accumulation would give; the final sort then orders the
# table by importance, highest first.
aggregated = (
    rfm_df
    .groupby(rfm_df['Feature'].map(base_feature), sort=False)['Importance']
    .sum()
    .reset_index()
    .sort_values('Importance', ascending=False)
)

# Show user
aggregated

Unnamed: 0,Feature,Importance
0,Scheduled Arrival Time,0.250286
1,DAY_OF_WEEK,0.228204
5,ORIGIN_AIRPORT,0.188267
6,DESTINATION_AIRPORT,0.176437
2,DISTANCE,0.081183
4,AIRLINE,0.059526
3,Scheduled Hour,0.016098
7,MONTH,0.0


In [None]:
# Bar chart of the Random Forest importances after aggregation
rfm_agg_fig = px.bar(
    data_frame=aggregated,
    x='Feature',
    y='Importance',
    title='Random Forest Model Feature Importance (Aggregated)',
)

# Render the figure
rfm_agg_fig.show()

In [None]:
# Index both importance tables by feature name
aggregated_indexed = aggregated.set_index('Feature')
cb_df_indexed = cb_df.set_index('Feature')

# Outer-align on the feature index so both tables end up with the same
# feature rows in the same order
rf_aligned, cb_aligned = aggregated_indexed.align(
    cb_df_indexed, join='outer', axis=0
)

# Reset the index so 'Feature' is a regular column again
aggregated = rf_aligned.reset_index()
cb_df = cb_aligned.reset_index()

In [None]:
# Make a comparison. The two tables are on different scales in the outputs
# shown in this notebook (Random Forest values are below 1, CatBoost values
# run up to about 21), so each model is drawn against its own y-axis.
compare_fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add the aggregated Random Forest Model on the primary (left) y-axis
compare_fig.add_trace(
    go.Bar(x=aggregated['Feature'], y=aggregated['Importance'],
           opacity = 0.7, name = 'Random Forest Model'),
    secondary_y=False
)

# Add the CatBoost Model on the secondary (right) y-axis
compare_fig.add_trace(
    go.Bar(x=cb_df['Feature'], y=cb_df['Importance'],
           opacity = 0.7, name = 'CatBoost Model'),
    secondary_y=True
)

# Add titles and sort the features by total bar height
compare_fig.update_layout(
    title_text = 'Comparison of Feature Importance Between Random Forest Model and CatBoost Model',
    xaxis = {'categoryorder': 'total descending', 'title': {'text': 'Feature'}}
)

# Label each y-axis with its model; without this the reader cannot tell
# which scale belongs to which set of bars
compare_fig.update_yaxes(title_text='Random Forest Importance', secondary_y=False)
compare_fig.update_yaxes(title_text='CatBoost Importance', secondary_y=True)


# Show user
compare_fig.show()