In [None]:
%run "Supervised_Learning.ipynb"

Prepare the data for the robustness checks: a random sample of banks plus the 3 pre-designated banks.

In [None]:
# Render every column whenever a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Feather file that holds the sample-bank data
file_path = 'df_samples.feather'

# Load the Feather file and label every row with the 'intermediate' category
df_samples = pd.DataFrame(feather.read_feather(file_path)).assign(category='intermediate')

df_samples



Though there was very little difference between the models, Random Forest Regression was ultimately found to be the best model. Using the previously preprocessed data, we will test the robustness of the model against the banks we set aside earlier.

Run each specific bank through the model.

In [None]:
# Keep only the identifier, time, target and macro-indicator columns
# that the robustness check uses.
df_samples = df_samples[[
    'name',
    'category',
    'date',
    'year',
    'quarter',
    'total_deposits',
    'cpi',
    'fed_fund_rate',
    'housing_starts',
    'personal_savings_rate',
    'umich_consumer_sentiment_index',
]]
df_samples

In [None]:
# Add a one-period lag of total_deposits, computed separately for each bank.
#
# GroupBy.shift is used instead of groupby.apply(lambda ...): apply's handling
# of the grouping column is deprecated in pandas 2.2+, and once the grouping
# column is excluded from the applied frame the later reset_index(drop=True)
# would discard 'name' altogether.
#
# The stable sort on 'name' reproduces the row order groupby.apply produced
# (banks in sorted order, original row order kept within each bank).
# NOTE(review): shift(1) takes the previous *row* of each bank, so this assumes
# the rows of every bank are already in chronological order - confirm.
lagged_df_samples = (
    df_samples
    .sort_values('name', kind='stable')
    .assign(
        total_deposits_lag1=lambda frame: frame.groupby('name')['total_deposits'].shift(1)
    )
    # Drops every row containing a NaN in any column; this includes the first
    # row of each bank, which has no previous value to lag from.
    .dropna()
    .reset_index(drop=True)
)

# Display the lagged DataFrame
lagged_df_samples

In [None]:
# Macroeconomic indicator columns passed to the models as predictors
selected_features = [
    'cpi',
    'fed_fund_rate',
    'housing_starts',
    'personal_savings_rate',
    'umich_consumer_sentiment_index',
]

Consider Testing on Multiple Models for Robustness - each model is similar 

In [None]:
# Robustness test on the Random Forest models: each category's fitted model is
# scored on the sample-bank rows that belong to that category.
# Result: {category: {'rmse': ..., 'r2': ...}}
random_forest = {}

for category, info in best_models_random_forest.items():
    model = info['model']
    print(info)

    # Restrict the sample data to the rows of the current category
    category_rows = lagged_df_samples[lagged_df_samples['category'] == category]
    test_X = category_rows[selected_features]
    test_y = category_rows['total_deposits']

    if test_X.empty:
        # No sample rows for this category. Record NaN explicitly instead of
        # falling through and storing the previous category's metrics (or
        # raising NameError when this happens on the first iteration).
        random_forest[category] = {'rmse': float('nan'), 'r2': float('nan')}
        continue

    predictions = model.predict(test_X)

    # RMSE as the square root of the MSE; this avoids the `squared=False`
    # keyword, which is deprecated in recent scikit-learn releases.
    rmse = mean_squared_error(test_y, predictions) ** 0.5
    r2 = r2_score(test_y, predictions)

    random_forest[category] = {'rmse': rmse, 'r2': r2}

random_forest


In [None]:
# Grouped bar chart of RMSE per category for each regression method.
# NOTE(review): `lagged_df` is not defined in the cells shown here; presumably
# it comes from the notebook executed via %run - confirm that its categories
# match the keys of `random_forest`.
categories = lagged_df['category'].unique()

# One list of RMSE values (ordered like `categories`) per method.
rmse_values = {
    # 'Lasso': [lasso[cat]['rmse'] for cat in categories],
    # 'Ridge': [ridge[cat]['rmse'] for cat in categories],
    'Random Forest': [random_forest[cat]['rmse'] for cat in categories],
    # 'Gradient Boosting': [gradient_boosting[cat]['rmse'] for cat in categories],
}

# Create bar chart
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.2
index = range(len(categories))

# Bar i of every group is centred at x + i * bar_width
for i, (method, rmse) in enumerate(rmse_values.items()):
    ax.bar([x + i * bar_width for x in index], rmse, bar_width, label=method)

ax.set_xlabel('Category')
ax.set_ylabel('Root Mean Squared Error (RMSE)')
ax.set_title('RMSE by Regression Method and Category on Sample Data for Robustness Check')

# Put each tick at the centre of its group of bars. With a single method the
# offset is 0, so the label sits directly under the bar.
group_center_offset = bar_width * (len(rmse_values) - 1) / 2
ax.set_xticks([x + group_center_offset for x in index])
ax.set_xticklabels(categories)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Display the per-category Random Forest entries; the robustness loop above
# reads each entry's 'model' key.
best_models_random_forest

Conclusion: TODO - to be written once the results have been analyzed.

Sensitivity Analysis - Done on Random Forest Regression 

Parameters adjusted will be n_estimators and/or max_depth

In [None]:
# Predictors and target for the sensitivity analysis
X, y = df[selected_features], df['total_deposits']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameter values explored in the sensitivity analysis
n_estimators_range = [10, 50, 100, 150, 200]
max_depth_range = [None, 10, 20, 30]

# Value each hyper-parameter is held at while the other one is varied
BASELINE_N_ESTIMATORS = 100
BASELINE_MAX_DEPTH = 10


def _score_forest(n_estimators, max_depth):
    """Fit a random forest on the training split and score it on the test split.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest.
    max_depth : int or None
        Maximum tree depth passed to RandomForestRegressor.

    Returns
    -------
    dict
        One result row with the keys 'n_estimators', 'max_depth', 'RMSE', 'R2'.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        # Square root of the MSE; avoids the `squared=False` keyword, which is
        # deprecated in recent scikit-learn releases.
        'RMSE': mean_squared_error(y_test, predictions) ** 0.5,
        'R2': r2_score(y_test, predictions),
    }


# Sweep n_estimators with max_depth held constant ...
results = [_score_forest(n_estimators, BASELINE_MAX_DEPTH) for n_estimators in n_estimators_range]

# ... then sweep max_depth with n_estimators held constant
results += [_score_forest(BASELINE_N_ESTIMATORS, max_depth) for max_depth in max_depth_range]

results_df = pd.DataFrame(results)


In [None]:
# results_df holds both sweeps. Keep only the runs at the fixed depth of 10 so
# that the point at n_estimators=100 is not averaged together with the runs
# from the max_depth sweep (which all use n_estimators=100).
n_estimators_sweep = results_df[results_df['max_depth'] == 10]

sns.lineplot(data=n_estimators_sweep, x='n_estimators', y='RMSE', marker='o', label='RMSE vs. n_estimators')
plt.title("Change in RMSE based on Adjustments to n_estimators")
plt.legend()
plt.show()

Adding more trees (n_estimators) is beneficial up to ~100; beyond that there is little to no further improvement.

In [None]:
# results_df holds both sweeps. Keep only the runs with n_estimators fixed at
# 100 so that the point at max_depth=10 is not averaged together with the runs
# from the n_estimators sweep (which all use max_depth=10).
# NOTE(review): the max_depth=None run is stored as NaN in results_df and so
# does not appear on this numeric axis - confirm whether it should be shown.
max_depth_sweep = results_df[results_df['n_estimators'] == 100]

sns.lineplot(data=max_depth_sweep, x='max_depth', y='RMSE', marker='o', label='RMSE vs. max_depth')
plt.title("Change in RMSE based on Adjustments to max_depth")
plt.legend()
plt.show()

RMSE is no longer sensitive to changes once max_depth > 20.