In [None]:
import pandas as pd 
import general_utils
import holidays
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np


%load_ext autoreload
%autoreload 2

In [None]:
# --- Input locations -----------------------------------------------------
# NOTE(review): absolute, machine-specific path — consider making it configurable.
directory = 'C:/Users/maran/OneDrive/Documents/Git Profile/Data-Projects/CAVU/raw_data'
bookings_sample_file = 'bookings_sample (1).csv'
flights_sample_file = 'flights_sample.csv'
bookings_sample_path = directory + '/' + bookings_sample_file
flights_sample_path = directory + '/' + flights_sample_file

# --- Cache location for the exploded bookings frame ----------------------
final_bookings_file = 'final_bookings_df.csv'
csv_directory = 'temp_csv'
os.makedirs(csv_directory, exist_ok=True)
# Join with a path separator so the cache file lands INSIDE csv_directory.
# Plain string concatenation produced 'temp_csvfinal_bookings_df.csv' next to it.
path_to_final_bookings = os.path.join(csv_directory, final_bookings_file)

# One indicator column per car park in the bookings extract.
car_park_columns = ['carpark1', 'carpark2', 'carpark3', 'carpark4', 'carpark5', 'carpark6', 'carpark7']

# Load and clean data

In [None]:
# Read the raw bookings and flights extracts (bookings first, then flights).
raw_bookings_df, raw_flights_df = (
    pd.read_csv(csv_path)
    for csv_path in (bookings_sample_path, flights_sample_path)
)

## Flights data prep

In [None]:
# Work on an independent copy so the raw flights frame stays unmodified
# (DataFrame.copy() is deep by default).
flights_df = raw_flights_df.copy()
flights_df.head(5)

In [None]:
# Number of missing values per flights column.
flights_df.isnull().sum()

In [None]:
# Distinct continent labels present in the flights data.
flights_df.loc[:, 'destination_airport_name_continent'].unique()

In [None]:
# Rows with an unresolved continent. This is not the cell's last expression,
# so the notebook does not render it.
flights_df.loc[flights_df['destination_airport_name_continent'].eq('Unknown Continent')]

# Upper-case the flight number (the bookings side is upper-cased the same way before joining).
flights_df['flight_number_full_icao'] = flights_df['flight_number_full_icao'].str.upper()

# Boolean destination flags.
flights_df['is_international'] = flights_df['destination_airport_dom_intern'].eq('International')
flights_df['is_europe'] = flights_df['destination_airport_name_continent'].eq('Europe')
flights_df['is_asia'] = flights_df['destination_airport_name_continent'].eq('Asia')

# Destinations whose airport name contains 'Manohar' are forced to Asia.
manohar_rows = flights_df['destination_airport_name'].str.contains('Manohar', na=False)
flights_df.loc[manohar_rows, 'is_asia'] = True

flights_df['is_north_america'] = flights_df['destination_airport_name_continent'].eq('North America')
flights_df['is_africa'] = flights_df['destination_airport_name_continent'].eq('Africa')

flights_df.shape

In [None]:
# Date window covered by the flights extract; later cells use these bounds
# to filter bookings and the merged frame.
flight_min_date = flights_df.loc[:, 'flight_date'].min()
flight_max_date = flights_df.loc[:, 'flight_date'].max()

print(f"Flight Date Range: {flight_min_date} to {flight_max_date}")

## Bookings data prep

In [None]:
# Row count of the untouched bookings extract.
raw_booking_records = len(raw_bookings_df)
raw_booking_records

In [None]:
# Number of missing values per bookings column.
raw_bookings_df.isnull().sum()

In [None]:
# (min, max) of started_at, created_at and closed_at, flattened into one 6-tuple.
tuple(
    getattr(raw_bookings_df[ts_col], extreme)()
    for ts_col in ('started_at', 'created_at', 'closed_at')
    for extreme in ('min', 'max')
)

In [None]:
# Shape of the rows sharing one specific creation timestamp (not rendered: only
# the last expression of a cell is displayed).
raw_bookings_df.loc[raw_bookings_df['created_at'].eq('2018-09-09 15:50:21.487')].shape
# First two rows that repeat an earlier (created_at, amount) pair.
raw_bookings_df.loc[raw_bookings_df.duplicated(subset=['created_at', 'amount'])].head(2)

In [None]:
# Rows that repeat an earlier (created_at, outbound_flight_no) pair.
raw_bookings_df.loc[raw_bookings_df.duplicated(subset=['created_at', 'outbound_flight_no'])]

In [None]:
# Work on an independent copy so the raw bookings frame stays unmodified
# (DataFrame.copy() is deep by default).
bookings_df = raw_bookings_df.copy()
bookings_df.head(5)

In [None]:
# Frequency of each booking status.
bookings_df.loc[:, 'booking_status'].value_counts()

In [None]:
# Keep only confirmed bookings created inside the window covered by the flights data.
confirmed_booking = bookings_df['booking_status'] == 'Confirmed'

# NOTE(review): `created_at` is compared as loaded from the CSV (no datetime
# conversion has happened yet) against the flight date bounds — confirm this
# ordering is the intended one, in particular for bookings created on the last day.
same_time_period = bookings_df['created_at'].between(flight_min_date, flight_max_date, inclusive='both')

# Assign the filled mask back to the frame. The previous
# `bookings_df['to_keep'].fillna(False, inplace=True)` was a chained in-place call on
# a column selection, which is deprecated and does not update the frame under
# copy-on-write.
bookings_df['to_keep'] = (confirmed_booking & same_time_period).fillna(False)

# The helper flag is only needed on bookings_df; drop it from the filtered frame.
filtered_bookings_df = bookings_df[bookings_df['to_keep']].drop(columns=['to_keep'])

filtered_bookings_df.head()

In [None]:
# Shapes of the rows whose started_at / closed_at text does NOT end with '+00'.
tuple(
    filtered_bookings_df[~filtered_bookings_df[ts_col].astype(str).str.endswith('+00')].shape
    for ts_col in ('started_at', 'closed_at')
)

In [None]:
# Rewrite the three timestamp columns as 'YYYY-MM-DD HH:MM:SS' text (no offset in the
# output format). errors='coerce' turns unparseable values into missing values.
filtered_bookings_df['created_at'] = (
    pd.to_datetime(filtered_bookings_df['created_at'], errors='coerce')
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)
for offset_col in ('started_at', 'closed_at'):
    # These two columns are parsed with an explicit format that includes a UTC offset (%z).
    filtered_bookings_df[offset_col] = (
        pd.to_datetime(filtered_bookings_df[offset_col], format='%Y-%m-%d %H:%M:%S%z', errors='coerce')
        .dt.strftime('%Y-%m-%d %H:%M:%S')
    )

filtered_bookings_df['is_discount_booking'] = filtered_bookings_df['is_discount_booking'].astype(bool)
filtered_bookings_df['booking_status'] = filtered_bookings_df['booking_status'].astype('category')

# Calendar day of the parking start, as text; used as the flight-date join key.
filtered_bookings_df['flight_date'] = (
    pd.to_datetime(filtered_bookings_df['started_at'], errors='coerce')
    .dt.strftime('%Y-%m-%d')
)

# Datetime-typed twins of the text columns, for arithmetic.
for ts_col in ('created_at', 'started_at', 'closed_at'):
    filtered_bookings_df[f'{ts_col}_dt'] = pd.to_datetime(filtered_bookings_df[ts_col], errors='coerce')

# Whole days between start and close, and between booking creation and start.
filtered_bookings_df['parking_duration_days'] = (
    filtered_bookings_df['closed_at_dt'].sub(filtered_bookings_df['started_at_dt']).dt.days
)
filtered_bookings_df['lead_time_days'] = (
    filtered_bookings_df['started_at_dt'].sub(filtered_bookings_df['created_at_dt']).dt.days
)

# Upper-cased outbound flight number, named like the flights-side column for merging.
filtered_bookings_df['flight_number_full_icao'] = filtered_bookings_df['outbound_flight_no'].str.upper()

filtered_bookings_df.head()


In [None]:
# --- Missing-terminal diagnostics on the unfiltered bookings -------------
terminal_missing_mask = bookings_df['terminal'].isnull()
keep_mask = bookings_df['to_keep']
missing_and_kept = int((terminal_missing_mask & keep_mask).sum())

nan_terminal_records_bookings = int(terminal_missing_mask.sum())
total_raw_booking_records_bookings = len(bookings_df)
nan_terminal_percentage_bookings = (nan_terminal_records_bookings / total_raw_booking_records_bookings) * 100
total_records_to_keep_bookings = int(keep_mask.sum())

print("bookings_df:")
print(f"  Total records: {total_raw_booking_records_bookings}")
print(f"  Records to keep: {total_records_to_keep_bookings}")
print(f"  NaN 'terminal': {nan_terminal_records_bookings}")
print(f"  % NaN 'terminal': {nan_terminal_percentage_bookings:.2f}%")
print(f"  NaN 'terminal' & to_keep: {missing_and_kept}")
print(f"  % NaN 'terminal' & to_keep: {(missing_and_kept / total_records_to_keep_bookings) * 100:.2f}%")

# --- Same diagnostics on the filtered bookings ---------------------------
# Guarded: reports 0 if the column is absent, and 0% if the frame is empty.
if 'terminal' in filtered_bookings_df.columns:
    nan_terminal_records_filtered = int(filtered_bookings_df['terminal'].isnull().sum())
else:
    nan_terminal_records_filtered = 0
total_raw_booking_records_filtered = len(filtered_bookings_df)
if total_raw_booking_records_filtered > 0:
    nan_terminal_percentage_filtered = (nan_terminal_records_filtered / total_raw_booking_records_filtered) * 100
else:
    nan_terminal_percentage_filtered = 0

print("\nfiltered_bookings_df:")
print(f"  Total records: {total_raw_booking_records_filtered}")
print(f"  NaN 'terminal': {nan_terminal_records_filtered}")
print(f"  % NaN 'terminal': {nan_terminal_percentage_filtered:.2f}%")

In [None]:
# Split bookings by whether the terminal is already known.
terminal_is_missing = filtered_bookings_df['terminal'].isna()
has_terminal_bookings_df = filtered_bookings_df[~terminal_is_missing]
missing_terminal_bookings_df = filtered_bookings_df[terminal_is_missing]

# Look the missing terminal up in the flights data via (flight number, flight date).
required_flight_cols = ['flight_number_full_icao', 'terminal_name', 'flight_date']
terminal_join_keys = ['flight_number_full_icao', 'flight_date']

# Keep one flights row per join key. Without this, a flight/date that appears more
# than once in flights_df would duplicate the matching bookings in the left merge
# and inflate every downstream occupancy count.
# NOTE(review): if the same flight/date is listed under different terminals, the
# first occurrence wins — confirm that is acceptable.
terminal_lookup_df = flights_df[required_flight_cols].drop_duplicates(subset=terminal_join_keys)

recovered_terminal_flights_df = missing_terminal_bookings_df.merge(
    terminal_lookup_df,
    on=terminal_join_keys,
    how='left',
    validate='many_to_one',  # fail loudly if the lookup is ever non-unique again
)

# Replace the all-NaN booking terminal with the terminal found in the flights data.
recovered_terminal_flights_df = (
    recovered_terminal_flights_df
    .drop(columns=['terminal'])
    .rename(columns={'terminal_name': 'terminal'})
)

bookings_validated_df = pd.concat([has_terminal_bookings_df, recovered_terminal_flights_df], ignore_index=True)

# bookings_validated_df['terminal'] = bookings_validated_df['terminal'].fillna('Unknown')

bookings_validated_df.shape, bookings_df.shape

In [None]:
# Reshape the per-car-park indicator columns to long form:
# one row per (booking, car park) with the indicator value in `is_carpark`.
final_bookings_df = bookings_validated_df.melt(
    id_vars=[c for c in bookings_validated_df.columns if c not in car_park_columns],
    value_vars=car_park_columns,
    var_name="carpark_name",
    value_name="is_carpark"
)

# Keep only the rows whose indicator is 1, then drop the now-constant indicator.
# The drop must be applied to the filtered FRAME: the previous code called
# `.drop(columns=...)` on the boolean mask Series, which is a no-op, so the
# `is_carpark` column was never removed.
final_bookings_df = (
    final_bookings_df[final_bookings_df['is_carpark'] == 1]
    .drop(columns=['is_carpark'])
    .reset_index(drop=True)
)
final_bookings_df.head()

# Aggregate for features

In [None]:
# Truncate start/close timestamps to midnight so each booking maps to whole days.
starts = final_bookings_df['started_at_dt'].dt.normalize()
ends = final_bookings_df['closed_at_dt'].dt.normalize()

# One daily DatetimeIndex per booking, covering start day through close day inclusive.
final_bookings_df['occupancy_days'] = [
    pd.date_range(start=first_day, end=last_day, freq='D')
    for first_day, last_day in zip(starts, ends)
]

# Reuse the CSV cache only when the file exists AND the exploded frame has not been
# built in this kernel session yet; otherwise rebuild it and overwrite the file.
cache_usable = os.path.exists(path_to_final_bookings) and 'final_bookings_exploded_df' not in globals()
if not cache_usable:
    print("Generating final_bookings_exploded_df and saving to CSV...")
    # explode() yields one row per (booking, occupied day).
    final_bookings_exploded_df = (
        final_bookings_df
        .explode('occupancy_days')
        .rename(columns={'occupancy_days': 'date_occupied'})
    )
    final_bookings_exploded_df.to_csv(path_to_final_bookings, index=False)
else:
    print("Loading final_bookings_df from CSV...")
    final_bookings_exploded_df = pd.read_csv(
        path_to_final_bookings,
        parse_dates=['started_at_dt', 'closed_at_dt', 'date_occupied'],
    )

# bookings_wrong_df = general_utils.expand_booking_dates_fast(final_bookings_df)


In [None]:
# Display the unfiltered bookings frame (it still carries the `to_keep` helper column).
# NOTE(review): this renders the whole frame; consider `.head()` to keep the output small.
bookings_df

In [None]:
# Daily features per (occupied date, terminal, car park).
# `occupancy` counts non-null date_occupied values; `bookings_count` counts rows.
# NOTE(review): `cancel_ratio` compares against the literal 'canceled'; if only
# confirmed bookings reach this frame it will always be 0 — confirm.
daily_bookings_df = (
    final_bookings_exploded_df
    .groupby(['date_occupied', 'terminal', 'carpark_name'])
    .agg(
        occupancy=('date_occupied', 'count'),
        total_revenue=('amount', 'sum'),
        bookings_count=('amount', 'size'),
        avg_duration_days=('parking_duration_days', 'mean'),
        min_duration_days=('parking_duration_days', 'min'),
        avg_lead_time_days=('lead_time_days', 'mean'),
        min_lead_time_days=('lead_time_days', 'min'),
        discounted_bookings=('is_discount_booking', 'sum'),
        discount_ratio=('is_discount_booking', 'mean'),
        cancel_ratio=('booking_status', lambda status: (status == 'canceled').mean()),
    )
    .reset_index()
)

# Calendar features derived from the occupied date (dayofweek: Monday=0 ... Sunday=6).
daily_bookings_df = daily_bookings_df.assign(
    dow=lambda frame: pd.to_datetime(frame['date_occupied']).dt.dayofweek,
    month=lambda frame: pd.to_datetime(frame['date_occupied']).dt.month,
    is_weekend=lambda frame: frame['dow'].isin([5, 6]).astype(int),
)

daily_bookings_df

In [None]:
# Daily flight features per (flight date, terminal).
# Each *_pax lambda sums the group's passengers restricted to rows whose flag is set;
# the flag is looked up in flights_df through the group's row index.
flights_agg_df = (
    flights_df
    .groupby(['flight_date', 'terminal_name'])
    .agg(
        total_flights=('flight_number_full_icao', 'nunique'),
        total_passengers=('pax_quantity', 'sum'),
        intl_flights=('is_international', 'sum'),
        intl_pax=('pax_quantity', lambda pax: pax[flights_df.loc[pax.index, 'is_international'] == 1].sum()),
        europe_pax=('pax_quantity', lambda pax: pax[flights_df.loc[pax.index, 'is_europe'] == True].sum()),
        asia_pax=('pax_quantity', lambda pax: pax[flights_df.loc[pax.index, 'is_asia'] == True].sum()),
        na_pax=('pax_quantity', lambda pax: pax[flights_df.loc[pax.index, 'is_north_america'] == True].sum()),
        africa_pax=('pax_quantity', lambda pax: pax[flights_df.loc[pax.index, 'is_africa'] == True].sum()),
    )
    .reset_index()
)

flights_agg_df['flight_date'] = pd.to_datetime(flights_agg_df['flight_date'])

# Passenger share of each destination group relative to the day's total passengers.
for pax_group in ('intl', 'europe', 'asia', 'na', 'africa'):
    flights_agg_df[f'{pax_group}_share'] = (
        flights_agg_df[f'{pax_group}_pax'] / flights_agg_df['total_passengers']
    )

# 1 when the flight date is a UK holiday within 2017-2025 (range end is exclusive).
uk_holidays = holidays.UnitedKingdom(years=range(2017, 2026))
flights_agg_df['holiday_flag'] = (
    flights_agg_df['flight_date'].dt.date.astype('datetime64[ns]').isin(uk_holidays).astype(int)
)

# ISO calendar week number of the flight date.
flights_agg_df['weekofyear'] = flights_agg_df['flight_date'].dt.isocalendar().week

flights_agg_df

In [None]:
# (min, max) flight date of the aggregated frame, then of the row-level frame.
tuple(
    getattr(frame['flight_date'], extreme)()
    for frame in (flights_agg_df, flights_df)
    for extreme in ('min', 'max')
)

In [None]:
# Columns carried into the modelling frame (the raw *_pax totals are left out).
relevant_columns = [
    # booking-side keys and features
    'date_occupied', 'terminal', 'carpark_name', 'occupancy', 'total_revenue', 'bookings_count',
    'avg_duration_days', 'min_duration_days', 'avg_lead_time_days', 'min_lead_time_days',
    'discounted_bookings', 'discount_ratio', 'cancel_ratio', 'dow', 'month', 'is_weekend',
    # flight-side keys and features
    'flight_date', 'terminal_name', 'total_flights', 'total_passengers', 'intl_flights',
    'intl_share', 'europe_share', 'asia_share', 'na_share', 'africa_share',
    'holiday_flag', 'weekofyear',
]

# Left join: every daily booking row is kept; flight columns are NaN where no
# flights row matches the same date and terminal.
merged_df = pd.merge(
    daily_bookings_df,
    flights_agg_df,
    how='left',
    left_on=['date_occupied', 'terminal'],
    right_on=['flight_date', 'terminal_name'],
)

# Restrict to the date window covered by the flights data.
filtered_dates_rows = merged_df['date_occupied'].between(flight_min_date, flight_max_date, inclusive='both')
master_df = merged_df.loc[filtered_dates_rows, relevant_columns]

master_df.shape, merged_df.shape

In [None]:
# Roll the daily frame up to one row per (week of year, terminal, car park).
# Totals are summed, ratios/averages are averaged, minima stay minima.
# NOTE(review): `dow`, `month`, `is_weekend`, `flight_date` and `terminal_name` take
# the first value seen in each week — confirm that is meaningful at weekly grain.
weekly_df = (
    master_df
    .groupby(['weekofyear', 'terminal', 'carpark_name'])
    .agg(
        occupancy=('occupancy', 'sum'),
        total_revenue=('total_revenue', 'sum'),
        bookings_count=('bookings_count', 'sum'),
        avg_duration_days=('avg_duration_days', 'mean'),
        min_duration_days=('min_duration_days', 'min'),
        avg_lead_time_days=('avg_lead_time_days', 'mean'),
        min_lead_time_days=('min_lead_time_days', 'min'),
        discounted_bookings=('discounted_bookings', 'sum'),
        discount_ratio=('discount_ratio', 'mean'),
        cancel_ratio=('cancel_ratio', 'mean'),
        dow=('dow', 'first'),
        month=('month', 'first'),
        is_weekend=('is_weekend', 'first'),
        total_flights=('total_flights', 'sum'),
        total_passengers=('total_passengers', 'sum'),
        intl_flights=('intl_flights', 'sum'),
        intl_share=('intl_share', 'mean'),
        europe_share=('europe_share', 'mean'),
        asia_share=('asia_share', 'mean'),
        na_share=('na_share', 'mean'),
        africa_share=('africa_share', 'mean'),
        holiday_flag=('holiday_flag', 'max'),
        flight_date=('flight_date', 'first'),
        terminal_name=('terminal_name', 'first'),
    )
    .reset_index()
)

weekly_df.head()

# Explore Data

In [None]:
# List the columns available in the daily modelling frame.
master_df.columns

In [None]:
# Row-level flights inside the flight date window, ordered by date and terminal,
# with the ISO week number attached for weekly plots.
in_flight_window = flights_df['flight_date'].between(flight_min_date, flight_max_date, inclusive='both')
sliced_flights_df = (
    flights_df[in_flight_window]
    .sort_values(by=['flight_date', 'terminal_name'])
    .assign(weekofyear=lambda frame: pd.to_datetime(frame['flight_date']).dt.isocalendar().week)
)
sliced_flights_df

In [None]:
# Weekly passenger totals: one row per week, one column (= one line) per terminal.
weekly_pax_by_terminal = (
    sliced_flights_df
    .groupby(['weekofyear', 'terminal_name'])['pax_quantity']
    .sum()
    .unstack()
)
ax = weekly_pax_by_terminal.plot(
    kind='line', title='Weekly Passenger Count Over Time', figsize=(12, 6)
)
ax.set_ylim(bottom=0)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{int(x):,}"))
ax.grid(True, axis='y', linestyle='--', alpha=0.5)

# Draw the second line (when there is one) dashed so the series are easy to tell apart.
lines = ax.get_lines()
if len(lines) > 1:
    lines[1].set_linestyle('--')

# Dotted vertical markers at weeks 13/26/39/52, labelled Q1..Q4 at the top of the axes.
quarter_marks = {13: 'Q1', 26: 'Q2', 39: 'Q3', 52: 'Q4'}
for week_no, quarter_label in quarter_marks.items():
    ax.axvline(x=week_no, color='gray', linestyle=':', linewidth=1)
    ax.text(week_no, ax.get_ylim()[1], quarter_label, color='gray', ha='center', va='top', fontsize=10)

# Axis labels only need to be set once.
ax.set_ylabel('Outbound Passengers')
ax.set_xlabel('Week of Year')

plt.tight_layout()
plt.show()

In [None]:
# One line per terminal: mean daily occupancy within each week of the year.
plt.figure(figsize=(14, 6))
for terminal in master_df['terminal'].unique():
    weekly_mean = (
        master_df.loc[master_df['terminal'] == terminal]
        .groupby('weekofyear')['occupancy']
        .mean()
    )
    plt.plot(weekly_mean.index, weekly_mean.values, label=terminal)

plt.xlabel('Week of Year')
plt.ylabel('Average Occupancy')
plt.title('Average Occupancy per Terminal per Week - 2023')
plt.legend()
plt.tight_layout()
plt.grid(True, axis='y', linestyle='--', alpha=0.5)
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{int(x):,}"))

# Dotted vertical markers at weeks 13/26/39/52, labelled Q1..Q4 at the top of the axes.
for quarter_no, week_no in enumerate([13, 26, 39, 52], start=1):
    plt.axvline(x=week_no, color='gray', linestyle=':', linewidth=1)
    plt.text(week_no, plt.gca().get_ylim()[1], f'Q{quarter_no}', color='gray', ha='center', va='top', fontsize=10)

plt.show()

In [None]:
# Mean occupancy per (terminal, car park), one column per week; missing weeks become 0.
ave_occupancy_per_carpark = (
    master_df
    .groupby(['terminal', 'carpark_name', 'weekofyear'])['occupancy']
    .mean()
    .unstack()
    .fillna(0)
)

# Not rendered: only the last expression of a cell is displayed.
ave_occupancy_per_carpark

# One figure per terminal, one line per car park: occupancy by occupied date.
for terminal in master_df['terminal'].unique():
    terminal_rows = master_df.loc[master_df['terminal'] == terminal]
    plt.figure(figsize=(12, 6))
    for carpark in terminal_rows['carpark_name'].unique():
        daily_occupancy = (
            terminal_rows.loc[terminal_rows['carpark_name'] == carpark]
            .groupby('date_occupied')['occupancy']
            .mean()
        )
        plt.plot(daily_occupancy.index, daily_occupancy.values, label=f"{carpark}")
    plt.xlabel('Date')
    plt.ylabel('Occupancy')
    plt.title(f'Occupancy Over Time - Terminal {terminal}')
    plt.legend()
    plt.grid(True)
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{int(x):,}"))
    plt.show()

# Train models

In [None]:
# Model on a copy of the WEEKLY frame so experiments never modify weekly_df.
trial_df = weekly_df.copy(deep=True)

# Order rows by week (then terminal and car park) so the positional split below
# trains on the earlier weeks and tests on the later ones.
# The earlier `master_df.sort_values('date_occupied')` assignment was a dead store,
# overwritten immediately, and wrongly suggested the daily frame was being used.
# The name `master_df_sorted` is kept because other cells read it.
master_df_sorted = trial_df.sort_values(by=['weekofyear', 'terminal', 'carpark_name'])

# Candidate date window for a narrower experiment (currently unused).
start_idea = '2023-06-01'
end_idea = '2023-11-01'
# master_df_sorted = master_df_sorted[master_df_sorted['date_occupied'].between(start_idea, end_idea)]

# 70/30 positional split on the sorted rows.
split_idx = int(len(master_df_sorted) * 0.7)
train_df = master_df_sorted.iloc[:split_idx]
test_df = master_df_sorted.iloc[split_idx:]

# Features and target; missing feature values are replaced with 0.
feature_cols = [
    'total_flights', 'total_passengers', 'intl_share', 'europe_share', 'asia_share',
    'na_share', 'africa_share', 'holiday_flag', 'dow', 'month', 'is_weekend'
]
X_train = train_df[feature_cols].fillna(0)
y_train = train_df['occupancy']
X_test = test_df[feature_cols].fillna(0)
y_test = test_df['occupancy']

In [None]:
# Fit model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict
y_pred_lr = lr.predict(X_test)

# Metrics
r2_score_value_lr = r2_score(y_test, y_pred_lr)
rmse_value_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae_value_lr = mean_absolute_error(y_test, y_pred_lr)
mse_value_lr = mean_squared_error(y_test, y_pred_lr)

print(f"R2 Score: {r2_score_value_lr:.4f}")
print(f"RMSE: {rmse_value_lr:.2f}")
print(f"MAE: {mae_value_lr:.2f}")
print(f"MSE: {mse_value_lr:.2f}")

In [None]:
# Univariate series: total occupancy per week of year, in week order.
ts = master_df_sorted.groupby('weekofyear')['occupancy'].sum().sort_index()

# 70/30 positional split. Note that this rebinds the module-level `split_idx`.
split_idx = int(len(ts) * 0.7)
train_ts, test_ts = ts.iloc[:split_idx], ts.iloc[split_idx:]

# ARIMA(1, 1, 1); the order is a simple default and can be tuned.
arima_model = ARIMA(train_ts, order=(1, 1, 1))
arima_result = arima_model.fit()

# Forecast exactly as many steps as there are held-out weeks.
forecast = arima_result.forecast(steps=len(test_ts))
y_pred_arima = forecast.values
y_test_arima = test_ts.values

# Hold-out metrics; RMSE is the square root of the MSE.
mse_arima = mean_squared_error(y_test_arima, y_pred_arima)
rmse_arima = np.sqrt(mse_arima)
mae_arima = mean_absolute_error(y_test_arima, y_pred_arima)
r2_arima = r2_score(y_test_arima, y_pred_arima)

print(f"ARIMA R2 Score: {r2_arima:.4f}")
print(f"ARIMA RMSE: {rmse_arima:.2f}")
print(f"ARIMA MAE: {mae_arima:.2f}")
print(f"ARIMA MSE: {mse_arima:.2f}")

In [None]:
# Compute correlation of features with target
corrs = train_df[feature_cols + ['occupancy']].corr()['occupancy'].drop('occupancy')
corrs_abs = corrs.abs().sort_values(ascending=False)

# Select top N features (e.g., top 8)
top_n = 8
selected_features = corrs_abs.head(top_n).index.tolist()
print("Selected features:", selected_features)

# Use selected features for training
X_train_corr = train_df[selected_features].fillna(0)
X_test_corr = test_df[selected_features].fillna(0)

# Fit model
rf_corr = RandomForestRegressor(
    n_estimators=500,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)
rf_corr.fit(X_train_corr, y_train)

# Predict
y_pred_rf_corr = rf_corr.predict(X_test_corr)

# Metrics
r2_rf_corr = r2_score(y_test, y_pred_rf_corr)
rmse_rf_corr = np.sqrt(mean_squared_error(y_test, y_pred_rf_corr))
mae_rf_corr = mean_absolute_error(y_test, y_pred_rf_corr)
mse_rf_corr = mean_squared_error(y_test, y_pred_rf_corr)

print(f"Random Forest (Corr) R2 Score: {r2_rf_corr:.4f}")
print(f"Random Forest (Corr) RMSE: {rmse_rf_corr:.2f}")
print(f"Random Forest (Corr) MAE: {mae_rf_corr:.2f}")
print(f"Random Forest (Corr) MSE: {mse_rf_corr:.2f}")

In [None]:
# Train XGBoost Regressor
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb.predict(X_test)

# Metrics
r2_xgb = r2_score(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

print(f"XGBoost R2 Score: {r2_xgb:.4f}")
print(f"XGBoost RMSE: {rmse_xgb:.2f}")
print(f"XGBoost MAE: {mae_xgb:.2f}")
print(f"XGBoost MSE: {mse_xgb:.2f}")

In [None]:
# One tuple per model: (name, R2, RMSE, MAE, MSE).
model_scores = [
    ('Linear Regression', r2_score_value_lr, rmse_value_lr, mae_value_lr, mse_value_lr),
    ('Random Forest (Corr)', r2_rf_corr, rmse_rf_corr, mae_rf_corr, mse_rf_corr),
    ('XGBoost', r2_xgb, rmse_xgb, mae_xgb, mse_xgb),
    ('ARIMA', r2_arima, rmse_arima, mae_arima, mse_arima),
]

# Transpose into column -> list-of-values form, the layout the DataFrame is built from.
metrics = {
    column: list(values)
    for column, values in zip(['Model', 'R2 Score', 'RMSE', 'MAE', 'MSE'], zip(*model_scores))
}

metrics_df = pd.DataFrame(metrics)

In [None]:
# Plot each metric
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
metric_names = ['R2 Score', 'RMSE', 'MAE', 'MSE']

for ax, metric in zip(axes.flatten(), metric_names):
    metrics_df.plot(x='Model', y=metric, kind='barh', ax=ax, legend=False)
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
    ax.set_xlabel('Model')
    ax.grid(True, axis='y')

plt.tight_layout()
plt.show()