### Author: Kyle Tranfaglia ###
### Course: DSCI 470 ###
### Assignment: The Project ###
### Last Updated: 11/04/24 ###

## Questions: ## 
### How do opening choices and event types differ across Elo ratings, and are certain openings more likely in specific events or Elo ranges? ###
### How do rating differences and player Elo impact game outcomes and the predictability of the game results? ### 
### Which game features, along with Elo difference, help to predict the game result, and how can these insights inform match-making criteria? ###

## Prepare for Analysis: Import Libraries and Read in Data

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay


# Load the data set; string 'Event' values have leading and trailing whitespace trimmed while parsing
df_chess = pd.read_csv(
    "Data/Big_chess_data.csv",
    converters={'Event': lambda value: value.strip() if isinstance(value, str) else value},
)


## Utility Functions ##

In [None]:
# Set up the plotting style: apply seaborn's 'whitegrid' theme for the figures drawn below
sns.set_theme(style='whitegrid')


# Function to plot bar charts
def plot_frequencies(counts, title, xlabel, ylabel, top_n=50):
    """Draw a log-scaled bar chart of the largest values in a frequency Series.

    Args:
        counts: pandas Series of frequencies indexed by category (for example
            the result of ``value_counts()``).
        title: Title placed above the chart.
        xlabel: Label for the x-axis (the categories).
        ylabel: Label for the y-axis (the frequencies).
        top_n: Maximum number of categories to draw. Only the ``top_n`` largest
            values are kept so the x-axis stays readable (defaults to 50).
    """
    # Features with too many categories cannot be displayed legibly, so keep only the largest ones
    counts = counts.nlargest(top_n)

    plt.figure(figsize=(12, 8))
    # hue mirrors x so the palette is spread across the bars; the legend would only repeat the x labels
    sns.barplot(x=counts.index, y=counts.values, hue=counts.index, legend=False, palette='viridis')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.yscale('log')  # Log scale keeps rare categories visible next to dominant ones
    plt.xticks(rotation=90)  # Rotate x labels for better readability
    plt.show()


# Extract the first word of a column entry
def extract_first_word(entry):
    """Return the first whitespace-delimited word of ``entry``.

    Args:
        entry: The string to take the first word from.

    Returns:
        The first word, or an empty string when ``entry`` is empty or contains
        only whitespace (instead of raising IndexError).
    """
    # maxsplit=1 stops after the first word, so the rest of the string is not tokenized
    words = entry.split(maxsplit=1)
    return words[0] if words else ''


# Extract the first two words of a column entry
def extract_first_two_words(entry):
    """Return the first two words of ``entry`` with colons and commas removed.

    Args:
        entry: The string to shorten (for example a full opening name).

    Returns:
        The first two words joined by a single space, the only word when there
        is just one, or an empty string when no words remain (instead of
        raising IndexError).
    """
    words = entry.replace(":", "").replace(",", "").split()  # Remove colons and commas, then split the string
    # Slicing never fails: it yields two, one or zero words, and join handles each case
    return ' '.join(words[:2])


# Assign a player strength classification to an Elo
def classify_elo(elo):
    """Map an Elo rating to a named strength tier.

    Tiers use inclusive upper bounds: up to 1200 is 'Beginner', up to 1800 is
    'Intermediate', up to 2000 is 'Advanced', up to 2200 is 'Expert'. Any value
    that satisfies none of those bounds is 'Master'.
    """
    tiers = (
        (1200, 'Beginner'),
        (1800, 'Intermediate'),
        (2000, 'Advanced'),
        (2200, 'Expert'),
    )
    # Walk the tiers from weakest to strongest and stop at the first bound that holds
    for upper_bound, label in tiers:
        if elo <= upper_bound:
            return label
    return 'Master'

## Explore the Data ##

In [None]:
'''Display basic information on the data'''
# Print summary statistics first, then a preview of the leading rows
summary_statistics = df_chess.describe()
print(summary_statistics)
leading_rows = df_chess.head()
print("\n", leading_rows)

'''Add five new columns to the dataset for analysis'''
# Label each player's strength tier from their own rating (White first, then Black)
for side in ('White', 'Black'):
    df_chess[f'{side}EloGroup'] = df_chess[f'{side}Elo'].apply(classify_elo)

# Per-game mean of the two ratings, plus the strength tier of that mean
df_chess['AvgElo'] = df_chess[['WhiteElo', 'BlackElo']].mean(axis=1)
df_chess['EloGroup'] = df_chess['AvgElo'].apply(classify_elo)

# Collapse opening variations into a generalized opening name (its first two words)
df_chess['generalized_Opening'] = df_chess['Opening'].apply(extract_first_two_words)

'''Get the mode for each feature, display the values in descending order, and represent it with a bar plot figure'''


# Count how often each value of a df_chess column occurs, most frequent first
def _descending_counts(column):
    return df_chess[column].value_counts().sort_values(ascending=False)


# Generalized openings
opening_counts = _descending_counts('generalized_Opening')
print("\nFrequency of", opening_counts)
plot_frequencies(opening_counts, 'Frequencies of Openings', 'Opening', 'Count')

# ECO codes
ECO_counts = _descending_counts('ECO')
print("\nFrequency of", ECO_counts)
plot_frequencies(ECO_counts, 'Frequencies of ECO', 'ECO', 'Count')

# Time controls
time_control_counts = _descending_counts('TimeControl')
print("\nFrequency of", time_control_counts)
plot_frequencies(time_control_counts, 'Frequencies of Time Control', 'Time Control', 'Count')

# Event types
event_counts = _descending_counts('Event')
print("\nFrequency of", event_counts)
plot_frequencies(event_counts, 'Frequencies of Events', 'Event', 'Count')

# Game terminations
termination_counts = _descending_counts('Termination')
print("\nFrequency of", termination_counts)
plot_frequencies(termination_counts, 'Frequencies of Terminations', 'Termination', 'Count')

# Elo groups (based on each game's average Elo)
elo_group_counts = _descending_counts('EloGroup')
print("\nFrequency of", elo_group_counts)
plot_frequencies(elo_group_counts, 'Frequencies of Elo Groups', 'Elo Group', 'Count')

'''Plot a historgram and boxplots for Elo distributions by event'''
# Histogram of the per-game average Elo
df_chess[['AvgElo']].plot(kind='hist', bins=50, alpha=0.5, title='Elo Distribution', figsize=(10, 8))

# Create the boxplot for Average Elo ratings by Event.
# DataFrame.boxplot(by=...) opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(figsize=...) would only leave a blank extra figure and its size would be ignored.
df_chess.boxplot(column='AvgElo', by='Event', figsize=(10, 8))

plt.title('Average Elo Distribution by Event')
plt.xlabel('Event')
plt.ylabel('Average Elo')

plt.xticks(rotation=45, ha='right')
plt.suptitle('')  # Remove the automatic generated title
plt.show()

## Explore the Question: How do opening choices and event types differ across Elo ratings, and are certain openings more likely in specific events or Elo ranges? ##

In [None]:
'''Calculate and display the average elo for black, white, and combined with a classification for the elo for each generalized opening'''
# Mean White and Black rating of the games played with each generalized opening
avg_opening_elo = df_chess.groupby('generalized_Opening')[['WhiteElo', 'BlackElo']].mean()

# Add the combined mean of both colors, then classify each opening by that combined mean
avg_opening_elo = avg_opening_elo.assign(AvgElo=avg_opening_elo.mean(axis=1))
avg_opening_elo['EloGroup'] = avg_opening_elo['AvgElo'].apply(classify_elo)
print("Elo by Chess Opening", avg_opening_elo)

# List the openings of every Elo group, highest combined average first
print("\nOpening by Average Elo Group")
for group, openings in avg_opening_elo.groupby('EloGroup'):
    print(f"\nElo Group: {group}")
    ranked_openings = openings.sort_values(by='AvgElo', ascending=False)
    print(ranked_openings[['AvgElo']])

# Bar plot of the combined average per opening, colored by the opening's Elo classification
plt.figure(figsize=(12, 8))
opening_names = avg_opening_elo.index
sns.barplot(x=opening_names, y=avg_opening_elo['AvgElo'], hue=avg_opening_elo['EloGroup'], palette="coolwarm")
plt.title('Average Elo by Generalized Opening')
plt.xlabel('Generalized Opening A -> Z')
plt.ylabel('Average Elo')
plt.xticks([])  # Remove x-axis labels
plt.legend(title='Elo Classification')
plt.show()

'''Use the elo group frequencies for each opening and the total number of players in each elo group to normalize the mode of each 
opening within an elo group and display the most frequent elo group for each generalized opening'''

# Combine White and Black Elo groups into a single DataFrame for counting (one row per player per game)
elo_groups_combined = pd.concat([
    df_chess[['generalized_Opening', 'WhiteEloGroup']].rename(columns={'WhiteEloGroup': 'EloGroup'}),
    df_chess[['generalized_Opening', 'BlackEloGroup']].rename(columns={'BlackEloGroup': 'EloGroup'})])

# Count the frequency of each Elo group for each opening & count the total number of players in each Elo group
elo_group_counts = elo_groups_combined.groupby(['generalized_Opening', 'EloGroup']).size().reset_index(name='Count')
total_players_by_elo_group = elo_group_counts.groupby('EloGroup')['Count'].sum()

# Normalize the frequency of openings within each Elo group by dividing by the total number of players in that group
# This is significant as the elo groups are imbalanced which would lead to bias
# (vectorized: look up each row's group total with map instead of a row-wise apply)
elo_group_counts['NormalizedCount'] = (
    elo_group_counts['Count'] / elo_group_counts['EloGroup'].map(total_players_by_elo_group))

# For each opening, find the Elo group with the highest normalized frequency.
# idxmax + loc selects the winning rows directly, so the 'generalized_Opening' column is always kept;
# groupby.apply on the grouping column is deprecated in recent pandas and may drop that column.
top_row_per_opening = elo_group_counts.groupby('generalized_Opening')['NormalizedCount'].idxmax()
most_frequent_elo_group = elo_group_counts.loc[top_row_per_opening].reset_index(drop=True)
print("\nOpeining by Most Frequent Elo Group", most_frequent_elo_group)

# Iterate through the groups and display the openings in each Elo Group
print("\nOpening by Most Frequent Elo Group")
for group, openings in most_frequent_elo_group.groupby('EloGroup'):
    print(f"\nElo Group: {group}")
    print(openings[['generalized_Opening', 'NormalizedCount']].sort_values(by='NormalizedCount', ascending=False))

# Pivot for stacked bar chart of Elo group frequencies by opening
elo_group_pivot = elo_group_counts.pivot(index='generalized_Opening', columns='EloGroup', values='NormalizedCount')
elo_group_pivot = elo_group_pivot.fillna(0)  # Fill NaN with zero for missing counts
elo_group_pivot = elo_group_pivot[elo_group_pivot.max(axis=1) > 0.05]  # Keep openings whose largest normalized frequency exceeds 0.05

# Stacked bar chart
elo_group_pivot.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='viridis')
plt.title('Most Frequent Elo Group by Generalized Opening')
plt.xlabel('Generalized Opening')
plt.ylabel('Normalized Frequency')
plt.xticks(rotation=90)
plt.legend(title='Elo Group')
plt.show()

'''Calculate and display the top 5 most frequent openings played for each elo group with a percentage of the total games that the opening was played'''
# Calculate the frequency of each 'generalized_Opening' within each 'EloGroup' and keep the 5 largest per group
top_openings_by_elo_group = (
    df_chess.groupby('EloGroup')['generalized_Opening']
    .value_counts()
    .groupby(level=0)
    .nlargest(5)
    .reset_index(level=0, drop=True)  # nlargest prepends a duplicate group level; drop it
).reset_index(name='Frequency')

total_games_by_elo_group = df_chess['EloGroup'].value_counts()  # Calculate the total games for each Elo group

# Calculate the percentage of total games each opening represents within its Elo group
top_openings_by_elo_group['Percentage'] = top_openings_by_elo_group.apply(
    lambda row: f"{(row['Frequency'] / total_games_by_elo_group[row['EloGroup']]) * 100:.2f}%", axis=1)
print("\nTop Openings by Elo Group", top_openings_by_elo_group)

# Convert the 'Percentage' column to numeric values, removing the '%' symbol
top_openings_by_elo_group['PercentageNumeric'] = top_openings_by_elo_group['Percentage'].str.rstrip('%').astype(float)

# Define the desired order of Elo groups.
# The spelling must match the labels stored in 'EloGroup' ('Beginner', 'Intermediate', ...):
# values missing from the category list become NaN, which would leave every subplot empty.
elo_order = ['Beginner', 'Intermediate', 'Advanced', 'Expert', 'Master']

# Ensure 'EloGroup' is a categorical column with the specified order
top_openings_by_elo_group['EloGroup'] = pd.Categorical(
    top_openings_by_elo_group['EloGroup'], categories=elo_order, ordered=True
)

# Sort the data by 'EloGroup' to reflect the desired order
top_openings_by_elo_group = top_openings_by_elo_group.sort_values('EloGroup')

# Horizontal bar charts for top 5 openings per Elo group using percentage (one subplot per group)
fig, axes = plt.subplots(nrows=len(elo_order), figsize=(10, 16), sharex=True)

for i, elo_group in enumerate(elo_order):
    subset = top_openings_by_elo_group[top_openings_by_elo_group['EloGroup'] == elo_group]
    subset = subset.sort_values('PercentageNumeric', ascending=True)  # Sort to have largest on top
    axes[i].barh(subset['generalized_Opening'], subset['PercentageNumeric'], color='skyblue')
    axes[i].set_title(f"Top Openings in Elo Group {elo_group.capitalize()}")
    axes[i].set_xlabel("Percentage (%)")
    axes[i].set_ylabel("Opening")
    for index, value in enumerate(subset['Percentage']):  # Add text labels for percentages
        axes[i].text(subset['PercentageNumeric'].iloc[index], index, value, ha='right', va='center', fontsize=9)

plt.tight_layout()
plt.show()

'''Calculate and display the top 5 most frequent openings played for each event with a percentage of the total games that the opening was played'''
# Count every 'generalized_Opening' inside each 'Event', then keep the 5 most frequent openings per event
opening_counts_per_event = df_chess.groupby('Event')['generalized_Opening'].value_counts()
top_openings_by_event = (
    opening_counts_per_event
    .groupby(level=0)
    .nlargest(5)
    .reset_index(level=0, drop=True)
    .reset_index(name='Frequency')
)

# Total number of games played in each event
total_games_by_event = df_chess['Event'].value_counts()

# Share (in percent) of an event's games in which each of its top openings was played
event_share = top_openings_by_event.apply(
    lambda row: (row['Frequency'] / total_games_by_event[row['Event']]) * 100, axis=1
)

# Show the table with the share formatted as text ...
top_openings_by_event['Percentage'] = event_share.map(lambda share: f"{share:.2f}%")
print("\nTop Openings by Event", top_openings_by_event)

# ... then keep the numeric share in the column for plotting
top_openings_by_event['Percentage'] = event_share

# Grouped bar chart: one cluster per event, one bar per top opening
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    data=top_openings_by_event,
    x='Event',
    y='Percentage',
    hue='generalized_Opening',
    palette='cool',
    dodge=True,
)
plt.title('Top 5 Openings by Event (Percentage)')
plt.xlabel('Event')
plt.ylabel('Percentage (%)')
plt.xticks(rotation=90)
plt.legend(title='Opening')
plt.show()

'''Calculate and display the average black, white, and combined elo for every event type with a classification group for each event'''
# Mean White and Black rating per event type, stored under descriptive column names
avg_elo_by_event = (
    df_chess.groupby('Event')[['WhiteElo', 'BlackElo']]
    .mean()
    .rename(columns={'WhiteElo': 'Avg White Elo', 'BlackElo': 'Avg Black Elo'})
)

# Combined average of both colors and the Elo group that average falls into
avg_elo_by_event['Avg Combined Elo'] = avg_elo_by_event[['Avg White Elo', 'Avg Black Elo']].mean(axis=1)
avg_elo_by_event['Elo Classification'] = avg_elo_by_event['Avg Combined Elo'].apply(classify_elo)
print("\nElo and Elo Group by Event", avg_elo_by_event)

# Scatter plot of average White vs. average Black Elo, one distinctly colored point per event type
plt.figure(figsize=(12, 8))
event_names = avg_elo_by_event.index  # The event type drives the hue
sns.scatterplot(
    x=avg_elo_by_event['Avg White Elo'],
    y=avg_elo_by_event['Avg Black Elo'],
    hue=event_names,
    palette='tab20',  # Palette with many distinct colors
    s=100,
    edgecolor='black',
)

# Plot formatting
plt.title('Average White and Black Elo by Event Type')
plt.xlabel('Average White Elo')
plt.ylabel('Average Black Elo')
plt.legend(title='Event Type', bbox_to_anchor=(1.05, 1), loc='upper left')  # Legend sits outside the axes for clarity
plt.show()

'''Calculate and display the most frequent elo group in each event using normalized counts'''
# Combine White and Black Elo groups into a single DataFrame for frequency calculation (one row per player per game)
elo_groups_combined = pd.concat([
    df_chess[['Event', 'WhiteEloGroup']].rename(columns={'WhiteEloGroup': 'EloGroup'}),
    df_chess[['Event', 'BlackEloGroup']].rename(columns={'BlackEloGroup': 'EloGroup'})])

# Count the occurrences of each Elo group for each event & calculate the total number of players in each Elo group
elo_group_counts_by_event = elo_groups_combined.groupby(['Event', 'EloGroup']).size().reset_index(name='Count')
total_players_by_elo_group = elo_group_counts_by_event.groupby('EloGroup')['Count'].sum()

# Normalize the frequency of each Elo group within each event by that Elo group's total player count
# (vectorized: look up each row's group total with map instead of a row-wise apply)
elo_group_counts_by_event['NormalizedCount'] = (
    elo_group_counts_by_event['Count'] / elo_group_counts_by_event['EloGroup'].map(total_players_by_elo_group)
)

# Find the most frequent Elo group for each event based on normalized counts.
# idxmax + loc selects the winning rows directly, so the 'Event' column is always kept;
# groupby.apply on the grouping column is deprecated in recent pandas and may drop that column.
top_row_per_event = elo_group_counts_by_event.groupby('Event')['NormalizedCount'].idxmax()
most_frequent_elo_group_by_event = elo_group_counts_by_event.loc[top_row_per_event].reset_index(drop=True)
print("\nMost Frequent Elo Group by Event", most_frequent_elo_group_by_event)

# Pivot for heatmap data (events as rows, Elo groups as columns, 0 where a combination never occurs)
heatmap_data = elo_group_counts_by_event.pivot(index='Event', columns='EloGroup', values='NormalizedCount').fillna(0)

# Heatmap of normalized Elo group frequencies by event
plt.figure(figsize=(12, 10))
sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt=".2f", linewidths=.5)
plt.title('Most Frequent Elo Group by Event (Normalized)')
plt.xlabel('Elo Group')
plt.ylabel('Event')
plt.show()

## Explore the Question: How do rating differences and player Elo groups impact game outcomes and the predictability of the game results? ##

In [None]:
'''Bin the Elo differences and calculate the outcome probabilities within each Elo bin and each Elo Group for each Elo bin'''
# Calculate the Elo difference between White and Black (keeping sign to indicate advantage)
df_chess['EloDifference'] = df_chess['WhiteElo'] - df_chess['BlackElo']

# Define Elo difference bins and labels (bins are right-inclusive, so (100, 150] is '101 to 150')
elo_diff_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 350, 400, 450, 500, float('inf')]
elo_diff_labels = ['0 to 10', '11 to 20', '21 to 30', '31 to 40', '41 to 50', '51 to 60', '61 to 70', '71 to 80', '81 to 90', '91 to 100', '101 to 150', '151 to 200', '201 to 250', '251 to 300', '301 to 350', '351 to 400', '401 to 450', '451 to 500', '500+']

# Create a new column for Elo difference bins.
# include_lowest=True puts a difference of exactly 0 into '0 to 10'; without it the first bin is (0, 10]
# and equally rated games would be dropped. Negative differences (Black rated higher) fall outside
# every bin, stay NaN, and are therefore excluded from the grouped statistics below.
df_chess['EloDiffBin'] = pd.cut(
    df_chess['EloDifference'], bins=elo_diff_bins, labels=elo_diff_labels, include_lowest=True)

# Calculate outcome probabilities within each EloDiffBin ('*' results are removed after normalizing)
outcome_probs_by_diff = (
    df_chess.groupby('EloDiffBin', observed=False)['Result']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .drop(columns='*', errors='ignore')
)
print("\nWhite win Probability by Elo Difference\n", outcome_probs_by_diff)

outcome_probs_by_diff.plot(kind='bar', stacked=True, figsize=(12, 8), color=['green', 'blue', 'red'])
plt.title('Game Outcome Probabilities by Elo Difference (White Advantage)')
plt.xlabel('Elo Difference (White - Black)')
plt.ylabel('Probability')
plt.legend(title='Result')
plt.xticks(rotation=45)
plt.show()

# Calculate outcome probabilities for each Elo Group and Elo difference bin
outcome_probs_by_elo_and_diff = (
    df_chess.groupby(['WhiteEloGroup', 'EloDiffBin'], observed=False)['Result']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .drop(columns='*', errors='ignore')
)
print("\nWhite win Probability for each Elo Group by Elo Difference\n", outcome_probs_by_elo_and_diff)

# Heatmap of the White-win ('1-0') probability: White Elo group as rows, Elo difference bin as columns
plt.figure(figsize=(12, 10))
sns.heatmap(outcome_probs_by_elo_and_diff.loc[:, '1-0'].unstack(), annot=True, fmt=".2f", cmap="YlGnBu")
plt.title('White Win Probability by Elo Group and Elo Difference Bin')
plt.xlabel('Elo Difference Bin')
plt.ylabel('White Elo Group')
plt.show()

'''Calculat and plot white win ratios'''
# Flag White wins: 1 when the result is '1-0', 0 for every other result
white_won = df_chess['Result'] == '1-0'
df_chess['BinaryResult'] = white_won.astype('int64')

# Share of White wins inside every Elo difference bin
games_by_diff_bin = df_chess.groupby('EloDiffBin', observed=False)
win_ratios = games_by_diff_bin['BinaryResult'].mean()
print("\nWhite win ratios for each Elo Group by Elo Difference\n", win_ratios)

# Line chart of the White win ratio across the Elo difference bins
win_ratios.plot(kind='line', marker='o', color='green', figsize=(12, 8))
plt.title('White Win Ratios by Elo Difference Bin')
plt.xlabel('Elo Difference (White - Black)')
plt.ylabel('White Win Probability')
plt.grid(True)
plt.show()

'''Train a Logistic Regression model on Elo data to predict game outcomes as binary results'''
# Select features and labels for model input (label: 1 = White win, 0 = any other result)
X = df_chess[['EloDifference', 'WhiteElo', 'BlackElo', 'AvgElo']]
y = df_chess['BinaryResult']

# Split the data into training and testing sets: 50% for training, 10% for testing, the rest is left unused
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.1, random_state=42)

# Train the logistic regression model
binary_result_model = LogisticRegression()
binary_result_model.fit(X_train, y_train)

# Evaluate the model on the test data
binary_result_predictions = binary_result_model.predict(X_test)
binary_result_accuracy = accuracy_score(y_test, binary_result_predictions)
print("Binary Result Model Classification Report:\n", classification_report(y_test, binary_result_predictions), 
      "Binary Result Model Accuracy:", binary_result_accuracy)

# Binary model confusion matrix.
# from_predictions creates its own figure unless it is given an axes, so the sized axes is passed in;
# a bare plt.figure(figsize=...) beforehand would leave a blank figure and the size would be ignored.
cm_fig, cm_ax = plt.subplots(figsize=(12, 10))
disp = ConfusionMatrixDisplay.from_predictions(y_test, binary_result_predictions, ax=cm_ax)
cm_ax.set_title('Confusion Matrix for Binary Result Model')
cm_ax.set_xticks([0, 1])
cm_ax.set_xticklabels(["loss", "win"])
cm_ax.set_yticks([0, 1])
cm_ax.set_yticklabels(["loss", "win"])
cm_ax.grid(False)  # The whitegrid theme would otherwise draw grid lines over the matrix cells
plt.show()



## Explore the Question: Which game features, along with Elo difference, help to predict the game result, and how can these insights inform match-making criteria? ##

In [None]:
'''Plot the relation between game results and elo difference and find the correlation between game results and elo difference'''
# Visualize Game results in relation to Elo Difference
df_chess_filtered = df_chess[df_chess['Result'] != '*']  # Filter out rows with '*' in the 'Result' column
sns.boxplot(x='Result', y='EloDifference', data=df_chess_filtered)
plt.title('Elo Difference vs Game Result')
plt.show()

# Correlation between Elo difference and the binary result on a reproducible sample of games.
# Both columns are already numeric, so no one-hot encoding is needed for this step, and the sample
# size is capped at the number of available rows so smaller data sets do not raise an error.
correlation_sample = df_chess.sample(n=min(1000000, len(df_chess)), random_state=42)
correlation = correlation_sample[['EloDifference', 'BinaryResult']].corr()
print("Correlation:\n", correlation)

'''Use one-hot encoding and further prepare data for model training and testing, then train and test a random forest model, then determine the feature importances'''
# Calculate White win probability by EloDiffBin.
# The mean of BinaryResult must be taken over ALL games in a bin; filtering to White wins first
# would make every bin's "probability" exactly 1.0 and the feature meaningless.
# NOTE(review): the probabilities are computed on the full data set before the train/test split,
# so the test rows contribute to this feature — confirm this is acceptable.
white_win_probs_by_diff = (
    df_chess.groupby('EloDiffBin', observed=False)['BinaryResult']
    .mean()
)

# Map each game's bin to its White win probability.
# Mapping a categorical column can return a categorical result, which cannot be filled with a new
# value, so cast to float first. Games without a bin, and bins without games, get 0.
df_chess['WhiteWinProbability'] = (
    df_chess['EloDiffBin'].map(white_win_probs_by_diff).astype(float).fillna(0)
)

# For testing purposes, use a smaller subset of the data
df_chess_small = df_chess.sample(n=20000, random_state=42)  # Use a subset

# One-hot encode categorical features for a smaller dataset
df_chess_encoded = pd.get_dummies(df_chess_small, columns=['Event', 'TimeControl', 'generalized_Opening'], drop_first=True)

# Prepare features and target for Random Forest: the two numeric features plus every one-hot column
one_hot_prefixes = ('Event_', 'TimeControl_', 'generalized_Opening_')
one_hot_columns = [col for col in df_chess_encoded.columns if col.startswith(one_hot_prefixes)]
X = df_chess_encoded[['EloDifference', 'WhiteWinProbability'] + one_hot_columns]
y = df_chess_encoded['BinaryResult']

# Split data into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the Random Forest on the training split (n_jobs=-1 uses all cores for parallelism)
model_rf = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_train, y_train)
holdout_accuracy = accuracy_score(y_test, model_rf.predict(X_test))

# 3-fold cross-validation over the whole sample (3 folds for speed): out-of-fold predictions and per-fold accuracy
y_pred = cross_val_predict(model_rf, X, y, cv=3)
cv_scores = cross_val_score(model_rf, X, y, cv=3, scoring='accuracy')

# Report the cross-validated metrics, then the accuracy of the fitted model on the held-out test split
print("Classification Report:\n", classification_report(y, y_pred))
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Average Accuracy: {cv_scores.mean():.4f}")
print("Accuracy Score: ", holdout_accuracy)

# Confusion matrix of the out-of-fold predictions, drawn as an annotated heatmap
cm = confusion_matrix(y, y_pred)
outcome_names = ["Loss", "Win"]
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=outcome_names, yticklabels=outcome_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Importance of every feature in the fitted forest, listed from most to least important
features = X.columns
feature_importances = model_rf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
ranked_importances = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importances:\n", ranked_importances)


