### Author: Kyle Tranfaglia ###
### Course: DSCI 470 ###
### Assignment: The Project ###
### Last Updated: 10/23/24 ###

## Background: ##

## Questions: ## 
### How do rating differences and player Elo impact game outcomes and the predictability of the game results? ### 
### How do opening choices and event types differ across Elo ratings, and are certain openings more likely in specific events or Elo ranges? ###
### Which game features (event type, opening code, result) have the most predictive power for Elo, and how can these insights inform match-making criteria? ###

## Prepare for Analysis: Import Libraries and Read in Data

In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the first million rows of the data set, trimming leading/trailing whitespace from 'Event' values
event_converters = {'Event': lambda value: value.strip() if isinstance(value, str) else value}
df_chess = pd.read_csv("Data/Big_chess_data.csv", nrows=1_000_000, converters=event_converters)


## Utility Functions ##

In [8]:
# Use seaborn's 'whitegrid' theme as the plotting style
sns.set_theme(style="whitegrid")


# Function to plot bar charts
def plot_frequencies(counts, title, xlabel, ylabel, top_n=50):
    """Draw a log-scaled bar chart of the largest category frequencies.

    Args:
        counts: Frequencies indexed by category label; expected to be a
            pandas Series such as the result of ``value_counts()``.
        title: Title displayed above the chart.
        xlabel: Label for the x-axis (the categories).
        ylabel: Label for the y-axis (the counts).
        top_n: Maximum number of categories to draw. Only the ``top_n``
            largest values are kept so the x-axis stays readable. Pass
            ``None`` to plot every category. Defaults to 50.
    """
    # Handle frequencies with too many categories to appropriately display:
    # keep only the largest ones (nlargest also orders them by descending count)
    if top_n is not None:
        counts = counts.nlargest(top_n)

    plt.figure(figsize=(12, 8))
    # hue mirrors x so every bar takes its own palette colour; the legend is
    # switched off because it would only repeat the x tick labels
    sns.barplot(x=counts.index, y=counts.values, hue=counts.index, legend=False, palette='viridis')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.yscale('log')  # Log scale keeps rare categories visible next to dominant ones
    plt.xticks(rotation=90)  # Rotate x labels for better readability
    plt.show()


# Extract the first word of a column entry
def extract_first_word(entry):
    """Return the first whitespace-delimited word of ``entry``.

    Args:
        entry: The cell value to shorten.

    Returns:
        The first word of ``entry``. A blank or whitespace-only string yields
        ``""`` rather than raising ``IndexError``, and non-string values
        (for example a NaN from a missing cell) are returned unchanged.
    """
    if not isinstance(entry, str):
        return entry
    # maxsplit=1 stops splitting as soon as the first word has been isolated
    words = entry.split(maxsplit=1)
    return words[0] if words else ""


# Extract the first two words of a column entry
def extract_first_two_words(entry):
    """Return the first two words of ``entry`` with colons removed.

    Args:
        entry: The cell value to shorten.

    Returns:
        The first two whitespace-delimited words joined by a single space,
        the single word if there is only one, or ``""`` if nothing remains
        once colons and whitespace are removed. Non-string values (for
        example a NaN from a missing cell) are returned unchanged.
    """
    if not isinstance(entry, str):
        return entry
    # Remove colons and split the string; only the first two words are needed
    words = entry.replace(":", "").split(maxsplit=2)
    # Slicing copes with zero or one word, so no special case is required
    return ' '.join(words[:2])

## Explore the Data ##

In [35]:
# Display some data, statistics, and summary info
# print(df_chess.describe())
# print("\n", df_chess.head())

# # Create a new column with generalized openings (group all opening variations)
# df_chess['generalized_Opening'] = df_chess['Opening'].apply(extract_first_two_words)

# # Get and display data frequencies for some features
# opening_counts = df_chess['generalized_Opening'].value_counts().sort_index()
# print("\nFrequency of", opening_counts)
# plot_frequencies(opening_counts, 'Frequencies of Openings', 'Opening', 'Count')

# ECO_counts = df_chess['ECO'].value_counts().sort_index()
# print("\nFrequency of", ECO_counts)
# plot_frequencies(ECO_counts, 'Frequencies of ECO', 'ECO', 'Count')

# time_control_counts = df_chess['TimeControl'].value_counts().sort_index()
# print("\nFrequency of", time_control_counts)
# plot_frequencies(time_control_counts, 'Frequencies of Time Control', 'Time Control', 'Count')

# event_counts = df_chess['Event'].value_counts().sort_index()
# print("\nFrequency of", event_counts)
# plot_frequencies(event_counts, 'Frequencies of Events', 'Event', 'Count')

# termination_counts = df_chess['Termination'].value_counts().sort_index()
# print("\nFrequency of", termination_counts)
# plot_frequencies(termination_counts, 'Frequencies of Terminations', 'Termination', 'Count')


# df_chess[['BlackElo', 'WhiteElo']].plot(kind='hist', bins=50, alpha=0.5, title='Elo Distribution', figsize=(10, 8))

# # Create the boxplot with subplots for White and Black Elo ratings
# fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# # Boxplot for WhiteElo
# df_chess.boxplot(column='WhiteElo', by='Event', ax=axes[0])
# axes[0].set_title('White Elo Distribution by Event')
# axes[0].set_xlabel('Event')
# axes[0].set_ylabel('White Elo')

# # Boxplot for BlackElo
# df_chess.boxplot(column='BlackElo', by='Event', ax=axes[1])
# axes[1].set_title('Black Elo Distribution by Event')
# axes[1].set_xlabel('Event')
# axes[1].set_ylabel('Black Elo')

# # Rotate x-axis labels for both subplots
# for ax in axes:
#     plt.sca(ax)
#     plt.xticks(rotation=45, ha='right')

# plt.suptitle('')  # Remove the automatic generated title
# plt.show()  # Show the plot

# avg_opening_elo = df_chess.groupby('generalized_Opening')[['WhiteElo', 'BlackElo']].mean()

# Number of most frequent generalized openings to report for each event
top_n_openings = 5

# 'generalized_Opening' is a derived column (first two words of 'Opening'); build it here when
# it is missing so this cell does not fail with a KeyError on a freshly loaded DataFrame
if 'generalized_Opening' not in df_chess.columns:
    df_chess['generalized_Opening'] = df_chess['Opening'].apply(extract_first_two_words)

# Get the most frequent openings for each event
top_openings_by_event = (
    df_chess.groupby('Event')['generalized_Opening']
    .value_counts()
    .groupby(level=0)
    .nlargest(top_n_openings)
    .reset_index(level=0, drop=True)  # Drop the duplicate 'Event' level added by the second groupby
)

# Convert the Series to a DataFrame with the counts in a 'Frequency' column
top_openings_by_event = top_openings_by_event.reset_index(name='Frequency')

# Total number of games per event, used as the denominator for the percentages
total_games_by_event = df_chess['Event'].value_counts()

# Share of each event's games played with the opening, formatted as text such as "13.13%"
event_totals = top_openings_by_event['Event'].map(total_games_by_event)
top_openings_by_event['Percentage'] = (
    top_openings_by_event['Frequency'] / event_totals * 100
).map("{:.2f}%".format)
print(top_openings_by_event)


# Mean White and Black Elo for each event type, plus 'AvgElo': the mean of those two
# per-event averages (a combined rating level for the event)
avg_elo_by_event = (
    df_chess.groupby('Event')[['WhiteElo', 'BlackElo']]
    .mean()
    .assign(AvgElo=lambda elo_means: elo_means.mean(axis=1))
)

# Display the per-event averages
print(avg_elo_by_event)







                   Event   generalized_Opening  Frequency Percentage
0                  Blitz      Sicilian Defense      49206     13.13%
1                  Blitz        French Defense      27997      7.47%
2                  Blitz          Queen's Pawn      25640      6.84%
3                  Blitz        Queen's Gambit      17548      4.68%
4                  Blitz  Scandinavian Defense      17426      4.65%
5                  Blitz           King's Pawn      13402      3.58%
6                  Blitz     Caro-Kann Defense      12771      3.41%
7       Blitz tournament      Sicilian Defense      11808     15.58%
8       Blitz tournament        French Defense       5387      7.11%
9       Blitz tournament          Queen's Pawn       4988      6.58%
10      Blitz tournament  Scandinavian Defense       3640      4.80%
11      Blitz tournament     Caro-Kann Defense       3308      4.36%
12      Blitz tournament        Queen's Gambit       3264      4.31%
13      Blitz tournament       Eng

## Explore the Question: How do rating differences and player Elo impact game outcomes and the predictability of the game results? ##