# ML Project- Chess games

#### Idan Vazana, 204154207
#### Karin Tatzat, 201048691
#### Keren Kaplan, 205681646

## Import & Read data

In [3]:
import pandas as pd
import random
import datetime
import numpy as np
import os
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, classification_report, confusion_matrix, plot_confusion_matrix
#from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.metrics import confusion_matrix
#from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans

import plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

from string import ascii_letters
#from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

from scipy.stats import chi2_contingency, f_oneway

from category_encoders.count import CountEncoder

from sklearn.metrics import ConfusionMatrixDisplay
from yellowbrick.cluster import KElbowVisualizer

ModuleNotFoundError: No module named 'plotly'

In [None]:
df = pd.read_csv('/Users/idanvazana/Desktop/games.csv')

In [None]:
df.head(1)

## Preparation

### Handling with duplicates

In [None]:
df.shape

In [None]:
df = df.drop_duplicates()
df.shape

###### We saw that there are duplicate rows with the same game id,
###### so we decided to remove them.

### Creating new columns

In [None]:
df.columns

##### create integer columns for the black and white player ids and the game id:

In [None]:
df['white_id_int'] = pd.factorize(df['white_id'])[0]

In [None]:
df['black_id_int'] = pd.factorize(df['black_id'])[0]

In [None]:
df['id_int'] = pd.factorize(df['id'])[0]

##### create a column that calculate the difference between the black and white rating:

Assuming that the bigger the difference, the higher the chance for the higher ranked to win

In [None]:
df['rating_difference'] = df['white_rating'] - df['black_rating']

 ##### create a column - opening pref based on opening name: 

In [None]:
def _opening_prefix(name):
    """Return the first two space-separated words of ``name``, minus one trailing ':'."""
    prefix = ' '.join(name.split(' ')[:2])
    if str(prefix).endswith(':'):
        return prefix[:-1]
    return prefix


# Group openings by the leading two words of their full name.
df['opening_pref'] = df['opening_name'].apply(_opening_prefix)

##### create a column time_control based on increment time  

In [None]:
def time_control(df):
    """Combine a row's increment code and turn count into a single number.

    ``df`` is one row (anything indexable by 'increment_code' and 'turns').
    The increment code is split on '+' and each piece parsed as an integer.
    The result is the first integer plus the number of whole 60s contained in
    (second integer * turns / 2).
    NOTE(review): presumably base minutes plus per-move increment seconds
    converted to minutes - confirm against the dataset description.
    """
    parts = [int(piece) for piece in df['increment_code'].split('+')]
    base = parts[0]
    increment = parts[1]
    # Half of the turns belong to each player, so the increment is counted turns/2 times.
    accumulated = np.floor((increment * df['turns'] / 2) / 60)
    return base + accumulated
df['time_control'] = df.apply(time_control, axis=1)

##### create 5 new columns for the first moves and save into new dataframe:

In [None]:
## Split 'moves' column into 5 new columns for the first 5 moves.
# Tokenise the move list once and reuse it for every new column instead of
# re-splitting the whole column five times.
split_moves = df['moves'].str.split(" ")
df = df.assign(**{'move{}'.format(i + 1): split_moves.str[i] for i in range(5)})

# Extract the first five elements from the 'moves' column
#df = pd.concat([df.drop(['moves'], axis=1)], axis=1)

##### Dropping games that ended in a draw, to turn the problem into a binary classification:

In [None]:
df = df[~df['winner'].isin(['draw'])]

##### Dropping dates columns

In [None]:
# Rebind rather than mutate in place: df is a filtered selection at this point,
# and in-place edits on such a selection can trigger pandas' chained-assignment warning.
df = df.drop(columns=['created_at', 'last_move_at'])

##### create dummies dataset for the 5 first moves

In [None]:
df_dummies = pd.get_dummies(df, columns=['winner', 'victory_status', 'move1', 'move2', 'move3', 'move4', 'move5'], drop_first=False)

In [None]:
print(df_dummies.shape)
print(df.shape)

In [None]:
# Re-attach the original categorical columns to the one-hot encoded frame.
# df_dummies was built from df and shares its index, so an index join pairs each
# row with itself. Merging on 'id_int' would multiply rows for any game id that
# appears more than once.
df_merge = df_dummies.join(
    df[['winner', 'victory_status', 'move1', 'move2', 'move3', 'move4', 'move5']]
).reset_index(drop=True)

## Columns explanation:

## Exploration

In [None]:
# Report how many distinct values every column has, listing them when there are fewer than 5.
for column in df:
    try:
        unique_vals = np.unique(df[column])
    except TypeError:
        # np.unique sorts its input, which fails for columns mixing types
        # (for example strings alongside NaN). Any other error is a real
        # problem and is allowed to propagate.
        print ("Columns ",column, " contains Null Values")
        continue

    nr_values = len(unique_vals)
    if nr_values < 5:
        print ('The number of values for feature {} : {} -- {}'.format(column, nr_values,unique_vals))
    else:
        print ('The number of values for feature {} : {}'.format(column, nr_values))

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
print(df['winner'].unique())
print(df['victory_status'].unique())

## Analysis

### One-dimensional analysis

In [None]:
df.columns

In [None]:
df.head(3)

####  rated

In [None]:
df_rated = df['rated'].value_counts().reset_index()
df_rated

In [None]:
value_counts = df['rated'].value_counts()
value_counts_percent = value_counts / len(df) * 100
value_counts_percent.plot(kind='bar')
plt.title('Percentage of Value Counts')
plt.xlabel('Number of rated')
plt.ylabel('Percentage (%)')
plt.show()

In [None]:
# One-way ANOVA (scipy's f_oneway) of each listed feature across the label's classes.
label = 'winner'
cont_features = ['rated']

dic = {column: [] for column in ('Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic')}

for feature in cont_features:
    # One array of feature values per distinct value of the label.
    samples = [df.loc[df[label] == cls, feature].values for cls in df[label].unique()]
    statistic, pval = f_oneway(*samples)

    # The tuple is ordered exactly like the keys of dic.
    for key, entry in zip(dic, (label, feature, pval, pval<0.05, statistic)):
        dic[key].append(entry)

pd.DataFrame(dic)

###### The "rated" feature isn't a good predictor of the winner - because the p-value is greater than 0.05

#### turns

In [None]:
df['turns'].describe()

In [None]:
sns.boxplot(data=df, x='turns', orient="h")
plt.show()

In [None]:
df_turns = pd.DataFrame({"turns": df['turns']},columns=["turns"])
df_turns.plot.hist(alpha=0.3, bins=15,color='turquoise');

In [None]:
# One-way ANOVA (scipy's f_oneway) of each listed feature across the label's classes.
label = 'winner'
cont_features = ['turns']

dic = {column: [] for column in ('Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic')}

for feature in cont_features:
    # One array of feature values per distinct value of the label.
    samples = [df.loc[df[label] == cls, feature].values for cls in df[label].unique()]
    statistic, pval = f_oneway(*samples)

    # The tuple is ordered exactly like the keys of dic.
    for key, entry in zip(dic, (label, feature, pval, pval<0.05, statistic)):
        dic[key].append(entry)

pd.DataFrame(dic)

###### The "turns" feature is a good predictor of the winner - because the p-value is under 0.05

#### rating_difference

In [None]:
df.rating_difference.describe()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(df['rating_difference'],alpha=0.6)
plt.title("Difference between white rating and black rating")
plt.xlabel('difference')
plt.ylabel("count")
plt.show()

##### As we can see, in most cases games are relatively fair (both players have a similar rating), but there is a decent number of games where the discrepancy is relatively large.

In [None]:
dataframe = df
feature_1 = 'winner'
feature_2 = 'rating_difference'
plt.figure(figsize=(7,7))
sns.boxplot(x=feature_1, y=feature_2, data=dataframe)
plt.show()

##### In rating_difference parameter we see a slight tendency towards the white player

In [None]:
# One-way ANOVA (scipy's f_oneway) of each listed feature across the label's classes.
label = 'winner'
cont_features = ['rating_difference']

dic = {column: [] for column in ('Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic')}

for feature in cont_features:
    # One array of feature values per distinct value of the label.
    samples = [df.loc[df[label] == cls, feature].values for cls in df[label].unique()]
    statistic, pval = f_oneway(*samples)

    # The tuple is ordered exactly like the keys of dic.
    for key, entry in zip(dic, (label, feature, pval, pval<0.05, statistic)):
        dic[key].append(entry)

pd.DataFrame(dic)

##### The difference between ratings is indeed a good predictor of the winner - because the p-value is under 0.05

#### white and black rating

In [None]:
df[['white_rating','black_rating']].describe().T

In [None]:
plt.hist(df['white_rating'], bins=10, alpha=0.5, label='white')
plt.hist(df['black_rating'], bins=10, alpha=0.5, label='black')
plt.legend()
plt.show()

In [None]:
# One-way ANOVA (scipy's f_oneway) of each listed feature across the label's classes.
label = 'winner'
cont_features = ['white_rating']

dic = {column: [] for column in ('Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic')}

for feature in cont_features:
    # One array of feature values per distinct value of the label.
    samples = [df.loc[df[label] == cls, feature].values for cls in df[label].unique()]
    statistic, pval = f_oneway(*samples)

    # The tuple is ordered exactly like the keys of dic.
    for key, entry in zip(dic, (label, feature, pval, pval<0.05, statistic)):
        dic[key].append(entry)

pd.DataFrame(dic)

###### The "white_rating" feature is a good predictor of the winner - because the p-value is under 0.05

In [None]:
# One-way ANOVA (scipy's f_oneway) of each listed feature across the label's classes.
label = 'winner'
cont_features = ['black_rating']

dic = {column: [] for column in ('Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic')}

for feature in cont_features:
    # One array of feature values per distinct value of the label.
    samples = [df.loc[df[label] == cls, feature].values for cls in df[label].unique()]
    statistic, pval = f_oneway(*samples)

    # The tuple is ordered exactly like the keys of dic.
    for key, entry in zip(dic, (label, feature, pval, pval<0.05, statistic)):
        dic[key].append(entry)

pd.DataFrame(dic)

###### The "black_rating" feature is a good predictor of the winner - because the p-value is under 0.05

#### victory_status

In [None]:
df['victory_status'].describe()

In [None]:
value_counts = df['victory_status'].value_counts()
value_counts_percent = value_counts / len(df) * 100
value_counts_percent.plot(kind='bar')
plt.title('Percentage of Value Counts')
plt.xlabel('Number of victory_status')
plt.ylabel('Percentage (%)')
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(6, 6)})
sns.countplot(data=df, x='victory_status', hue='winner', palette='inferno').set(title='Counts of Victory Status')
plt.xticks(rotation = 90)
plt.show()

#### opening_pref

In [None]:
df['opening_pref'].describe()

In [None]:
value_counts = df['opening_pref'].value_counts()
value_counts = value_counts.head(10)
value_counts_percent = value_counts / len(df) * 100
value_counts_percent.plot(kind='bar')
plt.title('Percentage of Value Counts')
plt.xlabel('Number of opening_pref')
plt.ylabel('Percentage (%)')
plt.show()

In [None]:
df[['opening_eco','opening_pref']].loc[df['opening_pref'] == 'French Defense'].head()

In [None]:
df[(df['opening_eco'] == 'C00')][['opening_eco','opening_pref']].drop_duplicates()

##### opening_eco and opening_pref (which is based on the opening name) are related, but the mapping between them is not one-to-one

#### time_control

In [None]:
df['time_control'].describe()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(df['time_control'],alpha=0.6)
plt.title("Distirution of time_control")
plt.xlabel('time_control')
plt.show()

In [None]:
dataframe = df
feature_1 = 'winner'
feature_2 = 'time_control'
plt.figure(figsize=(7,7))
sns.boxplot(x=feature_1, y=feature_2, data=dataframe)
plt.show()

In [None]:
# One-way ANOVA (scipy's f_oneway) of each listed feature across the label's classes.
label = 'winner'
cont_features = ['time_control']

dic = {column: [] for column in ('Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic')}

for feature in cont_features:
    # One array of feature values per distinct value of the label.
    samples = [df.loc[df[label] == cls, feature].values for cls in df[label].unique()]
    statistic, pval = f_oneway(*samples)

    # The tuple is ordered exactly like the keys of dic.
    for key, entry in zip(dic, (label, feature, pval, pval<0.05, statistic)):
        dic[key].append(entry)

pd.DataFrame(dic)

###### The "time_control" feature isn't a good predictor of the winner - because the p-value is greater than 0.05

#### winner

In [None]:
df['winner'].describe()

In [None]:
df['winner'].value_counts()

In [None]:
sns.set(rc={'figure.figsize':(6, 6)})
sns.countplot(data=df, x='winner', palette='inferno').set(title='Counts of winner')
plt.xticks(rotation = 90)
plt.show()

##### We can see that the data is balanced - white's number of wins is similar to black's number of wins

### Correlation analysis

In [None]:
grouped_df = df[['white_rating','black_rating', 'turns','rating_difference','time_control', 'winner']].groupby("winner").describe().T
grouped_df

In [None]:
df_corr = df_merge[['turns','opening_ply','victory_status_mate', 'victory_status_outoftime','victory_status_resign',
                     'white_rating', 'black_rating', 'rating_difference','time_control','winner_black', 'winner_white']]

corr_matrix = df_corr.corr()
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu')
plt.show()

It can be seen that the rating variable has an effect on the outcome of the game. The higher the player's rating, the greater the chance he has of winning.
As a result, the rating_difference variable also affects the outcome of the game. The greater the difference between the players' ratings, the higher the chance of the player with the higher rating to win.

In [None]:
# Check the relationship between the numeric features and the winner:
df_corr = df_merge[['turns','white_rating','black_rating', 'opening_ply','time_control', 'rating_difference', 'winner']]
# seaborn 0.11 deprecated the KDE `bw` argument in favour of `bw_method`;
# passing the old name warns and is rejected by later releases.
g = sns.pairplot(df_corr, hue ='winner', diag_kws={'bw_method': 0.2})

##### In this graph we can see how the different features divide the data into Black's victory / White's victory.

##### We can see that the rating_difference feature separates the data very well

## Modeling

### Features selection

###### We want to take the features that have low cardinality -
In machine learning, "cardinality" refers to the number of unique values in a feature or column of a data set. Features with high cardinality have a large number of unique values, making them more difficult to encode and process for many machine learning algorithms.

In [None]:
## Cardinality checking:

def check_cardinality(df):
    """Print, for every column of ``df``, how many distinct values it holds."""
    for name in df:
        n_distinct = df[name].nunique()
        print(f"The cardinality of the feature '{name}' is: {n_distinct}")
check_cardinality(df)

The features we will use: 'rated', 'victory_status', 'turns', 'white_rating', 'black_rating', 'opening_ply', 'rating_difference'

In [None]:
low_card_df = df[['rated','victory_status',
                   'turns','white_rating', 
                   'black_rating', 'opening_ply', 
                   'rating_difference']].copy()

#Process categorical features
low_card_df['rated'] = low_card_df['rated'].map({False: 0, True:1})
low_card_df = pd.get_dummies(low_card_df)

### Explanation of the indicators

#####  Recall, precision, and F1 score are three commonly used metrics to evaluate the performance of a binary classifier.
Recall - Recall is the proportion of positive instances that are correctly identified by the classifier. It measures the ability of the classifier to find all positive instances. Recall is calculated as TP / (TP + FN).

Precision - Precision is the proportion of instances predicted as positive that are actually positive. It measures the ability of the classifier to avoid false positives. Precision is calculated as TP / (TP + FP).

F1 Score - is the harmonic mean of precision and recall. It provides a single value that summarizes the precision and recall of the classifier. The F1 score is calculated as 2 * (Precision * Recall) / (Precision + Recall). A high F1 score indicates that the classifier has a good balance of precision and recall.

### Decision Trees

In [None]:
X = low_card_df
y = df['winner']

# Split the data into training and test sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shallow tree: at most 3 levels, and a node needs 100 samples to be split
dtc = tree.DecisionTreeClassifier(criterion="gini", max_depth=3, min_samples_split=100)

# fit the classifier to the training data and predict the held-out games
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

plt.figure(figsize=(200,100))
# plot_tree expects class names in the estimator's own (sorted) class order.
# Taking them from dtc.classes_ avoids the hard-coded ['white', 'black'],
# which labelled every leaf with the opposite colour.
plot_tree(dtc, feature_names=list(X_train.columns), class_names=list(dtc.classes_), filled=True, rounded=True);
plt.show()

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test,y_pred)
plt.show()

# Metrics for the depth-limited tree (dtc) whose confusion matrix is drawn above.
print(classification_report(y_true=y_test, y_pred=y_pred))

# Baseline for comparison: a tree with default (unconstrained) settings.
# This is a different model from dtc, so its numbers must not be read as dtc's.
tree_clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
print(classification_report(y_true=y_test, y_pred=tree_clf.predict(X_test)))

##### We saw that out of 1954 observations, the model was able to correctly predict 1522 observations - 78%

### Logistic Regression

We want to see whether the first 5 moves can indicate the winner of the game, so we now select the relevant columns.

In [None]:
df_rel_col = df_dummies[[col for col in df_dummies.columns if col.startswith('move')]]

In [None]:
# 'moves' (the raw move string) also matches the 'move' prefix filter; remove it.
# Rebind instead of dropping in place, because df_rel_col is a column selection
# of df_dummies and in-place edits on it can trigger pandas' copy warning.
df_rel_col = df_rel_col.drop(columns=['moves'])

In [None]:
# fillna returns a new frame; keep it, otherwise the call has no effect.
df_rel_col = df_rel_col.fillna(0)

In [None]:
X1 = df_rel_col
y1 = df['winner']

In [None]:
# Stratify on this split's own target (y1) rather than on `y` from the decision-tree cell.
X_train1,X_test1,y_train1,y_test1 = train_test_split(X1,y1,test_size=1/3,random_state=42, stratify=y1)

In [None]:
log_random_state = None
# Fit on the one-hot encoded first moves; max_iter is raised to 500 for the solver.
log_clf = LogisticRegression(random_state=log_random_state,max_iter=500).fit(X_train1, y_train1)
print(classification_report(y_true=y_test1, y_pred=log_clf.predict(X_test1)))
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# draws the same plot from a fitted estimator.
ConfusionMatrixDisplay.from_estimator(log_clf, X_test1, y_test1)
plt.show()

### KMeans

In [None]:
df_clusters = df[['turns','victory_status','winner','increment_code','white_rating','black_rating','opening_pref', 'time_control','rating_difference','moves','opening_eco','opening_name','opening_ply']]

In [None]:
# Take an explicit copy: new columns ('op_name', 'op_eco', 'group') are assigned
# to this frame later, which should not happen on a selection of another frame.
clusters = df_clusters[['turns','rating_difference','opening_eco','opening_pref','opening_ply','time_control']].copy()

In [None]:
encoder = CountEncoder()

clusters[['op_name','op_eco']] = encoder.fit_transform(clusters[['opening_pref','opening_eco']])

In [None]:
scaler = StandardScaler()

features = ['turns','op_name','op_eco','opening_ply','rating_difference','time_control']
X = scaler.fit_transform(clusters[features])
X_processed = pd.DataFrame(X, columns = features)

##### choose the optimum K

In [None]:
kmeans = KMeans(random_state=0)
# Compute cluster centers and predict cluster indices
visualizer = KElbowVisualizer(kmeans, k=(2,12))
visualizer.fit(X_processed)        # Fit the data to the visualizer
visualizer.show()

##### PCA

In [None]:
pca = PCA(n_components=3,random_state=0)
pca_df = pd.DataFrame(pca.fit_transform(X_processed), columns = ['p1','p2','p3'])
   
kmeans = KMeans(n_clusters=5,random_state=0)

# Compute cluster centers and predict cluster indices

X_clustered = kmeans.fit_predict(pca_df)

In [None]:
np.unique(X_clustered)

In [None]:
clusters.describe().T

In [None]:
clusters['group'] = X_clustered
clusters['group'].value_counts()

In [None]:
# One sub-frame per cluster label, 0 through 4.
cluster0, cluster1, cluster2, cluster3, cluster4 = (
    clusters[clusters['group'] == group_id] for group_id in range(5)
)

In [None]:
cluster0.describe().T

##### those opening eco are common like sicilian defence:

In [None]:
cluster0['opening_eco'].value_counts()[:5]

##### 23% of the observations are in cluster0 
We can see that this group is characterized by a very high number of turns, and a very large rating difference compared to the other groups. Op_name is higher than the other groups, which shows preference for more popular openings.

In [None]:
cluster1.describe().T

In [None]:
cluster1['opening_eco'].value_counts()[:5]

##### 34% of the observations are in cluster1- the biggest group
This group does not seem to have any unique characteristics. Apart from a high value in rating difference. 

In [None]:
cluster2.describe().T

In [None]:
cluster2['opening_eco'].value_counts()[:5]

##### 25% of the observations are in cluster2 
The most represented ECO code here is A00, which is an uncommon opening
based on https://www.chessgames.com/perl/chessopening?eco=b00

In [None]:
cluster3.describe().T

In [None]:
cluster3['opening_eco'].value_counts()[:5]

##### 17% of the observations are in cluster3
This group is characterized by a very low rating difference, which means that the games in this group are very fair

In [None]:
cluster4.describe().T

##### 1% of the observations are in cluster4 - the smallest group
This group is characterized by a very low number of turns, and a very high time control

## Conclusion

After performing the data manipulations described in the preparation phase, we used two models for prediction and another model for clustering. For the decision tree, 20% of the data is test data and 80% is training data; for the logistic regression, one third of the data is held out for testing.
The first model: a decision tree - was run with different depths and it is clear that its best performance is with a depth of 3 and the parameters for the division are the ranking differences and the number of turns. Index f1 = 70%.
The second model: logistic regression - with the help of this model we tried to predict with the help of the first 5 moves whether each of them can in a certain way predict the winner of the game.
The explained variable: the winner of the game
The explanatory variables: the first 5 moves (each separately)
With the help of this model we reached a score of f1 = 55% and therefore it is not possible to rely on this model in predicting the winner and it is even considered like flipping a coin.
The third model: k-means - the goal is division into similar groups. In the initial phase, we normalized the data and performed dimensionality reduction before dividing the groups.
From this model we noticed that a division was made into 5 groups. There is a central group whose size is 34%. There is another group of players who are not experienced - it is evident that they use unfamiliar moves. Group number 3 is characterized by fair games.
Our personal recommendation is to deepen the method of determining the rating of the players because this is the most prominent variable chosen for prediction.