# Naive Bayes
---
### Why is Naive Bayes naive?
The underlying assumption is that the predictors are conditionally independent of each other given the class. This rarely holds exactly in real data, hence "naive".
### Bayes' Theorem
$P(C|x) = \frac{(P(C) * P(x|C))}{P(x)}$

### Gaussian Naive Bayes - adaptation for continuous attributes
We can use frequencies to calculate the probabilities of categorical attributes. For continuous attributes, we instead calculate the mean $\mu$ and variance $\sigma^2$ of x within each class and evaluate the Gaussian density:

$P(x|C) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}$

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OrdinalEncoder

import plotly.express as px
import plotly.graph_objects as go

from sklearn.naive_bayes import GaussianNB, CategoricalNB, BernoulliNB

In [2]:
# Load the games CSV into `df`, the DataFrame every later cell reads from.
games_csv = '../data/games.csv'
df = pd.read_csv(games_csv, encoding='utf-8')

In [3]:
# Derived columns used as predictors/targets by the model cells below.

# White's rating minus black's rating.
df['rating_difference'] = df['white_rating'].sub(df['black_rating'])

# Binary target: 1 when `winner` is 'white', 0 for every other value.
df['white_win'] = df['winner'].eq('white').astype('int64')

# Three-class target: 1 for 'white', 0 for 'draw', -1 for every other value.
# Values missing from the mapping become NaN and are then filled with -1.
df['match_outcome'] = (
    df['winner']
    .map({'white': 1, 'draw': 0})
    .fillna(-1)
    .astype('int64')
)

In [4]:
def func(X, y, typ, test_size=0.2, random_state=0):
    """Split the data, fit a Naive Bayes model and print an evaluation summary.

    Parameters
    ----------
    X : array-like or DataFrame
        Predictor matrix, passed unchanged to ``train_test_split``.
    y : array-like or Series
        Target labels.
    typ : estimator instance
        An unfitted estimator (e.g. ``GaussianNB()``); it is fitted in place.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 0
        Seed for the train/test split, so results are reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, clf, pred_labels)`` where ``clf``
        is the fitted estimator and ``pred_labels`` are its predictions on
        ``X_test``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = typ
    clf = model.fit(X_train, y_train)

    pred_labels = model.predict(X_test)

    print('Classes: ', clf.classes_)
    # Pick the prior attribute by what the fitted model actually exposes
    # rather than by comparing repr strings: a GaussianNB built with
    # non-default parameters no longer prints as 'GaussianNB()'.
    if hasattr(clf, 'class_prior_'):
        print('Class Priors: ', clf.class_prior_)
    elif hasattr(clf, 'class_log_prior_'):
        print('Class Log Priors: ', clf.class_log_prior_)

    print('--------------------------------------------------------')
    score = model.score(X_test, y_test)
    print('Accuracy Score: ', score)
    print('--------------------------------------------------------')

    print(classification_report(y_test, pred_labels))

    return X_train, X_test, y_train, y_test, clf, pred_labels

### Gaussian NB with 2 independent variables

In [5]:
# Two continuous predictors with the binary white-win target.
gaussian_features = ['rating_difference', 'turns']
X = df.loc[:, gaussian_features]
y = df.loc[:, 'white_win']

# Fit and report; the returned split and fitted model remain in scope for
# the decision-surface plot in the next cell.
(X_train, X_test, y_train, y_test,
 clf, pred_labels) = func(X, y, GaussianNB())

Classes:  [0 1]
Class Priors:  [0.50062321 0.49937679]
--------------------------------------------------------
Accuracy Score:  0.6625124626121635
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.66      0.69      0.67      2024
           1       0.67      0.63      0.65      1988

    accuracy                           0.66      4012
   macro avg       0.66      0.66      0.66      4012
weighted avg       0.66      0.66      0.66      4012



In [6]:
# Plot the fitted model's P(class 1) surface with the held-out points on top.

# Grid step (in feature units) and padding added around the data range.
mesh_size = 5
margin = 1

# Grid extents come straight from the feature columns; min()/max() already
# skip NaN, so no imputation is needed before taking them.
x_min, x_max = X.iloc[:, 0].min() - margin, X.iloc[:, 0].max() + margin
y_min, y_max = X.iloc[:, 1].min() - margin, X.iloc[:, 1].max() + margin
xrange = np.arange(x_min, x_max, mesh_size)
yrange = np.arange(y_min, y_max, mesh_size)
xx, yy = np.meshgrid(xrange, yrange)

# Score every grid point. The model was fitted on a DataFrame, so the grid is
# wrapped in a DataFrame with the same column names to keep scikit-learn's
# feature-name validation quiet.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X.columns)
Z = clf.predict_proba(grid)[:, 1]
Z = Z.reshape(xx.shape)

# One scatter trace per actual class, both drawn from the test split.
trace_specs = [
    [X_test, y_test, 0, 'Test', 'red'],
    [X_test, y_test, 1, 'Test', 'blue']
]

# Loop variables are named so they do not shadow the notebook-level X / y.
fig = go.Figure(data=[
    go.Scatter(
        x=features[labels == label].iloc[:, 0],
        y=features[labels == label].iloc[:, 1],
        name=f'{split} data, Actual Class: {label}',
        mode='markers', marker_color=marker
    )
    for features, labels, label, split, marker in trace_specs
])

fig.update_traces(marker_size=2, marker_line_width=0)

# Fixed viewing window, with each axis labelled by its feature column.
fig.update_xaxes(range=[-1600, 1500], title_text=X.columns[0])
fig.update_yaxes(range=[0, 345], title_text=X.columns[1])

fig.update_layout(title_text="Decision Boundary for Naive Bayes Model",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))

# Predicted probability of class 1 over the grid.
fig.add_trace(
    go.Contour(
        x=xrange,
        y=yrange,
        z=Z,
        showscale=True,
        colorscale='magma',
        opacity=1,
        name='Score',
        hoverinfo='skip'
    )
)

fig.show()



### Gaussian NB with 3 class labels and 2 independent variables

In [7]:
# Same two continuous predictors, now with the three-valued match outcome
# (-1 / 0 / 1) as the target.
X = df.loc[:, ['rating_difference', 'turns']]
y = df['match_outcome'].to_numpy()

(X_train, X_test, y_train, y_test,
 clf, pred_labels) = func(X, y, GaussianNB())

Classes:  [-1  0  1]
Class Priors:  [0.45232457 0.04829864 0.49937679]
--------------------------------------------------------
Accuracy Score:  0.6311066799601196
--------------------------------------------------------
              precision    recall  f1-score   support

          -1       0.63      0.61      0.62      1849
           0       0.18      0.07      0.10       175
           1       0.64      0.70      0.67      1988

    accuracy                           0.63      4012
   macro avg       0.49      0.46      0.47      4012
weighted avg       0.62      0.63      0.62      4012



### Categorical NB with 2 independent variables

In [8]:
# Two categorical predictors with the binary white-win target.
X = df[['opening_eco', 'white_id']]
y = df['white_win'].values

# CategoricalNB expects non-negative integer category codes, so each column
# is ordinal-encoded first.
enc = OrdinalEncoder()
X = enc.fit_transform(X)

# CategoricalNB otherwise infers the number of categories per feature from
# the training split alone; a code that only occurs in the test split and
# exceeds the training maximum would then fail at predict time. Passing the
# encoder's full category counts keeps every encoded value in range.
n_categories = [len(categories) for categories in enc.categories_]

X_train, X_test, y_train, y_test, clf, pred_labels = func(
    X, y, CategoricalNB(min_categories=n_categories)
)

Classes:  [0 1]
Class Log Priors:  [-0.69190154 -0.69439437]
--------------------------------------------------------
Accuracy Score:  0.6011964107676969
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.61      0.60      0.60      2024
           1       0.60      0.60      0.60      1988

    accuracy                           0.60      4012
   macro avg       0.60      0.60      0.60      4012
weighted avg       0.60      0.60      0.60      4012



### Bernoulli NB with 1 independent variable

In [9]:
# Single binary predictor (`rated`) with the binary white-win target.
# The column is cast to 0/1 integers and reshaped to the (n_samples, 1)
# matrix scikit-learn expects.
X = df['rated'].astype(int).values.reshape(-1, 1)
y = df['white_win'].values

# Use BernoulliNB so the model matches this section's heading; the cell
# previously fitted CategoricalNB and left the imported BernoulliNB unused.
X_train, X_test, y_train, y_test, clf, pred_labels = func(X, y, BernoulliNB())

Classes:  [0 1]
Class Log Priors:  [-0.69190154 -0.69439437]
--------------------------------------------------------
Accuracy Score:  0.5019940179461615
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.50      0.80      0.62      2024
           1       0.49      0.19      0.28      1988

    accuracy                           0.50      4012
   macro avg       0.50      0.50      0.45      4012
weighted avg       0.50      0.50      0.45      4012



### Mixed NB (continuous + categorical features; the continuous features are binned into quintiles and modelled with Categorical NB)

In [10]:
# Bin the two continuous features into quintiles so they can be fed to
# CategoricalNB alongside the genuinely categorical columns.
quintile_labels = ['bottom 20', 'lower 20', 'middle 20', 'upper 20', 'top 20']
df['rating_difference_qt'] = pd.qcut(df['rating_difference'], 5, labels=quintile_labels)
df['turns_qt'] = pd.qcut(df['turns'], 5, labels=quintile_labels)

X = df[['opening_eco', 'white_id', 'rating_difference_qt', 'turns_qt']]
y = df['white_win'].values

# CategoricalNB expects non-negative integer category codes.
enc = OrdinalEncoder()
X = enc.fit_transform(X)

# Give the model the full category count per feature so that codes present
# only in the test split cannot fall outside the range learned from the
# training split.
n_categories = [len(categories) for categories in enc.categories_]

X_train, X_test, y_train, y_test, clf, pred_labels = func(
    X, y, CategoricalNB(min_categories=n_categories)
)

Classes:  [0 1]
Class Log Priors:  [-0.69190154 -0.69439437]
--------------------------------------------------------
Accuracy Score:  0.6542871385842473
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.65      0.68      0.66      2024
           1       0.66      0.63      0.64      1988

    accuracy                           0.65      4012
   macro avg       0.65      0.65      0.65      4012
weighted avg       0.65      0.65      0.65      4012

