In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import names
from nltk.metrics.scores import precision, recall
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
def team_features(team):
    """Build the binary feature dict that describes one team-season.

    Parameters
    ----------
    team : row-like object
        Must expose ``SEED``, ``W`` and ``G`` as attributes and ``"2P_O"`` /
        ``"3P_O"`` by key (a pandas row Series supports both).

    Returns
    -------
    dict
        ``"seed: 1"`` .. ``"seed: 16"`` one-hot booleans, followed by the 0/1
        flags ``"Win/Loss ratio"``, ``"2-point rate"`` and ``"3-point rate"``.
    """
    # Convert the seed once instead of on every loop iteration.
    seed = int(team.SEED)
    info = {f"seed: {i}": seed == i for i in range(1, 17)}

    losses = team.G - team.W
    # Same test as W / losses < 1.5 for any team with at least one loss, but
    # written without the division so an undefeated team (losses == 0) is
    # flagged 1 instead of dividing by zero.
    info["Win/Loss ratio"] = 0 if team.W < 1.5 * losses else 1
    info["2-point rate"] = 0 if team["2P_O"] < 55 else 1
    info["3-point rate"] = 0 if int(team["3P_O"]) < 35 else 1

    return info

In [None]:
# Load the team-season table and keep only rows with every column populated.
df = pd.read_csv("cbb.csv")
df = df.dropna()

# Coerce the numeric columns used downstream to float, stripping any literal "$"
# first. regex=False is required: as a regular expression "$" is the
# end-of-string anchor, so the previous regex=True call removed nothing.
for numeric_col in ['SEED', '3P_O', '2P_O', 'W']:
    df[numeric_col] = (
        df[numeric_col].astype(str).str.replace('$', '', regex=False).astype('float')
    )

In [None]:
# Pair every team's feature dict with a two-class tournament outcome label.
FIRST_ROUND_EXITS = ["R68", "R64"]

data = []
for i in range(len(df)):
    curr = df.iloc[i]
    # Skip rows whose postseason result or seed is the "NA" placeholder string.
    if curr.POSTSEASON == "NA" or curr.SEED == "NA":
        continue
    if curr.POSTSEASON in FIRST_ROUND_EXITS:
        oneWin = "First round out"
    else:
        oneWin = "Past first round"
    data.append((team_features(curr), oneWin))

In [None]:
# Hold out 10% of the labelled samples; the fixed random_state keeps the split
# repeatable between runs.
train_set, test_set = train_test_split(
    data,
    random_state=43,
    test_size=0.1,
)
# Show one (feature dict, label) training sample as a sanity check.
print(train_set[0])

# Fit NLTK's Naive Bayes classifier on the (feature dict, label) pairs.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Split the held-out (feature dict, label) pairs into two parallel lists.
test_features = [features for features, _ in test_set]
y_true = [label for _, label in test_set]
# Predict a label for every held-out feature dict.
y_predict = list(map(classifier.classify, test_features))

In [None]:
# Precision, recall, F-score and support of the NLTK classifier on the held-out set.
scores = precision_recall_fscore_support(y_true, y_predict)
p, r, f, s = scores
print(*scores)

# List the 15 features the classifier reports as most informative.
classifier.show_most_informative_features(15)

In [None]:
def _to_matrix(samples):
    """Split (feature dict, label) pairs into a 2D feature list and a label list."""
    rows = [list(features.values()) for features, _ in samples]
    labels = [label for _, label in samples]
    return rows, labels


# Fresh 80/20 split, with its own seed, for the scikit-learn model.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=36)

# GaussianNB needs plain 2D feature rows rather than feature dicts.
X_train, y_train = _to_matrix(train_set)
X_test, y_test = _to_matrix(test_set)

# Fit on the training rows, then predict the held-out rows.
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Precision, recall and F-score of the GaussianNB predictions
# (support is unpacked into `s` but not printed).
p, r, f, s = precision_recall_fscore_support(y_true=y_test, y_pred=y_pred)
print(p, r, f)

In [None]:
# Wins per loss for every row, stored as a new column on df.
losses = df.G - df.W
df['win/loss'] = df.W.div(losses)

# First ten rows with seed 16, in the frame's existing row order (not ranked).
df1 = df.loc[df['SEED'].eq(16)]
top_10 = df1.head(10)
top_10

In [None]:
# Two stacked bar charts for the rows in top_10. Each chart is drawn on the Axes
# returned by plt.subplots rather than re-selecting panels with plt.subplot,
# so the figure no longer depends on pyplot's implicit "current axes" state.
fig, (ax_three_pt, ax_ratio) = plt.subplots(2, 1, figsize=(15, 15))

sns.barplot(data=top_10, x="TEAM", y="3P_O", ax=ax_three_pt)
ax_three_pt.set_title("3P_O by team")

sns.barplot(data=top_10, x="TEAM", y="win/loss", ax=ax_ratio)
ax_ratio.set_title("win/loss by team")

fig.tight_layout()

In [None]:
# Count the top_10 rows per postseason result and per year. After the 'count'
# aggregation the 'W' column holds a row count, not a number of wins.
a = top_10.groupby('POSTSEASON').agg({'W':'count'}).reset_index()
b = top_10.groupby('YEAR').agg({'W':'count'}).reset_index()

# Draw on the Axes returned by plt.subplots instead of re-selecting panels with
# plt.subplot, and label the y axis as a count.
fig, (ax_postseason, ax_year) = plt.subplots(2, 1, figsize=(15, 15))

sns.barplot(data=a, x="POSTSEASON", y="W", ax=ax_postseason)
ax_postseason.set(title="Rows per postseason result", ylabel="Number of rows")

sns.barplot(data=b, x="YEAR", y="W", ax=ax_year)
ax_year.set(title="Rows per year", ylabel="Number of rows")

fig.tight_layout()

In [None]:
# 1.4285714285714286 is the float produced by a 10:7 wins-to-losses ratio
# (for example 20 wins and 14 losses).
TARGET_WIN_LOSS = 1.4285714285714286

# First ten rows whose 'win/loss' value equals that ratio exactly.
df2 = df.loc[df['win/loss'].eq(TARGET_WIN_LOSS)]
top_10_2 = df2.head(10)
top_10_2

In [None]:
# Count the top_10_2 rows per postseason result and per year. After the 'count'
# aggregation the 'W' column holds a row count, not a number of wins.
a1 = top_10_2.groupby('POSTSEASON').agg({'W':'count'}).reset_index()
b1 = top_10_2.groupby('YEAR').agg({'W':'count'}).reset_index()

# Draw on the Axes returned by plt.subplots instead of re-selecting panels with
# plt.subplot, and label the y axis as a count.
fig, (ax_postseason, ax_year) = plt.subplots(2, 1, figsize=(15, 15))

sns.barplot(data=a1, x="POSTSEASON", y="W", ax=ax_postseason)
ax_postseason.set(title="Rows per postseason result", ylabel="Number of rows")

sns.barplot(data=b1, x="YEAR", y="W", ax=ax_year)
ax_year.set(title="Rows per year", ylabel="Number of rows")

fig.tight_layout()