In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from datetime import datetime

In [2]:
df=pd.read_csv('Combine.csv')
df['Date'] = df['Date'].apply(lambda x: datetime.strptime(x, '%d-%b-%y'))
# Convert win/loss outcomes to binary labels (1 = Team B, 0 = Team A)
df['Result'] = np.where(df['Result'] == 'Team B', 1, 0)
df['Time_48'] = df['Time_48'].astype(str)
df['Time_48']= df['Time_48'].str.replace('0 days', '')
df['Time_48']= df['Time_48'].str.replace('.000000000', '')
def time_to_seconds(time_str):
    h, m, s = map(int, time_str.split(':'))
    return (h * 3600) + (m * 60) + s

df['Time_Sec']=df['Time_48'].apply(time_to_seconds)
### Creating Total Rebound
df['Team_A_RB']= df['Team_A_teamORB']+df['Team_A_teamDRB']+df['Team_A_ORB']+df['Team_A_DRB']
df['Team_B_RB']= df['Team_B_teamORB']+df['Team_B_teamDRB']+df['Team_B_ORB']+df['Team_B_DRB']

### NEW METRIC Points Attempted
df['Team_A_PTA']=(df['Team_A_FGA']-df['Team_A_3PA'])*2+df['Team_A_3PA']*3+df['Team_A_FTA']
df['Team_B_PTA']=(df['Team_B_FGA']-df['Team_B_3PA'])*2+df['Team_B_3PA']*3+df['Team_B_FTA']

### NEW METRIC 3P as % of all points
df['Team_A_3P%_All']=(df['Team_A_3P']*3)/(df['Team_A_Score'])
df['Team_B_3P%_All']=(df['Team_B_3P']*3)/(df['Team_B_Score'])
### Creating Difference of Stats (AST,BLK,TO,RBD)
df['diff_PT']=(df['Team_B_Score']-df['Team_A_Score'])
df['diff_AST']=(df['Team_B_AST']-df['Team_A_AST'])
df['diff_RB']=(df['Team_B_RB']-df['Team_A_RB'])
df['diff_ORB']=((df['Team_B_ORB'])+(df['Team_B_teamORB']))-((df['Team_A_ORB'])+(df['Team_A_teamORB']))
df['diff_DRB']=((df['Team_B_DRB'])+(df['Team_B_teamDRB']))-((df['Team_A_DRB'])+(df['Team_A_teamDRB']))
df['diff_ST']=(df['Team_B_ST']-df['Team_A_ST'])
df['diff_BLK']=(df['Team_B_BLK']-df['Team_A_BLK'])
df['diff_TO']=(df['Team_B_TO']-df['Team_A_TO'])
df['diff_PF']=(df['Team_B_PF']-df['Team_A_PF'])
df['diff_Tech']=(df['Team_B_Tech']-df['Team_A_Tech'])
df['diff_PTA']=(df['Team_B_PTA']-df['Team_A_PTA'])
df['diff_3P%_All']=df['Team_B_3P%_All']-df['Team_A_3P%_All']
### (0 = Away Team leading | 1= Home Team leading)
df['Home_Leading'] = np.where(df['Team_A_Score']<df['Team_B_Score'], 1, 0)

regular_season=df[df['Date']<='10-Apr-22']
playin_playoff=df[df['Date']>'10-Apr-22']

In [10]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Selecting Columns from data
columns=['Time_Sec','diff_PT', 'diff_AST',
       'diff_RB', 'diff_ORB', 'diff_DRB', 'diff_ST', 'diff_BLK', 'diff_TO',
       'diff_PF', 'diff_Tech','diff_PTA','Home_Leading']
# Split the data into features and labels
X = regular_season.loc[:, columns]
y = regular_season['Result']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# save the model to a file
with open('RFCmodel.pkl', 'wb') as file:
    pickle.dump(clf, file)

  if _joblib.__version__ >= LooseVersion('0.12'):


In [11]:
# Generate predictions for the test set
predictions = clf.predict(X_test)

# Get the feature names
feature_names = X.columns

# Zip the feature names and feature importance scores together
feature_importances = dict(zip(feature_names, clf.feature_importances_))

# Convert to df 
feature_importances = pd.DataFrame.from_dict(feature_importances, orient='index')

# Print the feature importance scores
feature_importances.sort_values(by=0, ascending=True)

  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


Unnamed: 0,0
diff_Tech,0.026348
diff_RB,0.052138
diff_ORB,0.053962
diff_TO,0.060319
diff_ST,0.061999
diff_BLK,0.066338
diff_PTA,0.069235
diff_DRB,0.069517
diff_PF,0.071924
Home_Leading,0.078323


In [12]:
result=X_test.copy()
result['pred']=predictions
result['actual']=y_test

In [13]:
result=result.loc[:, ['pred','actual']]
result

Unnamed: 0,pred,actual
530673,0,0
546550,0,0
10177,0,0
79700,0,0
90303,0,0
270644,0,0
516866,1,1
737752,1,1
127184,1,1
535589,0,0


In [14]:
from sklearn.metrics import confusion_matrix

#           Predicted
#           0    1
# Actual 0  TN   FP
#        1  FN   TP

cm = confusion_matrix(result['actual'],result['pred'])
print(cm)

[[71563  2216]
 [ 1790 88984]]


In [15]:
# Calculate the accuracy
accuracy = (cm[0, 0] + cm[1, 1]) / len(result['actual'])
print('Accuracy:', accuracy)
# Calculate the precision
precision = cm[1, 1] / (cm[1, 1] + cm[0, 1])
print('Precision:', precision)
# Calculate the recall
recall = cm[1, 1] / (cm[1, 1] + cm[1, 0])
print('Recall:', recall)
# Calculate the f1-score
f1 = 2 * (precision * recall)
print('F1:', f1)

Accuracy: 0.975655260007414
Precision: 0.975701754385965
Recall: 0.9802806971159143
F1: 1.9129231919333887
