In [1]:
import json
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

#%%
# Load data
os.chdir("/Users/jcastro2/code/sports_betting")
data = []
for num in range(2016, 2022):
	f = open("data/{}/Team_data_{}.txt".format(num, num))
	new_data = json.load(f)
	data.append(new_data)

In [2]:
#%%
def convert_scalars_to_list(game_data):
    for key in game_data.keys():
        game_data[key] = [game_data[key]]
    return game_data

#%%
# Converting game infos to proper format then to pandas dataframe
for i, season_data in enumerate(data):
    for team1 in season_data:
        for team2 in season_data[team1]:
            if team2 == "team_info":
                continue
            game_data = season_data[team1][team2]
            game_data = convert_scalars_to_list(game_data)
            data[i][team1][team2] = pd.DataFrame.from_dict(game_data)
                                                  
#%%
# Delete 'team_info' key from game infos
for i, season_data in enumerate(data):
    for team1 in season_data:
        if 'team_info' in season_data[team1].keys():
            del season_data[team1]['team_info']

#%%
# Add team1 and team2 coluumns to every dataframe
for i, season_data in enumerate(data):
    for team1 in season_data:
        for team2 in season_data[team1]:
            season_data[team1][team2]['team1'] = team1
            season_data[team1][team2]['team2'] = team2

#%%
# Add every dataframe to final dataframe
df = pd.DataFrame()
for i, season_data in enumerate(data):
    for team1 in season_data:
        for team2 in season_data[team1]:
            df = pd.concat([df, season_data[team1][team2]])

In [3]:
#%%
# Convert string columns to values
mappings = {}
#home or away for influence of game
# spread for threshold
# margin to know who won
# attendance_percent to know if crowd energy says something about the game
columns = ['home_or_away', 'spread', "margin", "team1", "team2"]
final_df = df[columns]
home_or_away_encoder=LabelEncoder()
home_or_away_encoder.fit_transform(final_df.home_or_away)
team_encoder=LabelEncoder()
team_encoder.fit_transform(pd.concat([final_df.team1, final_df.team2], axis=0))
final_df.home_or_away = final_df.home_or_away.map(dict(zip(home_or_away_encoder.classes_, home_or_away_encoder.transform(home_or_away_encoder.classes_))))
final_df.team1 = final_df.team1.map(dict(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_))))
final_df.team2 = final_df.team2.map(dict(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_))))

    
final_df = final_df.dropna()
y = final_df['margin']
final_df = final_df.drop(columns="margin")

print(final_df)

    home_or_away  spread  team1  team2
0              0   -52.0      0     18
0              0   -47.0      0    131
0              0   -44.0      1      2
0              1   -44.0      2      1
0              0     3.0      2     15
..           ...     ...    ...    ...
0              0     3.0    267    189
0              1   -20.0    267    234
0              0     5.5    267    241
0              0    -4.0    268     45
0              0    28.5    269    129

[9718 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.home_or_away = final_df.home_or_away.map(dict(zip(home_or_away_encoder.classes_, home_or_away_encoder.transform(home_or_away_encoder.classes_))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.team1 = final_df.team1.map(dict(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-doc

In [4]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

DEGREE = 2

# Create output labels y
y_binary = y - final_df['spread']
y_binary = y_binary.map(lambda x: 1 if x > 0 else 0)

# Split data
X_train,X_test,y_train,y_test=train_test_split(final_df,y_binary,test_size=0.25,random_state=0)

# Prepare train data
x_train = X_train.to_numpy()
x_train_ = PolynomialFeatures(degree=DEGREE, include_bias=False).fit_transform(x_train)
model = LogisticRegression(max_iter=4000).fit(x_train_, y_train)

# Test model
x_test = X_test.to_numpy()
x_test_ = PolynomialFeatures(degree=DEGREE, include_bias=False).fit_transform(x_test)

In [6]:
log_model = LogisticRegression(max_iter=4000).fit(x_train_, y_train)

# Test model
predictions = log_model.predict(x_test_)

# Evaluate model
cnf_matrix = metrics.confusion_matrix(y_test, predictions)

print(classification_report(y_test,log_model.predict(x_test_)))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81      1179
           1       0.82      0.83      0.82      1251

    accuracy                           0.82      2430
   macro avg       0.82      0.82      0.82      2430
weighted avg       0.82      0.82      0.82      2430



In [11]:
svc_model = SVC(kernel='rbf').fit(x_train_, y_train)

# Test model
predictions = svc_model.predict(x_test_)

# Evaluate model
cnf_matrix = metrics.confusion_matrix(y_test, predictions)

print(classification_report(y_test,svc_model.predict(x_test_)))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81      1179
           1       0.82      0.84      0.83      1251

    accuracy                           0.82      2430
   macro avg       0.82      0.82      0.82      2430
weighted avg       0.82      0.82      0.82      2430



from sklearn.preprocessing import PolynomialFeatures


In [8]:
lin_svc_model = LinearSVC(C=0.1).fit(x_train_, y_train)

# Test model
predictions = lin_svc_model.predict(x_test_)

# Evaluate model
cnf_matrix = metrics.confusion_matrix(y_test, predictions)

print(classification_report(y_test,lin_svc_model.predict(x_test_)))

              precision    recall  f1-score   support

           0       0.67      0.80      0.73      1179
           1       0.77      0.63      0.70      1251

    accuracy                           0.72      2430
   macro avg       0.72      0.72      0.71      2430
weighted avg       0.72      0.72      0.71      2430





In [None]:
for row in X_test.iterrows():
    new_X_train = final_df.copy()
    home_or_away = row[1][0]
    spread = row[1][1]
    team1 = row[1][2]
    team2 = row[1][3]
    new_X_train = new_X_train[new_X_train['home_or_away']==home_or_away]
    new_X_train = new_X_train[new_X_train['spread']==spread]
    new_X_train = new_X_train[new_X_train['team1']==team1]
    new_X_train = new_X_train[new_X_train['team2']==team2]

In [19]:
test = np.array([[0, -2, 18, 101], [1, 5.5, 225, 152], [0, -1.5, 162, 139], [0, 6.5, 95, 98], [1, -8, 29, 187]])
test_ = PolynomialFeatures(degree=DEGREE, include_bias=False).fit_transform(test)
predictions1 = log_model.predict(test_)
predictions2 = log_model.predict(test_)
for nums in zip(zip(predictions1, predictions2), y_test):
    print(nums)

((1, 1), 0)
((0, 0), 1)
((1, 1), 1)
((0, 0), 0)
((1, 1), 1)
