In [19]:
import json
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

#%%
# Load data
os.chdir("/Users/jcastro2/code/ml/sports_betting")
data = []
for num in range(2016, 2022):
	f = open("data/{}/Team_data_{}.txt".format(num, num))
	new_data = json.load(f)
	data.append(new_data)

In [20]:
#%%
def convert_scalars_to_list(game_data):
    for key in game_data.keys():
        game_data[key] = [game_data[key]]
    return game_data

#%%
# Converting game infos to proper format then to pandas dataframe
for i, season_data in enumerate(data):
    for team1 in season_data:
        for team2 in season_data[team1]:
            if team2 == "team_info":
                continue
            game_data = season_data[team1][team2]
            game_data = convert_scalars_to_list(game_data)
            data[i][team1][team2] = pd.DataFrame.from_dict(game_data)
                                                  
#%%
# Delete 'team_info' key from game infos
for i, season_data in enumerate(data):
    for team1 in season_data:
        if 'team_info' in season_data[team1].keys():
            del season_data[team1]['team_info']

#%%
# Add team1 and team2 coluumns to every dataframe
for i, season_data in enumerate(data):
    for team1 in season_data:
        for team2 in season_data[team1]:
            season_data[team1][team2]['team1'] = team1
            season_data[team1][team2]['team2'] = team2

#%%
# Add every dataframe to final dataframe
df = pd.DataFrame()
for i, season_data in enumerate(data):
    for team1 in season_data:
        for team2 in season_data[team1]:
            df = pd.concat([df, season_data[team1][team2]])

In [21]:
#%%
# Convert string columns to values
mappings = {}
#home or away for influence of game
# spread for threshold
# margin to know who won
# attendance_percent to know if crowd energy says something about the game
columns = ['home_or_away', 'spread', "margin", "team1", "team2"]
final_df = df[columns]
home_or_away_encoder=LabelEncoder()
home_or_away_encoder.fit_transform(final_df.home_or_away)
team_encoder=LabelEncoder()
team_encoder.fit_transform(pd.concat([final_df.team1, final_df.team2], axis=0))
final_df.home_or_away = final_df.home_or_away.map(dict(zip(home_or_away_encoder.classes_, home_or_away_encoder.transform(home_or_away_encoder.classes_))))
final_df.team1 = final_df.team1.map(dict(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_))))
final_df.team2 = final_df.team2.map(dict(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_))))

    
final_df = final_df.dropna()
y = final_df['margin']
final_df = final_df.drop(columns="margin")

print(final_df)

    home_or_away  spread  team1  team2
0              0   -52.0      0     18
0              0   -47.0      0    131
0              0   -44.0      1      2
0              1   -44.0      2      1
0              0     3.0      2     15
..           ...     ...    ...    ...
0              0     3.0    267    189
0              1   -20.0    267    234
0              0     5.5    267    241
0              0    -4.0    268     45
0              0    28.5    269    129

[9718 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.home_or_away = final_df.home_or_away.map(dict(zip(home_or_away_encoder.classes_, home_or_away_encoder.transform(home_or_away_encoder.classes_))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.team1 = final_df.team1.map(dict(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-doc

In [22]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

DEGREE = 2

# Create output labels y
y_binary = y - final_df['spread']
y_binary = y_binary.map(lambda x: 1 if x > 0 else 0)

# Split data
X_train,X_test,y_train,y_test=train_test_split(final_df,y_binary,test_size=0.25,random_state=0)

# Prepare train data
x_train = X_train.to_numpy()
x_train_ = PolynomialFeatures(degree=DEGREE, include_bias=False).fit_transform(x_train)
model = LogisticRegression(max_iter=4000).fit(x_train_, y_train)

# Test model
x_test = X_test.to_numpy()
x_test_ = PolynomialFeatures(degree=DEGREE, include_bias=False).fit_transform(x_test)

In [23]:
model_log = LogisticRegression(max_iter=4000).fit(x_train_, y_train)

# Test model
predictions = model_log.predict(x_test_)

# Evaluate model
cnf_matrix = metrics.confusion_matrix(y_test, predictions)

print(classification_report(y_test,model_log.predict(x_test_)))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81      1179
           1       0.82      0.83      0.82      1251

    accuracy                           0.82      2430
   macro avg       0.82      0.82      0.82      2430
weighted avg       0.82      0.82      0.82      2430



In [24]:
model_svc = SVC(kernel='rbf').fit(x_train_, y_train)

# Test model
predictions = model_svc.predict(x_test_)

# Evaluate model
cnf_matrix = metrics.confusion_matrix(y_test, predictions)

print(classification_report(y_test,model_svc.predict(x_test_)))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81      1179
           1       0.82      0.84      0.83      1251

    accuracy                           0.82      2430
   macro avg       0.82      0.82      0.82      2430
weighted avg       0.82      0.82      0.82      2430



from sklearn.preprocessing import PolynomialFeatures


In [62]:
model_linear_svc = LinearSVC(C=0.1).fit(x_train_, y_train)

# Test model
predictions = model_linear_svc.predict(x_test_)

# Evaluate model
cnf_matrix = metrics.confusion_matrix(y_test, predictions)

print(classification_report(y_test,model_linear_svc.predict(x_test_)))

              precision    recall  f1-score   support

           0       0.67      0.88      0.76      1179
           1       0.84      0.59      0.69      1251

    accuracy                           0.73      2430
   macro avg       0.76      0.74      0.73      2430
weighted avg       0.76      0.73      0.73      2430





In [None]:
import io
import uvicorn
import numpy as np
import nest_asyncio
from enum import Enum
from fastapi import FastAPI, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse

# Assign an instance of the FastAPI class to the variable "app".
# You will interact with your api using this instance.
app = FastAPI(title='Deploying a ML Model with FastAPI')

class Model(str, Enum):
    model1 = 'Logistic Regression Classifier'
    model2= 'SVM Classifier'
    model3 = 'Linear SVM Classifier'
    
    
class HomeOrAway(str, Enum):
    home = 'home'
    away= 'away'

# By using @app.get("/") you are allowing the GET method to work for the / endpoint.
@app.get("/")
def home():
    return "Welcome to the sports betting predictor. Now head over to http://localhost:8000/docs."

# This endpoint handles all the logic necessary for the object detection to work.
# It requires the desired model and the image in which to perform object detection.
@app.post("/predict") 
def prediction(model: Model, home_or_away: HomeOrAway, spread: float, team1: str, team2: str):

    # Encode data
    df = pd.DataFrame({'home_or_away':[home_or_away], 'spread':[spread], 'team1':[team1], 'team2':[team2]})
    df.home_or_away = df.home_or_away.map(dict(zip(home_or_away_encoder.classes_, home_or_away_encoder.transform(home_or_away_encoder.classes_))))
    df.team1 = df.team1.map(dict(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_))))
    df.team2 = df.team2.map(dict(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_))))

    x_predict = df.to_numpy()
    x_predict_ = PolynomialFeatures(degree=DEGREE, include_bias=False).fit_transform(x_predict)
    
    # 3. RUN PREDICTION MODEL
    if model == 'Logistic Regression Classifier':
        prediction = model_log.predict(x_predict_)
    elif model == 'SVM Classifier':
        prediction = model_svc.predict(x_predict_)
    elif model == 'Linear SVM Classifier':
        prediction = model_linear_svc.predict(x_predict_)
    
    print(prediction)
    if prediction == 0:
        prediction = team_encoder.inverse_transform([i for i in range(len(team_encoder.classes_))])[df.team1.tolist()[0]]
    else:
        prediction = team_encoder.inverse_transform([i for i in range(len(team_encoder.classes_))])[df.team2.tolist()[0]]
        
    # 4. RETURN PREDICTION
    return prediction

# Allows the server to be run in this interactive environment
nest_asyncio.apply()

# Host depends on the setup you selected (docker or virtual env)
host = "127.0.0.1"

# Spin up the server!    
uvicorn.run(app, host=host, port=8000)

In [54]:
x_test, y_test

(array([[  1. ,   6.5,  84. , 142. ],
        [  1. , -24.5,  79. ,  45. ],
        [  1. , -19.5,  89. ,   4. ],
        ...,
        [  1. ,  15. ,  38. , 217. ],
        [  0. ,  -3.5,  64. , 200. ],
        [  1. ,   4. , 248. , 178. ]]),
 0    0
 0    1
 0    1
 0    0
 0    1
     ..
 0    1
 0    0
 0    0
 0    1
 0    0
 Length: 2430, dtype: int64)

In [58]:
list(zip(team_encoder.classes_, team_encoder.transform(team_encoder.classes_)))[45]

('CONN', 45)