## This notebook is intended to be used with the testing_app.py script. It will also be used to further develop the testing_app.py script into a more robust application, and possibly to add new features in the future.

In [1]:
%pip install waitress

Collecting waitress
  Using cached waitress-2.1.2-py3-none-any.whl (57 kB)
Installing collected packages: waitress
Successfully installed waitress-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pickle
# Load the production DataFrame. The context manager closes the file handle
# as soon as unpickling finishes.
# NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
with open('production_df.pkl', 'rb') as _fh:
    a = pickle.load(_fh)

In [8]:
# Number of teams: shape of the array of distinct values in the 'Team' column.
a['Team'].unique().shape

(30,)

In [13]:
# List every distinct team abbreviation in order to identify the code used for the Charlotte Hornets.
a['Team'].unique() 

array(['BOS', 'CLE', 'HOU', 'GSW', 'MIL', 'ATL', 'DAL', 'CHO', 'DET',
       'BRK', 'IND', 'NOP', 'MEM', 'MIA', 'ORL', 'POR', 'PHO', 'SAC',
       'MIN', 'SAS', 'DEN', 'UTA', 'PHI', 'WAS', 'LAC', 'LAL', 'NYK',
       'OKC', 'CHI', 'TOR'], dtype=object)

In [1]:
import pickle
import numpy as np
import pandas as pd
import warnings

# Ignore, for the rest of the program, the UserWarning emitted when an input
# without feature names is passed to a scaler that was fitted with feature names.
warnings.filterwarnings('ignore', message='X does not have valid feature names, but StandardScaler was fitted with feature names')
warnings.filterwarnings('ignore', message='X does not have valid feature names, but MinMaxScaler was fitted with feature names')

# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load pickle files that come from a trusted source.
# Every file is opened in a `with` block so its handle is closed right after loading.

# Training data, loaded only for testing; it will not be used in production.
with open('Data Analysis/data_base.pkl', 'rb') as _fh:
    train_df = pickle.load(_fh)

# The logistic regression model.
with open('Data Analysis/models_2/LogisticRegression_50.pkl', 'rb') as _fh:
    model = pickle.load(_fh)

# The NBA games dataset.
with open('Data Analysis/production_df.pkl', 'rb') as _fh:
    games = pickle.load(_fh)

# The best 50 features; the first column of this table is used to select features.
best_features = pd.read_csv('Data Analysis/best_features/LogisticRegression_50.csv')

# Informational columns. They are dropped before the last n games of the two
# teams are aggregated. Because the DataFrame is already ordered by date, only
# the 'Team' column is used to look up each team's most recent games; it is the
# column matched against the user input.
info_cols = ['date', 'Team', 'opponent_Team', 'season', 'home', 'WIN']




def _recent_team_means(team: str, prefix: str, n_games: int = 6) -> pd.Series:
    """Return the column-wise mean of the last *n_games* rows of ``games`` for *team*.

    The columns listed in ``info_cols`` are dropped before averaging, and every
    label of the resulting Series is prefixed with *prefix* so that it matches
    the feature names used at training time.
    """
    recent = games.loc[games['Team'] == team].tail(n_games).drop(columns=info_cols)
    return recent.mean(axis=0).add_prefix(prefix)


def predict(home_team: str, away_team: str):
    """Predict the winner of an NBA game and print the result.

    Relies on the module-level objects ``games``, ``info_cols``,
    ``best_features`` and ``model``.

    Steps:
        1. Validate that both team codes have 3 characters, are upper case
           and are present in the 'Team' column of ``games``.
        2. Average the stats of the last 6 games of each team.
        3. Prefix the home stats with 'home_rolling_' and the away stats with
           'away_rolling_', then concatenate them into one feature vector.
        4. Keep only the features listed in the first column of
           ``best_features`` and put them in the order stored in
           ``model[0].feature_names_in_``.
        5. Run the model. A prediction of 1 is reported as a home-team win,
           anything else as an away-team win.

    :param home_team: code of the home team, e.g. 'BOS'
    :param away_team: code of the away team, e.g. 'LAL'
    :return: a tuple ``(None, feature_vector)``; the first element is kept
        as ``None`` for backward compatibility, the second is the pandas
        Series that was fed to the model. The predicted labels, the class
        probabilities and a summary sentence are printed, not returned.
    :raises ValueError: if a team code is malformed or unknown, or if the
        selected feature names differ from the ones the model expects.
    """
    # Explicit raises instead of assert: assertions are removed under `python -O`.
    if len(home_team) != 3 or len(away_team) != 3:
        raise ValueError('Team name should have 3 letters format')
    if not (home_team.isupper() and away_team.isupper()):
        raise ValueError('Team name should be in all capital letters format')

    # Fail loudly on unknown teams instead of printing and returning None,
    # which made callers that unpack the result crash with an unrelated error.
    known_teams = set(games['Team'].unique())
    if home_team not in known_teams or away_team not in known_teams:
        raise ValueError('Team name not in dataset')

    # Build the feature vector from the recent averages of both teams.
    feature_vector = pd.concat(
        [
            _recent_team_means(home_team, 'home_rolling_'),
            _recent_team_means(away_team, 'away_rolling_'),
        ],
        axis=0,
    )

    # Keep only the selected best features.
    feature_vector = feature_vector[best_features.iloc[:, 0]]

    # The values are passed to the model positionally, so both the set of names
    # and their order must match what the first pipeline step was fitted with.
    expected_names = list(model[0].feature_names_in_)
    if set(feature_vector.index) != set(expected_names):
        raise ValueError("Feature names do not match")
    feature_vector = feature_vector[expected_names]

    # Evaluate the model once and reuse the results.
    model_input = [feature_vector.values]
    predicted_labels = model.predict(model_input)
    probabilities = model.predict_proba(model_input)

    prediction = predicted_labels[0]
    winner = home_team if prediction == 1 else away_team

    # NOTE(review): indexing the probabilities by the predicted label assumes the
    # class labels are 0 and 1 in that column order - confirm against model.classes_.
    prob = probabilities[0][prediction]

    print(predicted_labels)
    print(probabilities)
    print(f'The predicted winner is {winner} with certainty of {prob: .1%}')

    return None, feature_vector


In [77]:
# Example: predict the game between the Boston Celtics ('BOS') and the Los Angeles Lakers ('LAL').
# predict() takes the home team first and the away team second.
# It prints the model's raw prediction, the class probabilities and a summary sentence;
# the last element of the tuple it returns is the feature vector that was fed to the model.
# A prediction of 1 is mapped to a home-team win, any other value to an away-team win.
# In the run recorded below the prediction is 0, so the away team 'LAL' is reported as the winner.
# Only the last element of the returned tuple is kept here, as `row`.
*_, row = predict('BOS', 'LAL') # type: ignore


[0]
[[0.51825658 0.48174342]]
The predicted winner is LAL with certainty of  51.8%


In [56]:
# Six most recent rows of each team, with the informational columns removed.
home_team_games = games[games['Team'] == 'BOS'].iloc[-6:].drop(info_cols, axis=1)
away_team_games = games[games['Team'] == 'LAL'].iloc[-6:].drop(info_cols, axis=1)

# Per-column averages over those rows, labelled with the same prefixes as the
# training dataset so that the features can be selected by name.
home_team_mean = home_team_games.mean(axis=0).add_prefix('home_rolling_')
away_team_mean = away_team_games.mean(axis=0).add_prefix('away_rolling_')

# Home averages followed by away averages, as a single Series.
feature_vector = pd.concat([home_team_mean, away_team_mean])

# Keep only the best 50 features.
feature_vector = feature_vector[best_features.iloc[:, 0]]

In [64]:
# Print the shape and the type of the feature vector, one per line, then display it.
print(feature_vector.shape, type(feature_vector), sep='\n')
feature_vector

(50,)
<class 'pandas.core.series.Series'>


home_rolling_ft                    14.833333
home_rolling_drb                   33.500000
home_rolling_pts                  100.833333
home_rolling_orb%                  23.416667
home_rolling_ast%                  70.666667
home_rolling_trb_max               10.166667
home_rolling_pts_max               27.500000
home_rolling_+/-_max               13.333333
home_rolling_drb%_max              66.916667
home_rolling_tov%_max              47.900000
home_rolling_ortg_max             236.833333
home_rolling_Total                100.833333
home_rolling_opponent_3pa          41.833333
home_rolling_opponent_tov          13.166667
home_rolling_opponent_pts         104.833333
home_rolling_opponent_blk%          9.400000
home_rolling_opponent_pts_max      32.833333
home_rolling_opponent_orb%_max     30.866667
home_rolling_opponent_drb%_max     38.150000
home_rolling_opponent_ast%_max     55.450000
home_rolling_opponent_stl%_max      6.400000
home_rolling_opponent_usg%_max     38.050000
home_rolli

In [74]:
# Apply only the first step of the model to the feature vector to inspect the transformed values.
# NOTE(review): model[0] is presumably the fitted scaler of the pipeline - confirm.
model[0].transform([feature_vector])



array([[-0.85759302, -0.33463618, -1.7142657 ,  0.16097414,  1.93977403,
        -0.64064584,  0.00643906, -0.18533327,  4.09283653,  0.84164212,
         2.79031931, -1.7142657 ,  2.06145954,  0.13285769, -0.75485534,
         0.25474956,  1.83072384,  2.0853543 , -0.03027765,  2.11608715,
         0.14672694,  0.85903442,  1.47902558, -0.83319332, -0.75485534,
         3.11018402,  1.1463285 ,  0.07618472,  0.68169239, -1.05775536,
        -0.52419415,  0.52069668, -1.84089229, -0.88369878, -0.99745175,
         0.1254095 ,  0.48759554, -0.41774206,  1.3287328 , -1.16041386,
        -0.21053202,  0.        , -0.23940639, -0.78953011,  0.92946894,
         0.76981138,  0.67268361,  0.48669217, -0.44442228,  1.25293723]])

In [30]:
# Display the games DataFrame that the predictions are computed from.
games

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,opponent_ast%_max,opponent_stl%_max,opponent_blk%_max,opponent_tov%_max,opponent_usg%_max,opponent_ortg_max,opponent_drtg_max,opponent_Team,opponent_Total,WIN
0,37.0,99.0,0.374,10.0,33.0,0.303,15.0,23.0,0.652,13.0,...,33.0,7.3,7.4,43.6,40.4,300.0,98.0,BRK,125,0
1,42.0,92.0,0.457,15.0,35.0,0.429,26.0,32.0,0.813,13.0,...,61.1,3.7,12.7,50.0,32.1,267.0,120.0,GSW,99,1
2,44.0,93.0,0.473,14.0,40.0,0.350,14.0,19.0,0.737,11.0,...,47.0,4.3,3.9,100.0,35.9,166.0,118.0,LAL,109,1
3,38.0,81.0,0.469,9.0,29.0,0.310,24.0,31.0,0.774,8.0,...,24.2,4.7,4.5,20.9,40.2,154.0,114.0,LAC,116,0
4,46.0,90.0,0.511,14.0,35.0,0.400,15.0,18.0,0.833,11.0,...,22.2,3.8,7.0,33.3,39.2,203.0,126.0,BOS,122,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4979,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,11.0,...,30.2,6.1,10.3,42.9,36.3,133.0,112.0,GSW,107,0
4980,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,8.0,...,59.5,5.7,7.6,33.3,36.2,222.0,107.0,GSW,104,0
4981,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,4.0,...,100.0,2.3,12.4,45.0,94.4,300.0,112.0,BOS,94,1
4982,38.0,92.0,0.413,19.0,46.0,0.413,8.0,8.0,1.000,15.0,...,38.7,3.9,16.0,100.0,42.6,141.0,126.0,BOS,90,1


Comparing the feature ordering expected by the scaler with the ordering of the created feature vector, to check that they match.

In [63]:
# Compare the feature ordering expected by the scaler with the ordering of the
# created feature vector, to check that they match.
expected_names = list(model[0].feature_names_in_)
actual_names = list(feature_vector.index)

# zip() stops at the shorter of its inputs, so a missing or extra feature at the
# end would go unnoticed; compare the lengths before comparing element by element.
assert len(expected_names) == len(actual_names), f'{len(expected_names)} != {len(actual_names)}'

# zip() lets us walk both lists in lockstep.
for i, j in zip(expected_names, actual_names):
    assert i == j, f'{i} != {j}'

# If no assertion fails, the names match in order and the feature vector is
# ready to be used as input for the model through its scaler.



In [69]:
# Run the full model on the feature vector; it is wrapped in a list so the input is a single row.
model.predict([feature_vector])



array([0], dtype=int64)

In [70]:
# Display the selected features that make up the model input.
feature_vector

home_rolling_ft                    14.833333
home_rolling_drb                   33.500000
home_rolling_pts                  100.833333
home_rolling_orb%                  23.416667
home_rolling_ast%                  70.666667
home_rolling_trb_max               10.166667
home_rolling_pts_max               27.500000
home_rolling_+/-_max               13.333333
home_rolling_drb%_max              66.916667
home_rolling_tov%_max              47.900000
home_rolling_ortg_max             236.833333
home_rolling_Total                100.833333
home_rolling_opponent_3pa          41.833333
home_rolling_opponent_tov          13.166667
home_rolling_opponent_pts         104.833333
home_rolling_opponent_blk%          9.400000
home_rolling_opponent_pts_max      32.833333
home_rolling_opponent_orb%_max     30.866667
home_rolling_opponent_drb%_max     38.150000
home_rolling_opponent_ast%_max     55.450000
home_rolling_opponent_stl%_max      6.400000
home_rolling_opponent_usg%_max     38.050000
home_rolli

In [16]:
import os
import pickle
import pandas as pd
# Path of the pickled collection of scraped game ids; it is read and then rewritten below.
games_ids_path = os.path.join('Web Scraping', 'games_ids.pkl')

# NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
# The `with` blocks make sure the read handle is closed before the same file is rewritten.
with open(games_ids_path, 'rb') as _fh:
    test = pickle.load(_fh)

# Delete the last 4 entries and save the shortened object to the same pickle file.
# WARNING: this is destructive and not idempotent - every run of this cell
# removes four more entries from the file on disk.
test = test[:-4]
with open(games_ids_path, 'wb') as _fh:
    pickle.dump(test, _fh)

# Show the last 4 entries that remain after the deletion.
test[-4:]




['202304090PHO.html',
 '202304090POR.html',
 '202304090TOR.html',
 '202304090WAS.html']

In [11]:
import sqlalchemy
# Location of the SQLite file that holds the scraped games.
nba_games_db_path = os.path.join('Web Scraping', 'nba_games.db')

# Engine bound to that SQLite file.
engine1 = sqlalchemy.create_engine(
    f'sqlite:///{nba_games_db_path}',
    pool_pre_ping=True,
    echo=False,
)

# Read the whole nba_games table into a DataFrame and show its last rows.
test = pd.read_sql(sql='SELECT * FROM nba_games', con=engine1)
test.tail(5)

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,opponent_usg%_max,opponent_ortg_max,opponent_drtg_max,opponent_Team,opponent_Total,opponent_season,opponent_date,opponent_home,ID,WIN
14973,240.0,38.0,95.0,0.4,9.0,35.0,0.257,16.0,24.0,0.667,...,31.9,222.0,100.0,GSW,157,2022-23,2023-04-09 00:00:00.000000,0,202304090POR.html,0
14974,240.0,38.0,82.0,0.463,12.0,34.0,0.353,17.0,21.0,0.81,...,33.2,161.0,113.0,TOR,121,2022-23,2023-04-09 00:00:00.000000,1,202304090TOR.html,0
14975,240.0,48.0,95.0,0.505,11.0,33.0,0.333,14.0,16.0,0.875,...,40.7,172.0,127.0,MIL,105,2022-23,2023-04-09 00:00:00.000000,0,202304090TOR.html,1
14976,240.0,44.0,89.0,0.494,6.0,21.0,0.286,20.0,28.0,0.714,...,28.3,195.0,117.0,WAS,109,2022-23,2023-04-09 00:00:00.000000,1,202304090WAS.html,1
14977,240.0,41.0,96.0,0.427,14.0,45.0,0.311,13.0,21.0,0.619,...,27.6,136.0,117.0,HOU,114,2022-23,2023-04-09 00:00:00.000000,0,202304090WAS.html,0
