In [1]:
import pandas as pd

In [2]:
gamesAll = pd.read_csv("games.csv", index_col=0)

In [3]:
gamesAll.head()

Unnamed: 0_level_0,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
GAME_DATE_EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2022-12-22,22200477,Final,1610612740,1610612759,2022,1610612740,126.0,0.484,0.926,0.382,25.0,46.0,1610612759,117.0,0.478,0.815,0.321,23.0,44.0,1
2022-12-22,22200478,Final,1610612762,1610612764,2022,1610612762,120.0,0.488,0.952,0.457,16.0,40.0,1610612764,112.0,0.561,0.765,0.333,20.0,37.0,1
2022-12-21,22200466,Final,1610612739,1610612749,2022,1610612739,114.0,0.482,0.786,0.313,22.0,37.0,1610612749,106.0,0.47,0.682,0.433,20.0,46.0,1
2022-12-21,22200467,Final,1610612755,1610612765,2022,1610612755,113.0,0.441,0.909,0.297,27.0,49.0,1610612765,93.0,0.392,0.735,0.261,15.0,46.0,1
2022-12-21,22200468,Final,1610612737,1610612741,2022,1610612737,108.0,0.429,1.0,0.378,22.0,47.0,1610612741,110.0,0.5,0.773,0.292,20.0,47.0,0


In [4]:
gamesAll.shape

(26651, 20)

In [5]:
games2022=gamesAll[gamesAll["SEASON"]==2022]

In [6]:
games2022.shape

(542, 20)

In [7]:
games2022.dtypes

GAME_ID               int64
GAME_STATUS_TEXT     object
HOME_TEAM_ID          int64
VISITOR_TEAM_ID       int64
SEASON                int64
TEAM_ID_home          int64
PTS_home            float64
FG_PCT_home         float64
FT_PCT_home         float64
FG3_PCT_home        float64
AST_home            float64
REB_home            float64
TEAM_ID_away          int64
PTS_away            float64
FG_PCT_away         float64
FT_PCT_away         float64
FG3_PCT_away        float64
AST_away            float64
REB_away            float64
HOME_TEAM_WINS        int64
dtype: object

In [8]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [9]:
#sets number of decision trees to 50, use large number of trees to reduce variance as well as make model more stable
#lower value for min_samples_split captures more details in data (deeper trees), but also have more variance in model
rf=RandomForestClassifier(n_estimators=50,min_samples_split=10,random_state=1)

In [10]:
train=gamesAll[gamesAll["SEASON"]<2022]

In [11]:
test=gamesAll[gamesAll["SEASON"]==2022]

In [12]:
# clean the data
float_cols = gamesAll.select_dtypes(include=['float32']).columns

# Replace infinite values with NaN
gamesAll[float_cols] = gamesAll[float_cols].replace([np.inf, -np.inf], np.nan)

# Drop rows with any NaN values
gamesAll = gamesAll.dropna()

# Define a threshold for excessively large values 
threshold = np.finfo(np.float32).max

# Remove rows with values too large for dtype float32
gamesAll = gamesAll[(gamesAll[float_cols] < threshold).all(axis=1)]

# Identify and drop object columns, RandomForestClassifier can't handle object columns
object_cols = gamesAll.select_dtypes(include=['object']).columns
gamesAll = gamesAll.drop(columns=object_cols)

# Verify there are no NaN or infinite values left in the entire DataFrame
assert gamesAll.isnull().sum().sum() == 0, "There are still NaN values in the data"
assert np.isfinite(gamesAll).all().all(), "There are still infinite values in the data"

# Define predictors and target
# With NBA trending towards 3-point shooting, I thought 3-point percentage would be good predictor
# Rebounding helps show extra possessions which can be critical
# Points home shows offensive proficiency 
predictors = ["VISITOR_TEAM_ID", "FG3_PCT_home","REB_home","PTS_home"]
target = "HOME_TEAM_WINS"

# Ensure the columns exist in the DataFrame
assert all(col in gamesAll.columns for col in predictors), "One or more predictor columns are missing"
assert target in gamesAll.columns, "Target column is missing"

# Verify there are no NaN, infinite, or excessively large values in the predictor columns
for col in predictors:
    assert gamesAll[col].isnull().sum() == 0, f"NaN values found in column {col}"
    assert np.isfinite(gamesAll[col]).all(), f"Infinite values found in column {col}"
    assert (gamesAll[col] < threshold).all(), f"Excessively large values found in column {col}"



In [13]:
# train on seasons prior to 2021, test on 2022
# used to train on relationship between 3-point shooting, rebounds, and points and winning odds
train = gamesAll[gamesAll["SEASON"]<2021]
test = gamesAll[gamesAll["SEASON"]>2021]
rf.fit(train[predictors], train[target])


RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [14]:
preds=rf.predict(test[predictors])

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
acc=accuracy_score(test["HOME_TEAM_WINS"],preds)

In [17]:
# accurate with approximately 65 percent accuracy
acc

0.6494464944649446

In [18]:
combined = pd.DataFrame(dict(actual=test["HOME_TEAM_WINS"], predicted=preds))


In [19]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])
# when model expects team to lose, they lose 54 times and win 46 times. It is more accurate at predicting wins. When wins are predicted, correct 273 times, wrong 169 times. 

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,64,159
1,31,288


In [20]:
from sklearn.metrics import precision_score


In [21]:
precision_score(test["HOME_TEAM_WINS"], preds)
# when model predicts a team to win, it is correct 64.4% of the time

0.6442953020134228

In [22]:
group_matches=gamesAll.groupby("HOME_TEAM_ID")

In [23]:
def computeAverage(group,cols,new_cols):
        rolling_stats = group[cols].rolling(3, closed='left').mean()
        group[new_cols] = rolling_stats
        group = group.dropna(subset=new_cols)
        return group

In [24]:
cols=["PTS_home","PTS_away","FG_PCT_home","FG_PCT_away","AST_home","AST_away"]
new_cols = [f"{c}_rolling" for c in cols]



In [25]:
matches_rolling = gamesAll.groupby("HOME_TEAM_ID").apply(lambda x: computeAverage(x, cols, new_cols))


In [26]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,...,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,PTS_home_rolling,PTS_away_rolling,FG_PCT_home_rolling,FG_PCT_away_rolling,AST_home_rolling,AST_away_rolling
HOME_TEAM_ID,GAME_DATE_EST,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1610612737,2022-12-05,22200354,1610612737,1610612760,2022,1610612737,114.0,0.431,0.765,0.342,24.0,...,0.353,25.0,49.0,0,119.000000,119.000000,0.473333,0.484667,26.333333,25.666667
1610612737,2022-12-02,22200327,1610612737,1610612743,2022,1610612737,117.0,0.557,0.643,0.357,24.0,...,0.375,31.0,33.0,1,121.000000,122.666667,0.474000,0.469000,27.000000,27.333333
1610612737,2022-11-27,22200293,1610612737,1610612748,2022,1610612737,98.0,0.398,0.947,0.278,24.0,...,0.273,28.0,46.0,0,118.000000,117.333333,0.489333,0.487333,25.333333,28.000000
1610612737,2022-11-23,22200263,1610612737,1610612758,2022,1610612737,115.0,0.446,0.957,0.367,23.0,...,0.290,27.0,48.0,1,109.666667,112.000000,0.462000,0.490000,24.000000,28.000000
1610612737,2022-11-19,22200235,1610612737,1610612761,2022,1610612737,124.0,0.500,0.781,0.280,23.0,...,0.216,27.0,51.0,1,110.000000,107.000000,0.467000,0.492667,23.666667,28.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1610612766,2014-10-29,21400004,1610612766,1610612749,2014,1610612766,108.0,0.406,0.690,0.286,27.0,...,0.500,25.0,35.0,1,95.666667,93.000000,0.427667,0.405000,23.666667,21.000000
1610612766,2014-10-23,11400107,1610612766,1610612754,2014,1610612766,79.0,0.370,0.808,0.222,18.0,...,0.417,21.0,46.0,0,91.000000,88.666667,0.398000,0.413667,22.333333,20.000000
1610612766,2014-10-15,11400053,1610612766,1610612765,2014,1610612766,84.0,0.341,0.862,0.136,15.0,...,0.400,28.0,58.0,0,85.333333,88.333333,0.379000,0.434000,20.666667,20.666667
1610612766,2014-10-13,11400043,1610612766,1610612753,2014,1610612766,99.0,0.426,0.800,0.150,24.0,...,0.357,16.0,45.0,1,90.333333,99.333333,0.372333,0.471667,20.000000,24.666667


In [27]:
gamesAll.head()

Unnamed: 0_level_0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
GAME_DATE_EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2022-12-22,22200477,1610612740,1610612759,2022,1610612740,126.0,0.484,0.926,0.382,25.0,46.0,1610612759,117.0,0.478,0.815,0.321,23.0,44.0,1
2022-12-22,22200478,1610612762,1610612764,2022,1610612762,120.0,0.488,0.952,0.457,16.0,40.0,1610612764,112.0,0.561,0.765,0.333,20.0,37.0,1
2022-12-21,22200466,1610612739,1610612749,2022,1610612739,114.0,0.482,0.786,0.313,22.0,37.0,1610612749,106.0,0.47,0.682,0.433,20.0,46.0,1
2022-12-21,22200467,1610612755,1610612765,2022,1610612755,113.0,0.441,0.909,0.297,27.0,49.0,1610612765,93.0,0.392,0.735,0.261,15.0,46.0,1
2022-12-21,22200468,1610612737,1610612741,2022,1610612737,108.0,0.429,1.0,0.378,22.0,47.0,1610612741,110.0,0.5,0.773,0.292,20.0,47.0,0


In [28]:
def make_predictions(data, predictors):
    train = gamesAll[gamesAll["SEASON"]<2021]
    test = gamesAll[gamesAll["SEASON"]>2021]
    rf.fit(train[predictors], train[target])
    preds=rf.predict(test[predictors])    
    combined = pd.DataFrame(dict(actual=test["HOME_TEAM_WINS"], predicted=preds), index=test.index)
    error = precision_score(test["HOME_TEAM_WINS"], preds)
    return combined, error

In [29]:
matches_rolling.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,...,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,PTS_home_rolling,PTS_away_rolling,FG_PCT_home_rolling,FG_PCT_away_rolling,AST_home_rolling,AST_away_rolling
HOME_TEAM_ID,GAME_DATE_EST,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1610612737,2022-12-05,22200354,1610612737,1610612760,2022,1610612737,114.0,0.431,0.765,0.342,24.0,...,0.353,25.0,49.0,0,119.0,119.0,0.473333,0.484667,26.333333,25.666667
1610612737,2022-12-02,22200327,1610612737,1610612743,2022,1610612737,117.0,0.557,0.643,0.357,24.0,...,0.375,31.0,33.0,1,121.0,122.666667,0.474,0.469,27.0,27.333333
1610612737,2022-11-27,22200293,1610612737,1610612748,2022,1610612737,98.0,0.398,0.947,0.278,24.0,...,0.273,28.0,46.0,0,118.0,117.333333,0.489333,0.487333,25.333333,28.0
1610612737,2022-11-23,22200263,1610612737,1610612758,2022,1610612737,115.0,0.446,0.957,0.367,23.0,...,0.29,27.0,48.0,1,109.666667,112.0,0.462,0.49,24.0,28.0
1610612737,2022-11-19,22200235,1610612737,1610612761,2022,1610612737,124.0,0.5,0.781,0.28,23.0,...,0.216,27.0,51.0,1,110.0,107.0,0.467,0.492667,23.666667,28.666667


In [30]:
# Drop 'level_0' column if it exists
combined = combined.drop(columns=['level_0'], errors='ignore')

# Reset index and ensure no conflict with existing columns
combined = combined.reset_index(drop=True)

# Drop 'HOME_TEAM_ID' and 'VISITOR_TEAM_ID' if they exist
combined = combined.drop(columns=['HOME_TEAM_ID', 'VISITOR_TEAM_ID'], errors='ignore')

# Reset index for matches_rolling
matches_rolling = matches_rolling.reset_index(drop=True)

# Rename columns in matches_rolling to avoid conflicts
matches_rolling = matches_rolling.rename(columns={
    "VISITOR_TEAM_ID": "VISITOR_TEAM_ID_new",
    "HOME_TEAM_ID": "HOME_TEAM_ID_new",
    "HOME_TEAM_WINS": "HOME_TEAM_WINS_new"
})

# Merge DataFrames
combined = combined.merge(
    matches_rolling[["VISITOR_TEAM_ID_new", "HOME_TEAM_ID_new", "HOME_TEAM_WINS_new"]],
    left_index=True, right_index=True
)

In [31]:
combined.head(10)


Unnamed: 0,actual,predicted,VISITOR_TEAM_ID_new,HOME_TEAM_ID_new,HOME_TEAM_WINS_new
0,1,1,1610612760,1610612737,0
1,1,1,1610612743,1610612737,1
2,1,1,1610612748,1610612737,0
3,1,1,1610612758,1610612737,1
4,0,1,1610612761,1610612737,1
5,0,1,1610612738,1610612737,0
6,1,1,1610612755,1610612737,1
7,0,1,1610612762,1610612737,0
8,0,1,1610612749,1610612737,1
9,0,0,1610612740,1610612737,1


In [32]:
# check accuracy of predictions for each team
# Purpose is to determined which teams we are predicting more accurately
# Can see that for certain teams we predict their wins much more accurately than others 
grouped = combined.groupby('VISITOR_TEAM_ID_new').agg({'actual': 'sum', 'predicted': 'sum'}).reset_index()
grouped['actual_to_predicted_ratio'] = grouped['actual'] / grouped['predicted']
print(grouped)


    VISITOR_TEAM_ID_new  actual  predicted  actual_to_predicted_ratio
0            1610612738      13         22                   0.590909
1            1610612739      13         19                   0.684211
2            1610612740       8         10                   0.800000
3            1610612741      17         20                   0.850000
4            1610612742       8         13                   0.615385
5            1610612743       7          9                   0.777778
6            1610612744       9          8                   1.125000
7            1610612745       7          9                   0.777778
8            1610612746       8         11                   0.727273
9            1610612747       8         10                   0.800000
10           1610612748      19         25                   0.760000
11           1610612749      13         22                   0.590909
12           1610612750       9         11                   0.818182
13           1610612

In [33]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 10, 20]
}

# Instantiate the grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(train[predictors], train[target])

# Best parameters and best estimator
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_
print(f"Best parameters: {best_params}")

Best parameters: {'max_depth': 10, 'min_samples_split': 20, 'n_estimators': 50}


In [34]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score

# Initialize models
models = {
    'RandomForest': best_rf,
    'GradientBoosting': GradientBoostingClassifier(random_state=1),
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(train[predictors], train[target])
    preds = model.predict(test[predictors])
    acc = accuracy_score(test[target], preds)
    precision = precision_score(test[target], preds)
    print(f"{name} - Accuracy: {acc:.4f}, Precision: {precision:.4f}")

RandomForest - Accuracy: 0.6568, Precision: 0.6400
GradientBoosting - Accuracy: 0.6624, Precision: 0.6423


In [35]:
final_rf = RandomForestClassifier(**best_params, random_state=1)
final_rf.fit(train[predictors], train[target])

# Make predictions on the test set
final_preds = final_rf.predict(test[predictors])

# Evaluate the final model
final_acc = accuracy_score(test[target], final_preds)
final_precision = precision_score(test[target], final_preds)
print(f"Final Model - Accuracy: {final_acc:.4f}, Precision: {final_precision:.4f}")

Final Model - Accuracy: 0.6568, Precision: 0.6400


In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Now we will predict odds of one specific team beating another specific team

# Load the data
gamesAll = pd.read_csv("games.csv", index_col=0)

# Filter games for the 2022 season
games2022 = gamesAll[gamesAll["SEASON"] == 2022]

# Preprocess the data
games2022 = games2022.dropna()

# Aggregate team statistics
home_stats = games2022.groupby('HOME_TEAM_ID').agg({
    'PTS_home': 'mean', s
    'FG_PCT_home': 'mean', 
    'FT_PCT_home': 'mean', 
    'FG3_PCT_home': 'mean', 
    'AST_home': 'mean', 
    'REB_home': 'mean'
}).rename(columns=lambda x: x.replace('_home', ''))

away_stats = games2022.groupby('VISITOR_TEAM_ID').agg({
    'PTS_away': 'mean', 
    'FG_PCT_away': 'mean', 
    'FT_PCT_away': 'mean', 
    'FG3_PCT_away': 'mean', 
    'AST_away': 'mean', 
    'REB_away': 'mean'
}).rename(columns=lambda x: x.replace('_away', ''))

# Calculate points per game
ppg_home = home_stats[['PTS']]
ppg_away = away_stats[['PTS']]

# Combine home and away stats
team_stats = home_stats.add(away_stats, fill_value=0) / 2

# Calculate win percentages
home_wins = games2022.groupby('HOME_TEAM_ID')['HOME_TEAM_WINS'].mean()
away_wins = 1 - games2022.groupby('VISITOR_TEAM_ID')['HOME_TEAM_WINS'].mean()

win_percentages = pd.DataFrame({
    'TEAM_ID': home_wins.index,
    'WIN_PERCENTAGE': (home_wins + away_wins) / 2
}).set_index('TEAM_ID')

# Merge the win percentages and points per game with team stats
team_stats = team_stats.merge(win_percentages, left_index=True, right_index=True)
team_stats['PTS'] = (home_stats['PTS'] + away_stats['PTS']) / 2

# Define features and target
features = games2022[['PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home', 
                      'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away']]
target = games2022['HOME_TEAM_WINS']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Function to predict the outcome of a new game using team IDs
def predict_game_by_team_id(home_team_id, away_team_id):
    # Fetch team statistics
    home_team_stats = team_stats.loc[home_team_id]
    away_team_stats = team_stats.loc[away_team_id]
    
    # Create a DataFrame for the new game
    new_game = pd.DataFrame({
        'PTS_home': [home_team_stats['PTS']], 'FG_PCT_home': [home_team_stats['FG_PCT']], 'FT_PCT_home': [home_team_stats['FT_PCT']], 
        'FG3_PCT_home': [home_team_stats['FG3_PCT']], 'AST_home': [home_team_stats['AST']], 'REB_home': [home_team_stats['REB']],
        'PTS_away': [away_team_stats['PTS']], 'FG_PCT_away': [away_team_stats['FG_PCT']], 'FT_PCT_away': [away_team_stats['FT_PCT']], 
        'FG3_PCT_away': [away_team_stats['FG3_PCT']], 'AST_away': [away_team_stats['AST']], 'REB_away': [away_team_stats['REB']]
    })
    
    # Predict the probabilities
    probabilities = model.predict_proba(new_game)
    home_win_prob = probabilities[0][1] * 100  # Probability of home team winning
    away_win_prob = probabilities[0][0] * 100  # Probability of away team winning

    # Print the win percentages and points per game
    print(f"Home Team Win Percentage: {home_team_stats['WIN_PERCENTAGE'] * 100:.2f}%")
    print(f"Away Team Win Percentage: {away_team_stats['WIN_PERCENTAGE'] * 100:.2f}%")
    print(f"Home Team Points Per Game: {home_team_stats['PTS']:.2f}")
    print(f"Away Team Points Per Game: {away_team_stats['PTS']:.2f}")
    
    return f"Home Team Win Probability: {home_win_prob:.2f}%, Away Team Win Probability: {away_win_prob:.2f}%"

# Example usage
result = predict_game_by_team_id(1610612747, 1610612748)  # Replace with actual team IDs
print(result)

Model Accuracy: 96.33%
Home Team Win Percentage: 37.72%
Away Team Win Percentage: 53.95%
Home Team Points Per Game: 113.11
Away Team Points Per Game: 108.70
Home Team Win Probability: 76.00%, Away Team Win Probability: 24.00%


In [37]:
print(gamesAll.iloc[:80])

                GAME_ID GAME_STATUS_TEXT  HOME_TEAM_ID  VISITOR_TEAM_ID  \
GAME_DATE_EST                                                             
2022-12-22     22200477            Final    1610612740       1610612759   
2022-12-22     22200478            Final    1610612762       1610612764   
2022-12-21     22200466            Final    1610612739       1610612749   
2022-12-21     22200467            Final    1610612755       1610612765   
2022-12-21     22200468            Final    1610612737       1610612741   
...                 ...              ...           ...              ...   
2022-12-12     22200407            Final    1610612757       1610612750   
2022-12-12     22200408            Final    1610612746       1610612738   
2022-12-11     22200395            Final    1610612740       1610612756   
2022-12-11     22200396            Final    1610612765       1610612747   
2022-12-11     22200397            Final    1610612752       1610612758   

               SEASON  T

In [38]:
team_id = 1610612747     
team_games_lakers = gamesAll[(gamesAll['HOME_TEAM_ID'] == team_id) | (gamesAll['VISITOR_TEAM_ID'] == team_id)]

In [39]:
print(team_games_lakers.iloc[:75])

                GAME_ID GAME_STATUS_TEXT  HOME_TEAM_ID  VISITOR_TEAM_ID  \
GAME_DATE_EST                                                             
2022-12-21     22200475            Final    1610612758       1610612747   
2022-12-19     22200459            Final    1610612756       1610612747   
2022-12-18     22200451            Final    1610612747       1610612764   
2022-12-16     22200437            Final    1610612747       1610612743   
2022-12-13     22200413            Final    1610612747       1610612738   
...                 ...              ...           ...              ...   
2022-01-27     22100733            Final    1610612755       1610612747   
2022-01-25     22100718            Final    1610612751       1610612747   
2022-01-23     22100700            Final    1610612748       1610612747   
2022-01-21     22100685            Final    1610612753       1610612747   
2022-01-19     22100680            Final    1610612747       1610612754   

               SEASON  T

In [85]:
#Predicted points for each team in a matchup 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the data
gamesAll = pd.read_csv("games.csv", index_col=0)

# Filter games for the 2022 season
games2022 = gamesAll[gamesAll["SEASON"] == 2022]

# Preprocess the data
games2022 = games2022.dropna()

# Aggregate team statistics
home_stats = games2022.groupby('HOME_TEAM_ID').agg({
    'PTS_home': 'mean', 
    'FG_PCT_home': 'mean', 
    'FT_PCT_home': 'mean', 
    'FG3_PCT_home': 'mean', 
    'AST_home': 'mean', 
    'REB_home': 'mean'
}).rename(columns=lambda x: x.replace('_home', ''))

away_stats = games2022.groupby('VISITOR_TEAM_ID').agg({
    'PTS_away': 'mean', 
    'FG_PCT_away': 'mean', 
    'FT_PCT_away': 'mean', 
    'FG3_PCT_away': 'mean', 
    'AST_away': 'mean', 
    'REB_away': 'mean'
}).rename(columns=lambda x: x.replace('_away', ''))

# Combine home and away stats
team_stats = home_stats.add(away_stats, fill_value=0) / 2

# Merge the points per game with team stats
team_stats['PTS'] = (home_stats['PTS'] + away_stats['PTS']) / 2

# Define features and target for predicting home team points
features_home = games2022[['FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home', 
                           'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away']]
target_home = games2022['PTS_home']

# Split the data into training and testing sets
X_train_home, X_test_home, y_train_home, y_test_home = train_test_split(features_home, target_home, test_size=0.2, random_state=42)

# Train the model for home team points
model_home = RandomForestRegressor(n_estimators=100, random_state=42)
model_home.fit(X_train_home, y_train_home)

# Evaluate the model for home team points
y_pred_home = model_home.predict(X_test_home)
mse_home = mean_squared_error(y_test_home, y_pred_home)
print(f"Home Points Model MSE: {mse_home:.2f}")

# Define features and target for predicting away team points
features_away = games2022[['FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away', 
                           'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home']]
target_away = games2022['PTS_away']

# Split the data into training and testing sets
X_train_away, X_test_away, y_train_away, y_test_away = train_test_split(features_away, target_away, test_size=0.2, random_state=42)

# Train the model for away team points
model_away = RandomForestRegressor(n_estimators=100, random_state=42)
model_away.fit(X_train_away, y_train_away)

# Evaluate the model for away team points
y_pred_away = model_away.predict(X_test_away)
mse_away = mean_squared_error(y_test_away, y_pred_away)
print(f"Away Points Model MSE: {mse_away:.2f}")

# Function to predict the expected points for a new game using team IDs
def predict_points_by_team_id(home_team_id, away_team_id):
    # Fetch team statistics
    home_team_stats = team_stats.loc[home_team_id]
    away_team_stats = team_stats.loc[away_team_id]
    
    # Create a DataFrame for the new game
    new_game_home = pd.DataFrame({
        'FG_PCT_home': [home_team_stats['FG_PCT']], 'FT_PCT_home': [home_team_stats['FT_PCT']], 
        'FG3_PCT_home': [home_team_stats['FG3_PCT']], 'AST_home': [home_team_stats['AST']], 
        'REB_home': [home_team_stats['REB']],
        'FG_PCT_away': [away_team_stats['FG_PCT']], 'FT_PCT_away': [away_team_stats['FT_PCT']], 
        'FG3_PCT_away': [away_team_stats['FG3_PCT']], 'AST_away': [away_team_stats['AST']], 
        'REB_away': [away_team_stats['REB']]
    })
    
    new_game_away = pd.DataFrame({
        'FG_PCT_away': [away_team_stats['FG_PCT']], 'FT_PCT_away': [away_team_stats['FT_PCT']], 
        'FG3_PCT_away': [away_team_stats['FG3_PCT']], 'AST_away': [away_team_stats['AST']], 
        'REB_away': [away_team_stats['REB']],
        'FG_PCT_home': [home_team_stats['FG_PCT']], 'FT_PCT_home': [home_team_stats['FT_PCT']], 
        'FG3_PCT_home': [home_team_stats['FG3_PCT']], 'AST_home': [home_team_stats['AST']], 
        'REB_home': [home_team_stats['REB']]
    })
    
    # Predict the expected points
    expected_points_home = model_home.predict(new_game_home)[0]
    expected_points_away = model_away.predict(new_game_away)[0]

    return f"Expected Points: Home Team: {expected_points_home:.2f}, Away Team: {expected_points_away:.2f}"

# Example usage
result = predict_points_by_team_id(1610612747, 1610612748)  # Replace with actual team IDs
print(result)

Home Points Model MSE: 59.65
Away Points Model MSE: 54.18
Expected Points: Home Team: 112.19, Away Team: 109.71
