In [11]:
import pandas as pd

# Load the game-level results table. Later statements in this cell read its
# 'season', 'home_team', 'away_team', 'home_score' and 'away_score' columns.
# NOTE(review): the Elo updates further down are applied in row order, so the
# file is presumably sorted chronologically (as its name suggests) — confirm.
data = pd.read_csv('game_info_0423_sorted.csv')

# Translate a season identifier into the year used to group games
def map_season_to_year(season):
    """Return the integer formed by '20' followed by the last two characters of ``season``.

    ``season`` is stringified first, so both numbers and strings are accepted.
    Raises ValueError if the resulting text is not a valid integer literal.
    """
    suffix = str(season)[-2:]  # trailing two characters of the identifier
    return int(f'20{suffix}')

# Derive a 'Year' column by applying map_season_to_year to every 'season' value;
# the Elo loop later in this cell groups games by this column.
data['Year'] = data['season'].apply(map_season_to_year)

# Rating that every team is reset to at the start of each year
initial_elo_rating = 1500

# Parameters
K = 20  # Elo K-factor: update_elo changes a rating by K * (result - expected result)

def expected_result(team_a, team_b, ratings=None):
    """Calculate the expected result of team A against team B.

    Uses the logistic Elo curve 1 / (1 + 10 ** ((R_b - R_a) / 400)), so two
    equally rated teams get 0.5 and the two teams' expectations sum to 1.

    Parameters:
        team_a: key of the team whose expected score is returned.
        team_b: key of the opposing team.
        ratings: optional mapping of team -> rating. When omitted, the
            module-level ``elo_ratings`` dict is used, so existing two-argument
            calls behave as before.

    Returns:
        Expected score of ``team_a`` as a float strictly between 0 and 1.

    Raises:
        KeyError: if either team is missing from the ratings mapping.
    """
    if ratings is None:
        # Fall back to the notebook-level table that the yearly loop rebinds.
        ratings = elo_ratings
    rating_gap = ratings[team_b] - ratings[team_a]
    return 1 / (1 + 10 ** (rating_gap / 400))

def update_elo(team_a, team_b, result):
    """Apply one game's outcome to the module-level ``elo_ratings`` table.

    ``result`` is team A's score for the game (the caller passes 1 for a win
    by team A and 0 otherwise). Each team's rating moves by K times the gap
    between its actual and expected score. Nothing is returned; the ratings
    dict is mutated in place.
    """
    win_probability_a = expected_result(team_a, team_b)
    # Team B's actual and expected scores are the complements of team A's.
    outcome_b = 1 - result
    win_probability_b = 1 - win_probability_a
    elo_ratings[team_a] += K * (result - win_probability_a)
    elo_ratings[team_b] += K * (outcome_b - win_probability_b)

# Store the final Elo ratings for each year
final_elo_per_year = {}

# Every team that appears as a home or away side anywhere in the data.
# The roster is the same for every year, so it is built once instead of being
# recomputed on each pass of the loop.
all_teams = pd.concat([data['home_team'], data['away_team']]).unique()

# Process each year separately
for year in sorted(data['Year'].unique()):
    # Start the year from a clean slate. This rebinds the module-level
    # `elo_ratings` dict that expected_result() and update_elo() read and mutate.
    elo_ratings = {team: initial_elo_rating for team in all_teams}

    # Filter data for the current year
    year_data = data[data['Year'] == year]

    # Walk the year's games in row order. Zipping the four columns avoids
    # constructing a Series per game, which is what iterrows() does.
    games = zip(
        year_data['home_team'],
        year_data['away_team'],
        year_data['home_score'],
        year_data['away_score'],
    )
    for team_a, team_b, home_score, away_score in games:
        # 1 = home win. Anything else (away win, equal scores, or a missing
        # score, since comparisons with NaN are False) is scored as 0.
        result = 1 if home_score > away_score else 0
        update_elo(team_a, team_b, result)

    # Snapshot the ratings so the next year's reset cannot alter them
    final_elo_per_year[year] = elo_ratings.copy()

# Flatten the per-year rating dicts into one long table: one row per team per year
rating_columns = ['Team', 'Elo Rating']
elo_df_list = []
for year, ratings in final_elo_per_year.items():
    # (team, rating) pairs become rows; the year is appended as a constant column
    year_df = pd.DataFrame(list(ratings.items()), columns=rating_columns).assign(Year=year)
    elo_df_list.append(year_df)

# Stack the yearly tables, then order by year ascending and rating descending
elo_df = pd.concat(elo_df_list)
elo_df = elo_df.sort_values(by=['Year', 'Elo Rating'], ascending=[True, False])
print(elo_df)
elo_df.to_csv('elo_ratings_per_year.csv', index=False)


                      Team   Elo Rating  Year
25      Los Angeles Lakers  1660.262325  2004
12       San Antonio Spurs  1659.102653  2004
8   Minnesota Timberwolves  1638.431962  2004
14          Indiana Pacers  1624.568982  2004
19        Sacramento Kings  1615.462953  2004
..                     ...          ...   ...
24           Orlando Magic  1432.479798  2023
29       Charlotte Hornets  1377.806654  2023
12       San Antonio Spurs  1377.481781  2023
17         Houston Rockets  1366.517455  2023
6          Detroit Pistons  1330.451920  2023

[600 rows x 3 columns]


In [13]:
# Average of every team statistic per year and team.

import pandas as pd

# Load dataset
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the recorded output of this cell echoes this line as a warning
# source, presumably pandas' mixed-dtype DtypeWarning — confirm, and prefer an
# explicit dtype= once the offending column is identified.
team_stats_df = pd.read_csv('team_stats_0423_cleaned.csv', low_memory=False)

# Group by 'Year' and 'Team' and calculate the mean of each group.
# numeric_only=True limits the mean to numeric columns; without it, pandas >= 2.0
# raises TypeError as soon as a non-numeric column is present (older versions
# dropped such columns with a FutureWarning).
# NOTE(review): 'game_id' is numeric, so it is averaged as well; the recorded
# output shows that column, and its mean carries no meaning — consider dropping
# it if nothing downstream reads it.
team_stats_avg_df = (
    team_stats_df
    .groupby(['Year', 'Team'])
    .mean(numeric_only=True)
    .reset_index()
)

team_stats_avg_df.to_csv('team_ave_per_year.csv', index=False)

team_stats_avg_df.head()


  team_stats_df = pd.read_csv('team_stats_0423_cleaned.csv')
  team_stats_avg_df = team_stats_df.groupby(['Year', 'Team']).mean().reset_index()


Unnamed: 0,Year,Team,game_id,MP,FG,FGA,FGp,3P,3PA,3Pp,...,DRBp,TRBp,ASTp,STLp,BLKp,TOVp,USGp,ORtg,DRtg,Poss
0,2004,Atlanta Hawks,30406120000.0,242.743902,34.5,79.621951,0.433756,5.109756,15.231707,0.320012,...,70.785366,49.819512,58.062195,8.053659,6.054878,15.454878,100.0,101.426829,106.5,94.541951
1,2004,Boston Celtics,30406110000.0,240.304878,34.670732,78.231707,0.444085,6.743902,19.5,0.342012,...,69.65,47.723171,59.056098,9.704878,4.980488,15.37439,100.0,102.468293,104.095122,95.339024
2,2004,Brooklyn Nets,30405940000.0,240.304878,34.304878,77.707317,0.441793,4.597561,13.695122,0.333451,...,74.067073,49.914634,70.857317,9.336585,5.102439,13.782927,100.0,101.756098,99.047561,91.263902
3,2004,Chicago Bulls,30405880000.0,241.829268,34.121951,82.353659,0.416037,5.231707,15.317073,0.346622,...,71.573171,49.141463,63.947561,8.436585,5.934146,14.908537,100.0,97.037805,103.896341,95.523902
4,2004,Cleveland Cavaliers,30406030000.0,242.134146,35.634146,82.353659,0.43461,3.012195,9.585366,0.317622,...,73.223171,51.979268,61.8,7.559756,7.946341,13.710976,100.0,101.773171,104.717073,94.441463


In [15]:


import pandas as pd

# Load the CSV file
# Note: this rebinds `data`, the same name the Elo cell uses for the game
# results table, so that cell must reload its own file if it is re-run later.
file_path = 'team_stats_0423_cleaned.csv'
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the recorded output echoes this line as a warning source,
# presumably pandas' mixed-dtype DtypeWarning — confirm.
data = pd.read_csv(file_path, low_memory=False)

# Display the first few rows to understand the structure of the data
data.head()


  data = pd.read_csv(file_path)


Unnamed: 0,Year,game_id,Team,MP,FG,FGA,FGp,3P,3PA,3Pp,...,DRBp,TRBp,ASTp,STLp,BLKp,TOVp,USGp,ORtg,DRtg,Poss
0,2004,30401020000.0,New Orleans Pelicans,240,35,83,0.422,3,12,0.25,...,90.0,61.4,37.1,5.6,1.4,14.9,100,97.8,84.1,92.36
1,2004,30401020000.0,Toronto Raptors,240,26,69,0.377,3,17,0.176,...,65.1,38.6,65.4,7.6,10.8,15.1,100,84.1,97.8,88.68
2,2004,30401020000.0,Golden State Warriors,240,31,77,0.403,8,21,0.381,...,52.9,38.3,64.5,9.4,7.6,19.6,100,86.5,106.2,92.84
3,2004,30401020000.0,Washington Wizards,240,41,92,0.446,4,10,0.4,...,79.1,61.7,73.2,15.1,7.8,16.7,100,105.1,85.6,95.92
4,2004,30401020000.0,Indiana Pacers,240,38,87,0.437,8,24,0.333,...,85.3,58.0,55.3,11.0,7.5,14.0,100,113.4,99.0,96.0


In [17]:
#Ranking class
import pandas as pd

# Read the standings table
standings_df = pd.read_csv('NBA_Standings_Ranked_Updated.csv')

# Fail fast when the column the classes are derived from is absent
has_rank_column = 'Rank' in standings_df.columns
if not has_rank_column:
    raise ValueError("The 'Rank' column is missing from the dataset.")

# Bucket each rank into a named class. pd.cut intervals are right-closed by
# default, so the edges below give ranks 1-3, 4-7, 8-10 and 11-15; a rank
# outside (0, 15] becomes NaN.
rank_bin_edges = [0, 3, 7, 10, 15]
rank_class_labels = ['Top 3', 'Upper Mid', 'Lower Mid', 'Bottom']
standings_df['ranking_class'] = pd.cut(
    standings_df['Rank'],
    bins=rank_bin_edges,
    labels=rank_class_labels,
)

# Persist the table with the new column
standings_df.to_csv('NBA_Standings_Ranked_Classes.csv', index=False)

# Quick visual check of the first rows
print(standings_df.head())


   Rank                  Team   W   L   W/L%           Division  Year  \
0     1       Detroit Pistons  50  32  0.610   Central Division  2003   
1     2         Brooklyn Nets  49  33  0.598  Atlantic Division  2003   
2     3    Philadelphia 76ers  48  34  0.585  Atlantic Division  2003   
3     4        Indiana Pacers  48  34  0.585   Central Division  2003   
4     5  New Orleans Pelicans  47  35  0.573   Central Division  2003   

           Conference ranking_class  
0  Eastern Conference         Top 3  
1  Eastern Conference         Top 3  
2  Eastern Conference         Top 3  
3  Eastern Conference     Upper Mid  
4  Eastern Conference     Upper Mid  


In [10]:
#Four Factors
import pandas as pd

# Load the dataset (the code below treats it as one row per team per game).
# low_memory=False makes pandas infer each column's dtype from the whole file.
# NOTE(review): the recorded output echoes this line as a warning source,
# presumably pandas' mixed-dtype DtypeWarning — confirm.
df = pd.read_csv('team_stats_0423_sorted.csv', low_memory=False)

# Every column the computation below reads. Checking all of them up front and
# raising avoids a KeyError part-way through, and stops a later cell from
# running against a missing or stale `four_factors_df`.
required_columns = ['Year', 'game_id', 'Team', 'FG', 'FGA', '3P', 'FT', 'FTA', 'TOV', 'ORB', 'DRB']
missing_columns = [column for column in required_columns if column not in df.columns]
if missing_columns:
    raise ValueError(f"Required columns are not available: {missing_columns}")

# Create a DataFrame that maps each game_id to the opponent's DRB and includes the Year
opponent_df = (
    df[['Year', 'game_id', 'Team', 'DRB']]
    .rename(columns={'Team': 'Opponent', 'DRB': 'Opp_DRB'})
)

# Pair every row with all rows of the same game and year (including itself) ...
df_merged = df.merge(opponent_df, how='left', on=['game_id', 'Year'])

# ... then drop the self-pairs so each remaining row carries the other team's DRB.
# .copy() gives an independent frame, so the column assignments below are
# unambiguous writes rather than writes to a filtered selection.
df_merged = df_merged[df_merged['Team'] != df_merged['Opponent']].copy()

# Compute Four Factors
# Effective Field Goal Percentage: made threes count 1.5x
df_merged['eFG%'] = (df_merged['FG'] + 0.5 * df_merged['3P']) / df_merged['FGA']
# Turnover Percentage: turnovers per estimated play (FGA + 0.44 * FTA + TOV)
df_merged['TOV%'] = df_merged['TOV'] / (df_merged['FGA'] + 0.44 * df_merged['FTA'] + df_merged['TOV'])
# Offensive Rebounding Percentage: share of available offensive boards
df_merged['ORB%'] = df_merged['ORB'] / (df_merged['ORB'] + df_merged['Opp_DRB'])
# Free Throw Rate: made free throws per field-goal attempt
df_merged['FT_Rate'] = df_merged['FT'] / df_merged['FGA']

# Keep only the identifiers and the four new columns; a later cell averages
# this frame per team and year.
four_factors_df = df_merged[['Year', 'game_id', 'Team', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate']].copy()

# Display the first few rows of the new DataFrame to verify
print(four_factors_df.head())


  df = pd.read_csv('team_stats_0423_sorted.csv')


   Year       game_id                   Team      eFG%      TOV%      ORB%  \
1  2004  3.040102e+10   New Orleans Pelicans  0.439759  0.149031  0.348837   
2  2004  3.040102e+10        Toronto Raptors  0.398551  0.151057  0.100000   
5  2004  3.040102e+10  Golden State Warriors  0.454545  0.196386  0.209302   
6  2004  3.040102e+10     Washington Wizards  0.467391  0.166778  0.470588   
9  2004  3.040102e+10         Indiana Pacers  0.482759  0.140351  0.382979   

    FT_Rate  
1  0.156627  
2  0.275362  
5  0.116883  
6  0.119565  
9  0.218391  


In [11]:
import pandas as pd
import numpy as np

# Load the player stats dataset
player_stats_df = pd.read_csv('player_stats_0423_sorted.csv')

# Debug: Check the unique values in 'MP' before filtering
print("Unique values in 'MP' before filtering:")
print(player_stats_df['MP'].unique())

# Keep only rows where 'MP' is a clock value in mm:ss format; text entries such
# as "Did Not Play" are dropped.
# na=False makes a missing 'MP' count as "no match". Without it, str.contains
# returns NaN for those rows and the boolean indexing below raises, because a
# mask containing NaN is not allowed.
is_clock_value = player_stats_df['MP'].str.contains(r'^\d{1,2}:\d{2}$', na=False)
player_stats_df = player_stats_df[is_clock_value]

# Debug: Check the number of rows after filtering
print(f"Number of rows after filtering: {len(player_stats_df)}")

# Turn an 'mm:ss' playing-time string into minutes
def convert_mp_to_minutes(mp_str):
    """Return the minutes represented by an ``'mm:ss'`` string as a float.

    The text is split on ':' into exactly two integer fields; the seconds
    contribute ``seconds / 60``. Raises ValueError when the string does not
    have exactly two fields or a field is not an integer.
    """
    minutes_text, seconds_text = mp_str.split(':')
    whole_minutes = int(minutes_text)
    leftover_seconds = int(seconds_text)
    return whole_minutes + leftover_seconds / 60

# Replace the mm:ss strings in 'MP' with minutes as floats
player_stats_df['MP'] = player_stats_df['MP'].apply(convert_mp_to_minutes)

# Treat game ids as text so the digit-only check below can be applied
player_stats_df['game_id'] = player_stats_df['game_id'].astype(str)

# Keep only rows whose id consists purely of numeric characters, like '131410290001'.
# NOTE(review): if 'game_id' were ever parsed as float, astype(str) would append
# '.0' and every row would fail this check — confirm the column's dtype.
has_numeric_game_id = player_stats_df['game_id'].str.isnumeric()
player_stats_df = player_stats_df[has_numeric_game_id]

# 'Year' is taken from the existing column; it is not re-derived from game_id.

# Force the rating columns to numbers; unparseable entries become NaN
for rating_column in ('ORtg', 'DRtg'):
    player_stats_df[rating_column] = pd.to_numeric(player_stats_df[rating_column], errors='coerce')

# Assumed league-average ratings (placeholders, not computed from the data)
league_avg_ORtg = 110.0
league_avg_DRtg = 110.0

# Example value for team possessions; replace with actual data if available
team_possessions = 100

minutes_played = player_stats_df['MP']

# Offensive Win Shares (OWS): rating above league average, per possession, times minutes
offensive_margin = player_stats_df['ORtg'] - league_avg_ORtg
player_stats_df['OWS'] = (offensive_margin / team_possessions) * minutes_played

# Defensive Win Shares (DWS): rating below league average, per possession, times minutes
defensive_margin = league_avg_DRtg - player_stats_df['DRtg']
player_stats_df['DWS'] = (defensive_margin / team_possessions) * minutes_played

# Total Win Shares (WS) is the sum of the two components
player_stats_df['WS'] = player_stats_df['OWS'] + player_stats_df['DWS']

# Persist the table including the new columns
player_stats_df.to_csv('player_stats_with_win_shares.csv', index=False)

# Show a few rows as a sanity check
print(player_stats_df[['player', 'Team', 'Year', 'MP', 'OWS', 'DWS', 'WS']].head())


Unique values in 'MP' before filtering:
['40:55' 'Did Not Play' 'Not With Team' ... '50:08' '55:06' '50:54']
Number of rows after filtering: 502428
            player                  Team  Year         MP       OWS       DWS  \
0      Baron Davis  New Orleans Pelicans  2004  40.916667 -2.864167  8.592500   
4     Milt Palacio       Toronto Raptors  2004  10.316667  0.103167  1.650667   
5    Michael Curry       Toronto Raptors  2004  13.683333 -9.852000  1.094667   
6     Lonny Baxter       Toronto Raptors  2004  16.833333  8.585000  2.188333   
7  Morris Peterson       Toronto Raptors  2004  24.150000 -6.279000  1.449000   

          WS  
0   5.728333  
4   1.753833  
5  -8.757333  
6  10.773333  
7  -4.830000  


In [12]:
# Average Win Shares per player, team and year.
import pandas as pd

# Read back the per-game player table that already contains the 'WS' column
player_stats_df = pd.read_csv('player_stats_with_win_shares.csv')

# Mean WS for every (player, team, year) combination, with the value column
# renamed to make clear that it is an average
average_ws_per_year = (
    player_stats_df
    .groupby(['player', 'Team', 'Year'])['WS']
    .mean()
    .reset_index()
    .rename(columns={'WS': 'Average_WS'})
)

# Write the averages out
average_ws_per_year.to_csv('average_ws_per_year.csv', index=False)

# Show a few rows as a sanity check
print(average_ws_per_year.head())


         player                    Team  Year  Average_WS
0    A.J. Green         Milwaukee Bucks  2023   -0.415129
1  A.J. Hammons        Dallas Mavericks  2017   -2.891553
2   A.J. Lawson        Dallas Mavericks  2023   -1.001190
3   A.J. Lawson  Minnesota Timberwolves  2023   -2.120000
4    A.J. Price     Cleveland Cavaliers  2015   -3.501939


In [13]:
# Relies on `four_factors_df` (with 'Team' and 'Year' columns) already being
# defined in the kernel by the Four Factors cell.
# Average each of the four factors per team and year.
factor_columns = ['eFG%', 'TOV%', 'ORB%', 'FT_Rate']
mean_of_each_factor = {column: 'mean' for column in factor_columns}
grouped_by_team_year = four_factors_df.groupby(['Team', 'Year'])
team_year_avg_df = grouped_by_team_year.agg(mean_of_each_factor).reset_index()

# Write the averages out
team_year_avg_df.to_csv('team_year_avg_four_factors.csv', index=False)

# Show a few rows as a sanity check
print(team_year_avg_df.head())


            Team  Year      eFG%      TOV%      ORB%   FT_Rate
0  Atlanta Hawks  2004  0.465506  0.154548  0.285205  0.237572
1  Atlanta Hawks  2005  0.464171  0.148496  0.303345  0.214452
2  Atlanta Hawks  2006  0.487526  0.146833  0.312619  0.259164
3  Atlanta Hawks  2007  0.471418  0.151043  0.291178  0.267488
4  Atlanta Hawks  2008  0.484019  0.139508  0.296180  0.265539
