In [23]:
import pandas as pd

# Load data
df = pd.read_csv("NBA_Standings.csv")

# Function to safely convert columns to numeric after checking if they are strings
def convert_to_numeric(column):
    if df[column].dtype == 'object':
        df[column] = pd.to_numeric(df[column].str.replace('%', '').str.strip(), errors='coerce')
    else:
        df[column] = pd.to_numeric(df[column], errors='coerce')

convert_to_numeric('W/L%')

df = df.sort_values(by=['Year', 'Conference', 'W/L%'], ascending=[True, True, False])
df['Rank'] = df.groupby(['Year', 'Conference'])['W/L%'].rank(method='first', ascending=False).astype(int)
df = df[['Rank'] + [col for col in df.columns if col != 'Rank']]
df.to_csv("NBA_Standings_Ranked.csv", index=False)
print(df.head())


   Rank                  Team   W   L   W/L%   GB  PS/G  PA/G   SRS  \
7     1      Detroit Pistons*  50  32  0.610    —  91.4  87.7  2.97   
0     2      New Jersey Nets*  49  33  0.598    —  95.4  90.1  4.42   
1     3   Philadelphia 76ers*  48  34  0.585  1.0  96.8  94.5  1.76   
8     4       Indiana Pacers*  48  34  0.585  2.0  96.8  93.3  2.79   
9     5  New Orleans Hornets*  47  35  0.573  3.0  93.9  91.8  1.52   

            Division  Year          Conference  
7   Central Division  2003  Eastern Conference  
0  Atlantic Division  2003  Eastern Conference  
1  Atlantic Division  2003  Eastern Conference  
8   Central Division  2003  Eastern Conference  
9   Central Division  2003  Eastern Conference  


In [24]:
#cleaning the Team names and removing *
import pandas as pd

# Load the NBA standings data
df = pd.read_csv("NBA_Standings_Ranked.csv")
df['Team'] = df['Team'].str.replace('*', '', regex=False).str.strip()
df.to_csv("NBA_Standings_Ranked.csv", index=False)
print(df.head())


   Rank                 Team   W   L   W/L%   GB  PS/G  PA/G   SRS  \
0     1      Detroit Pistons  50  32  0.610    —  91.4  87.7  2.97   
1     2      New Jersey Nets  49  33  0.598    —  95.4  90.1  4.42   
2     3   Philadelphia 76ers  48  34  0.585  1.0  96.8  94.5  1.76   
3     4       Indiana Pacers  48  34  0.585  2.0  96.8  93.3  2.79   
4     5  New Orleans Hornets  47  35  0.573  3.0  93.9  91.8  1.52   

            Division  Year          Conference  
0   Central Division  2003  Eastern Conference  
1  Atlantic Division  2003  Eastern Conference  
2  Atlantic Division  2003  Eastern Conference  
3   Central Division  2003  Eastern Conference  
4   Central Division  2003  Eastern Conference  


In [25]:
#looking for teams that are not updated.
import pandas as pd

# Load the NBA standings data
df = pd.read_csv("NBA_Standings_Ranked.csv")

# Remove '*' from the Team names
df['Team'] = df['Team'].str.replace('*', '', regex=False).str.strip()

# List of actual teams
actual_teams = [
    "Boston Celtics", "New York Knicks", "Philadelphia 76ers", "Brooklyn Nets", "Toronto Raptors",
    "Milwaukee Bucks", "Cleveland Cavaliers", "Indiana Pacers", "Chicago Bulls", "Detroit Pistons",
    "Orlando Magic", "Miami Heat", "Atlanta Hawks", "Charlotte Hornets", "Washington Wizards",
    "Oklahoma City Thunder", "Denver Nuggets", "Minnesota Timberwolves", "Utah Jazz", "Portland Trail Blazers",
    "Los Angeles Clippers", "Phoenix Suns", "Los Angeles Lakers", "Sacramento Kings", "Golden State Warriors",
    "Dallas Mavericks", "New Orleans Pelicans", "Houston Rockets", "Memphis Grizzlies", "San Antonio Spurs"
]

# Find any teams that are not in the list of actual teams
unknown_teams = df[~df['Team'].isin(actual_teams)]['Team'].unique()

# Display the unknown teams
print("Teams not listed in the provided actual teams list:")
print(unknown_teams)


Teams not listed in the provided actual teams list:
['New Jersey Nets' 'New Orleans Hornets' 'Seattle SuperSonics'
 'Charlotte Bobcats' 'New Orleans/Oklahoma City Hornets']


In [28]:
#Removing Unnecessary columns 

import pandas as pd

# Load the NBA standings data
df = pd.read_csv("NBA_Standings_Ranked_Updated.csv")

# Remove '*' from the Team names
df['Team'] = df['Team'].str.replace('*', '', regex=False).str.strip()

# Dictionary mapping old team names to new team names
team_name_changes = {
    "New Jersey Nets": "Brooklyn Nets",
    "New Orleans Hornets": "New Orleans Pelicans",
    "Seattle SuperSonics": "Oklahoma City Thunder",
    "Charlotte Bobcats": "Charlotte Hornets",
    "New Orleans/Oklahoma City Hornets": "New Orleans Pelicans"
}

# Apply the team name changes
df['Team'] = df['Team'].replace(team_name_changes)

# Remove unnecessary columns
columns_to_remove = ['GB', 'PS/G', 'PA/G', 'SRS']
df = df.drop(columns=columns_to_remove)

# Save the updated DataFrame to a new CSV file
df.to_csv("NBA_Standings_Ranked_Updated.csv", index=False)

# Display the first few rows of the updated DataFrame
print(df.head())


   Rank                  Team   W   L   W/L%           Division  Year  \
0     1       Detroit Pistons  50  32  0.610   Central Division  2003   
1     2         Brooklyn Nets  49  33  0.598  Atlantic Division  2003   
2     3    Philadelphia 76ers  48  34  0.585  Atlantic Division  2003   
3     4        Indiana Pacers  48  34  0.585   Central Division  2003   
4     5  New Orleans Pelicans  47  35  0.573   Central Division  2003   

           Conference  
0  Eastern Conference  
1  Eastern Conference  
2  Eastern Conference  
3  Eastern Conference  
4  Eastern Conference  


In [1]:
# Dictionary to map abbreviations to full team names
team_mapping = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHO': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards'
}

team_mapping_df = pd.DataFrame(list(team_mapping.items()), columns=['Abbreviation', 'Full_Name'])
team_mapping_df.to_csv("team_mapping_list.csv", index=False)
print("Team mapping saved to team_mapping_list.csv:")
print(team_mapping_df)


Team mapping saved to team_mapping_list.csv:
   Abbreviation               Full_Name
0           ATL           Atlanta Hawks
1           BOS          Boston Celtics
2           BRK           Brooklyn Nets
3           CHO       Charlotte Hornets
4           CHI           Chicago Bulls
5           CLE     Cleveland Cavaliers
6           DAL        Dallas Mavericks
7           DEN          Denver Nuggets
8           DET         Detroit Pistons
9           GSW   Golden State Warriors
10          HOU         Houston Rockets
11          IND          Indiana Pacers
12          LAC    Los Angeles Clippers
13          LAL      Los Angeles Lakers
14          MEM       Memphis Grizzlies
15          MIA              Miami Heat
16          MIL         Milwaukee Bucks
17          MIN  Minnesota Timberwolves
18          NOP    New Orleans Pelicans
19          NYK         New York Knicks
20          OKC   Oklahoma City Thunder
21          ORL           Orlando Magic
22          PHI      Philadelphia 7

# Cleaning game_info,  team_stats, and player stats

In [6]:
#adding Year column

# Load dataset
game_info_df = pd.read_csv('game_info_0423.csv')

# Function to extract the year from the 'season' column
def extract_year_from_season(season):
    year_str = str(season)[-2:] 
    year = int(year_str)
    return 2000 + year

# Apply the function to create a new 'Year' column
game_info_df['Year'] = game_info_df['season'].apply(extract_year_from_season)

# Move the 'Year' column to the first position
columns_order = ['Year'] + [col for col in game_info_df.columns if col != 'Year']
game_info_df = game_info_df[columns_order]
game_info_df.to_csv('game_info_0423_with_year.csv', index=False)
print(game_info_df.head())


   Year       game_id  season        date away_team  away_score home_team  \
0  2014  131410290001    1314  2013-10-29       ORL          87       IND   
1  2014  131410290002    1314  2013-10-29       CHI          95       MIA   
2  2014  131410290003    1314  2013-10-29       LAC         103       LAL   
3  2014  131410300004    1314  2013-10-30       BRK          94       CLE   
4  2014  131410300005    1314  2013-10-30       BOS          87       TOR   

   home_score  result  
0          97       1  
1         107       1  
2         116       1  
3          98       1  
4          93       1  


In [7]:

# Load datasets
game_info_df = pd.read_csv('game_info_0423.csv')
player_stat_df = pd.read_csv('player_stats_0423.csv')

# Function to extract the year from the 'season' column in game_info_df
def extract_year_from_season(season):
    year_str = str(season)[-2:] 
    year = int(year_str)
    return 2000 + year

# Apply the function to create a 'Year' column in game_info_df
game_info_df['Year'] = game_info_df['season'].apply(extract_year_from_season)

# Merge the Year into player_stat_df based on matching 'game_id'
player_stat_df = player_stat_df.merge(game_info_df[['game_id', 'Year']], on='game_id', how='left')

# Move the 'Year' column to the first position
columns_order = ['Year'] + [col for col in player_stat_df.columns if col != 'Year']
player_stat_df = player_stat_df[columns_order]
player_stat_df.to_csv('player_stats_0423_with_year.csv', index=False)
print(player_stat_df.head())


   Year       game_id            player team     MP   FG   FGA    FGp   3P  \
0  2014  131410290001     Arron Afflalo  ORL  32:59  3.0  14.0  0.214  1.0   
1  2014  131410290001    Nikola Vučević  ORL  30:39  4.0  11.0  0.364  0.0   
2  2014  131410290001     Jameer Nelson  ORL  30:34  4.0  13.0  0.308  3.0   
3  2014  131410290001     Jason Maxiell  ORL  26:19  0.0   5.0  0.000  0.0   
4  2014  131410290001  Maurice Harkless  ORL  23:30  6.0  13.0  0.462  2.0   

   3PA  ...  TS%  eFG%  ORB%  DRB%  TRB%  AST%  STL%  BLK%  TOV%  USG%  
0  5.0  ...  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
1  0.0  ...  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2  7.0  ...  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3  0.0  ...  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4  2.0  ...  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  

[5 rows x 55 columns]


In [22]:
#checking for duplicate
import pandas as pd

# Load the dataset
game_info_df = pd.read_csv('game_info_0423_with_year.csv')

# Check for duplicates across all columns
duplicates = game_info_df[game_info_df.duplicated(keep=False)]

# Display the duplicate rows
if not duplicates.empty:
    print("Duplicate rows found:")
    print(duplicates)
else:
    print("No duplicate rows found.")



Duplicate rows found:
       Year       game_id  season        date away_team  away_score home_team  \
20645  2011  101110260001    1011  2010-10-26       MIA          80       BOS   
20646  2011  101110260002    1011  2010-10-26       PHO          92       POR   
20647  2011  101110260003    1011  2010-10-26       HOU         110       LAL   
20648  2011  101110270004    1011  2010-10-27       BOS          87       CLE   
20649  2011  101110270005    1011  2010-10-27       NYK          98       TOR   
20650  2011  101110270006    1011  2010-10-27       MIA          97       PHI   
20651  2011  101110270007    1011  2010-10-27       DET          98       NJN   
20652  2011  101110270008    1011  2010-10-27       CHI          95       OKC   
20653  2011  101110270009    1011  2010-10-27       MIL          91       NOH   
20654  2011  101110270010    1011  2010-10-27       SAC         117       MIN   
20655  2011  101110270011    1011  2010-10-27       ATL         119       MEM   
20656 

In [8]:
#remove duplicates
import pandas as pd

# Load the dataset
game_info_df = pd.read_csv('game_info_0423_with_year.csv')
player_stats_df = pd.read_csv('player_stats_0423_with_year.csv')
team_stats_df = pd.read_csv('team_stats_0423.csv')

# Remove duplicate rows
game_info_df_cleaned = game_info_df.drop_duplicates()
player_stats_df_cleaned = player_stats_df.drop_duplicates()
team_stats_df_cleaned = team_stats_df.drop_duplicates()

# Save the cleaned dataset
game_info_df_cleaned.to_csv('game_info_0423_cleaned.csv', index=False)
player_stats_df_cleaned.to_csv('player_stats_0423_cleaned.csv', index=False)
team_stats_df_cleaned.to_csv('team_stats_0423_cleaned.csv', index=False)


  team_stats_df = pd.read_csv('team_stats_0423.csv')


In [24]:
#checking for duplicate again
import pandas as pd

# Load the dataset
game_duplicate = pd.read_csv('game_info_0423_cleaned.csv')
player_duplicate = pd.read_csv('player_stats_0423_cleaned.csv')
team_duplicate = pd.read_csv('team_stats_0423_cleaned.csv')

# Check for duplicates across all columns
Gdup = game_duplicate[game_duplicate.duplicated(keep=False)]
Pdup = player_duplicate[player_duplicate.duplicated(keep=False)]
Tdup=  player_duplicate[player_duplicate.duplicated(keep=False)]

# Display the duplicate rows
if not Gdup.empty:
    print("Duplicate rows found:")
    print(duplicates)
else:
    print("No duplicate rows found.")

# Display the duplicate rows
if not Pdup.empty:
    print("Duplicate rows found:")
    print(duplicates)
else:
    print("No duplicate rows found.")

# Display the duplicate rows
if not Tdup.empty:
    print("Duplicate rows found:")
    print(duplicates)
else:
    print("No duplicate rows found.")

  team_duplicate = pd.read_csv('team_stats_0423_cleaned.csv')


No duplicate rows found.
No duplicate rows found.
No duplicate rows found.


In [10]:
player_info_df_sorted = pd.read_csv('player_stats_0423_cleaned.csv')
# Rename the 'team' column to 'Team'
player_info_df_sorted.rename(columns={'team': 'Team'}, inplace=True)

# Save the updated dataset with the new header
player_info_df_sorted.to_csv('player_stats_0423_cleaned.csv', index=False)


In [11]:
#Updated the teams.
# Define the mapping dictionary
team_name_mapping = {
    "NJN": "BRK",
    "NOH": "NOP",
    "SEA": "OKC",
    "CHA": "CHO",
    "NOK": "NOP"
}

# Load the datasets
player_stats_df = pd.read_csv('player_stats_0423_cleaned.csv')
team_stats_df = pd.read_csv('team_stats_0423_cleaned.csv')
game_info_df = pd.read_csv('game_info_0423_cleaned.csv')

# Update team names in player_stats_0423_cleaned.csv
if 'Team' in player_stats_df.columns:
    player_stats_df['Team'] = player_stats_df['Team'].replace(team_name_mapping)

# Update team names in team_stats_0423_cleaned.csv
if 'Team' in team_stats_df.columns:
    team_stats_df['Team'] = team_stats_df['Team'].replace(team_name_mapping)

# Update team names in game_info_0423_cleaned.csv for home_team and away_team
if 'home_team' in game_info_df.columns:
    game_info_df['home_team'] = game_info_df['home_team'].replace(team_name_mapping)
if 'away_team' in game_info_df.columns:
    game_info_df['away_team'] = game_info_df['away_team'].replace(team_name_mapping)

# Save the updated datasets back to the CSV files
player_stats_df.to_csv('player_stats_0423_cleaned.csv', index=False)
team_stats_df.to_csv('team_stats_0423_cleaned.csv', index=False)
game_info_df.to_csv('game_info_0423_cleaned.csv', index=False)

print("Team names updated successfully.")


  team_stats_df = pd.read_csv('team_stats_0423_cleaned.csv')


Team names updated successfully.


In [12]:
#mapping to the complete team name
import pandas as pd

# Load the team mapping file
team_mapping_df = pd.read_csv("team_mapping_list.csv")

# Convert the mapping DataFrame into a dictionary for easier lookup
team_mapping_dict = dict(zip(team_mapping_df['Abbreviation'], team_mapping_df['Full_Name']))

# Load the datasets that need to be updated
player_stats_df = pd.read_csv('player_stats_0423_cleaned.csv')
team_stats_df = pd.read_csv('team_stats_0423_cleaned.csv')
game_info_df = pd.read_csv('game_info_0423_cleaned.csv')

# Update the 'Team' column in player_stats_0423_cleaned.csv
if 'Team' in player_stats_df.columns:
    player_stats_df['Team'] = player_stats_df['Team'].replace(team_mapping_dict)

# Update the 'Team' column in team_stats_0423_cleaned.csv
if 'Team' in team_stats_df.columns:
    team_stats_df['Team'] = team_stats_df['Team'].replace(team_mapping_dict)

# Update the 'home_team' and 'away_team' columns in game_info_0423_cleaned.csv
if 'home_team' in game_info_df.columns:
    game_info_df['home_team'] = game_info_df['home_team'].replace(team_mapping_dict)
if 'away_team' in game_info_df.columns:
    game_info_df['away_team'] = game_info_df['away_team'].replace(team_mapping_dict)

# Save the updated datasets back to their respective CSV files
player_stats_df.to_csv('player_stats_0423_cleaned.csv', index=False)
team_stats_df.to_csv('team_stats_0423_cleaned.csv', index=False)
game_info_df.to_csv('game_info_0423_cleaned.csv', index=False)

print("Team names updated successfully using the full names from team_mapping_list.csv.")


  team_stats_df = pd.read_csv('team_stats_0423_cleaned.csv')


Team names updated successfully using the full names from team_mapping_list.csv.


# Checking Data

In [13]:
player_stats_df.head()

Unnamed: 0,Year,game_id,player,Team,MP,FG,FGA,FGp,3P,3PA,...,TS%,eFG%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%
0,2014,131410290001,Arron Afflalo,Orlando Magic,32:59,3.0,14.0,0.214,1.0,5.0,...,,,,,,,,,,
1,2014,131410290001,Nikola Vučević,Orlando Magic,30:39,4.0,11.0,0.364,0.0,0.0,...,,,,,,,,,,
2,2014,131410290001,Jameer Nelson,Orlando Magic,30:34,4.0,13.0,0.308,3.0,7.0,...,,,,,,,,,,
3,2014,131410290001,Jason Maxiell,Orlando Magic,26:19,0.0,5.0,0.0,0.0,0.0,...,,,,,,,,,,
4,2014,131410290001,Maurice Harkless,Orlando Magic,23:30,6.0,13.0,0.462,2.0,2.0,...,,,,,,,,,,


In [14]:
team_stats_df.head()

Unnamed: 0,Year,game_id,Team,MP,FG,FGA,FGp,3P,3PA,3Pp,...,DRBp,TRBp,ASTp,STLp,BLKp,TOVp,USGp,ORtg,DRtg,Poss
0,2014,131410000000.0,Orlando Magic,240,36,93,0.387,9,19,0.474,...,72.2,47.0,47.2,10.5,8.5,14.9,100,91.6,102.1,101.4
1,2014,131410000000.0,Indiana Pacers,240,34,71,0.479,7,17,0.412,...,72.3,53.0,50.0,3.9,19.4,19.0,100,102.1,91.6,95.08
2,2014,131410000000.0,Chicago Bulls,240,35,83,0.422,7,26,0.269,...,85.7,50.6,65.7,11.3,5.6,16.2,100,97.9,110.2,100.12
3,2014,131410000000.0,Miami Heat,240,37,72,0.514,11,20,0.55,...,76.1,49.4,70.3,10.0,8.4,17.5,100,110.2,97.9,97.76
4,2014,131410000000.0,Los Angeles Clippers,240,41,83,0.494,8,21,0.381,...,62.5,43.5,65.9,10.3,4.3,14.7,100,102.6,115.6,99.12


In [15]:
game_info_df.head()

Unnamed: 0,Year,game_id,season,date,away_team,away_score,home_team,home_score,result
0,2014,131410290001,1314,2013-10-29,Orlando Magic,87,Indiana Pacers,97,1
1,2014,131410290002,1314,2013-10-29,Chicago Bulls,95,Miami Heat,107,1
2,2014,131410290003,1314,2013-10-29,Los Angeles Clippers,103,Los Angeles Lakers,116,1
3,2014,131410300004,1314,2013-10-30,Brooklyn Nets,94,Cleveland Cavaliers,98,1
4,2014,131410300005,1314,2013-10-30,Boston Celtics,87,Toronto Raptors,93,1


In [16]:
import pandas as pd

# Load the player_stats_0423_cleaned.csv dataset
player_stats_df = pd.read_csv('player_stats_0423_cleaned.csv')

# List of corresponding % and p columns
columns_map = {
    'FG%': 'FGp',
    '3P%': '3Pp',
    'FT%': 'FTp',
    'TS%': 'TSp',
    'eFG%': 'eFGp',
    'ORB%': 'ORBp',
    'DRB%': 'DRBp',
    'TRB%': 'TRBp',
    'AST%': 'ASTp',
    'STL%': 'STLp',
    'BLK%': 'BLKp',
    'TOV%': 'TOVp',
    'USG%': 'USGp'
}

# Move values from % columns to p columns where p columns are NaN
for perc_col, p_col in columns_map.items():
    player_stats_df[p_col] = player_stats_df[p_col].combine_first(player_stats_df[perc_col])

# Drop the % columns after moving the data
player_stats_df.drop(columns=list(columns_map.keys()), inplace=True)

# Save the updated dataset to a new file
player_stats_df.to_csv('player_stats_0423_cleaned_updated.csv', index=False)


In [17]:
# Display the first few rows to verify
player_stats_df.head()

Unnamed: 0,Year,game_id,player,Team,MP,FG,FGA,FGp,3P,3PA,...,STLp,BLKp,TOVp,USGp,ORtg,DRtg,BPM,PIE,GmSc,+/-
0,2014,131410290001,Arron Afflalo,Orlando Magic,32:59,3.0,14.0,0.214,1.0,5.0,...,0.0,0.0,6.1,20.8,69.0,111.0,-9.9,-1.1,,
1,2014,131410290001,Nikola Vučević,Orlando Magic,30:39,4.0,11.0,0.364,0.0,0.0,...,3.3,2.9,31.3,21.9,70.0,98.0,-5.4,3.8,,
2,2014,131410290001,Jameer Nelson,Orlando Magic,30:34,4.0,13.0,0.308,3.0,7.0,...,3.3,0.0,13.0,21.2,102.0,101.0,5.9,10.3,,
3,2014,131410290001,Jason Maxiell,Orlando Magic,26:19,0.0,5.0,0.0,0.0,0.0,...,1.9,6.8,28.6,11.2,8.0,99.0,-10.6,-1.9,,
4,2014,131410290001,Maurice Harkless,Orlando Magic,23:30,6.0,13.0,0.462,2.0,2.0,...,2.2,0.0,0.0,24.0,111.0,107.0,2.4,2.7,,


In [18]:
import pandas as pd

player_stats_df.to_csv('player_stats_0423_cleaned_updated.csv', index=False)
team_stats_df.to_csv('team_stats_0423_cleaned.csv', index=False)
game_info_df.to_csv('game_info_0423_cleaned.csv', index=False)


# Load the CSV files into DataFrames
nba_standings_df = pd.read_csv("NBA_Standings_Ranked_Updated.csv")
team_stats_df = pd.read_csv("team_stats_0423_cleaned.csv")
player_stats_df = pd.read_csv("player_stats_0423_cleaned.csv")
game_info_df = pd.read_csv("game_info_0423_cleaned.csv")

# Display the data types of each DataFrame
print("Data types for NBA_Standings_Ranked_Updated.csv:")
print(nba_standings_df.dtypes)
print("\n")

print("Data types for team:")
print(team_stats_df.dtypes)
print("\n")

print("Data types for player")
print(player_stats_df.dtypes)
print("\n")

print("Data types for game")
print(game_info_df.dtypes)


  team_stats_df = pd.read_csv("team_stats_0423_cleaned.csv")


Data types for NBA_Standings_Ranked_Updated.csv:
Rank            int64
Team           object
W               int64
L               int64
W/L%          float64
Division       object
Year            int64
Conference     object
dtype: object


Data types for team:
Year         int64
game_id    float64
Team        object
MP           int64
FG           int64
FGA          int64
FGp        float64
3P           int64
3PA          int64
3Pp        float64
FT           int64
FTA          int64
FTp        float64
ORB          int64
DRB          int64
TRB          int64
AST          int64
STL          int64
BLK          int64
TOV          int64
PF           int64
PTS          int64
PM         float64
TSp        float64
eFGp        object
3PAr       float64
FTr        float64
ORBp       float64
DRBp       float64
TRBp       float64
ASTp       float64
STLp       float64
BLKp       float64
TOVp       float64
USGp         int64
ORtg       float64
DRtg       float64
Poss       float64
dtype: object




In [20]:
#arrange the sequence of the games.
import pandas as pd

# Load the cleaned dataset
game_info_df = pd.read_csv('game_info_0423_cleaned.csv')
player_stats_df = pd.read_csv('player_stats_0423_cleaned_updated.csv')
team_stats_df = pd.read_csv('team_stats_0423_cleaned.csv')

# Sort the data by game_id in ascending order
game_info_df_sorted = game_info_df.sort_values(by='game_id')
player_info_df_sorted = player_stats_df.sort_values(by='game_id')
team_info_df_sorted = team_stats_df.sort_values(by='game_id')

# Save the sorted dataset
game_info_df_sorted.to_csv('game_info_0423_sorted.csv', index=False)
player_info_df_sorted.to_csv('player_stats_0423_sorted.csv', index=False)
team_info_df_sorted.to_csv('team_stats_0423_sorted.csv', index=False)


  team_stats_df = pd.read_csv('team_stats_0423_cleaned.csv')


In [22]:
# Display the first few rows to verify
player_info_df_sorted.head()

Unnamed: 0,Year,game_id,player,Team,MP,FG,FGA,FGp,3P,3PA,...,STLp,BLKp,TOVp,USGp,ORtg,DRtg,BPM,PIE,GmSc,+/-
316503,2004,30401020451,Baron Davis,New Orleans Pelicans,40:55,10.0,25.0,0.4,2.0,6.0,...,0.0,0.0,3.7,29.8,103.0,89.0,6.8,11.7,12.8,12.0
316526,2004,30401020451,Jérôme Moïso,Toronto Raptors,Did Not Play,,,,,,...,,,,,,,,,,
316525,2004,30401020451,Roger Mason,Toronto Raptors,Did Not Play,,,,,,...,,,,,,,,,,
316524,2004,30401020451,Robert Archibald,Toronto Raptors,Not With Team,,,,,,...,,,,,,,,,,
316523,2004,30401020451,Milt Palacio,Toronto Raptors,10:19,1.0,3.0,0.333,0.0,0.0,...,5.0,0.0,17.4,27.1,111.0,94.0,6.6,4.5,4.9,-4.0
