In [519]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sbn

# Load dataframes
awards_players = pd.read_csv('../data/awards_players.csv')
coaches = pd.read_csv('../data/coaches.csv')
players_teams = pd.read_csv('../data/players_teams.csv')
players = pd.read_csv('../data/players.csv')
series_post = pd.read_csv('../data/series_post.csv')
teams_post = pd.read_csv('../data/teams_post.csv')
teams = pd.read_csv('../data/teams.csv')

In [520]:
# Cleaning Awards Players
# Drop lgID column since all values are 'WNBA'
awards_players.drop('lgID', axis=1, inplace=True)


#Add the missing "award" value for line 30
awards_players.at[28, 'award'] = "Kim Perrot Sportsmanship Award" 
awards_players.head(10)

Unnamed: 0,playerID,award,year
0,thompti01w,All-Star Game Most Valuable Player,1
1,leslili01w,All-Star Game Most Valuable Player,2
2,leslili01w,All-Star Game Most Valuable Player,3
3,teaslni01w,All-Star Game Most Valuable Player,4
4,swoopsh01w,All-Star Game Most Valuable Player,6
5,douglka01w,All-Star Game Most Valuable Player,7
6,fordch01w,All-Star Game Most Valuable Player,8
7,cashsw01w,All-Star Game Most Valuable Player,10
8,coopemi01w,Coach of the Year,1
9,hugheda99w,Coach of the Year,2


In [521]:
# We can drop 'lgID', since all values are 'WNBA'
players_teams.drop('lgID', axis=1, inplace=True)

In [522]:
# We can drop 'lgID', since all values are 'WNBA'
teams_post.drop('lgID', axis=1, inplace=True)

In [523]:
# We can drop 'lgIDWinner and 'lgIDLoser', since all values are 'WNBA'
series_post.drop(['lgIDWinner', 'lgIDLoser'], axis=1, inplace=True)

In [524]:
# We can drop 'lgID', since all values are 'WNBA'
coaches.drop('lgID', axis=1, inplace=True)

# Rename the column 'stint' to 'stint_coach' to match the other dataframes
coaches.rename(columns={'stint': 'stint_coach'}, inplace=True)


In [525]:
# We can drop 'firstseason', since all values are '0'
players.drop('firstseason', axis=1, inplace=True)
# We can drop 'lastseason', since all values are '0'
players.drop('lastseason', axis=1, inplace=True)


# Rename the column 'bioID' to 'playerID' to match the other dataframes
players.rename(columns={'bioID': 'playerID'}, inplace=True)

# Remove players that are coaches
for index, row in players.iterrows():
    # if playerID is in coaches, remove it
    if row['playerID'] in coaches['coachID'].values:
        players.drop(index, inplace=True)

In [526]:
# We can drop 'lgID', since all values are 'WNBA'
teams.drop('lgID', axis=1, inplace=True)
# We can drop 'franchID', since values are the same as 'teamID'
teams.drop('franchID', axis=1, inplace=True)
# We can drop 'divID', since all values are null
teams.drop('divID', axis=1, inplace=True)
# We can drop 'seeded', since all values are 0
teams.drop('seeded', axis=1, inplace=True)
# We don't believe that the 'name' attribute is relevant, so we drop it
teams.drop('name', axis=1, inplace=True)
# All of these attributes are always 0, se we can simply drop them
teams.drop(['tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'], axis=1, inplace=True)
# We don't believe that the 'min' attribute is relevant, so we drop it
teams.drop('min', axis=1, inplace=True)
# We don't believe that the 'attend' attribute is relevant, so we drop it
teams.drop('attend', axis=1, inplace=True)
# We don't believe that the 'arena' attribute is relevant, so we drop it
teams.drop('arena', axis=1, inplace=True)

In [527]:
# merge the teams and players_teams dataframes
merged = pd.merge(players_teams, teams, on=[ 'year', 'tmID'])

# merge the players_teams and players dataframes
merged = pd.merge(merged, players, on='playerID')

# merge the players_teams and coaches dataframes, where year is the same
merged = pd.merge(merged, coaches, on=['tmID', 'year'])

# merge the merged and teams_post dataframes, where year is the same
merged = pd.merge(merged, teams_post, on=['tmID', 'year'])

# merge the merged and series_post dataframes, where year is the same
merged = pd.merge(merged, series_post, on='year')

# merge the merged and awards_players dataframes, where year is the same
merged = pd.merge(merged, awards_players, on=['playerID', 'year'])
merged = pd.merge(merged, awards_players, left_on=['coachID', 'year'], right_on=['playerID', 'year'], how='left', suffixes=('', '_coach'))

merged

Unnamed: 0,playerID,year,stint,tmID,GP_x,GS,minutes,points,oRebounds,dRebounds,...,L_x,round,series,tmIDWinner,tmIDLoser,W_y,L_y,award,playerID_coach,award_coach
0,arcaija01w,2,0,HOU,32,32,1154,591,49,87,...,2,FR,A,CHA,CLE,2,1,Most Improved Player,,
1,arcaija01w,2,0,HOU,32,32,1154,591,49,87,...,2,FR,B,NYL,MIA,2,1,Most Improved Player,,
2,arcaija01w,2,0,HOU,32,32,1154,591,49,87,...,2,FR,C,LAS,HOU,2,0,Most Improved Player,,
3,arcaija01w,2,0,HOU,32,32,1154,591,49,87,...,2,FR,D,SAC,UTA,2,0,Most Improved Player,,
4,arcaija01w,2,0,HOU,32,32,1154,591,49,87,...,2,CF,E,CHA,NYL,2,1,Most Improved Player,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,wickssu01w,2,0,NYL,30,3,602,157,36,102,...,3,FR,C,LAS,HOU,2,0,Kim Perrot Sportsmanship Award,,
507,wickssu01w,2,0,NYL,30,3,602,157,36,102,...,3,FR,D,SAC,UTA,2,0,Kim Perrot Sportsmanship Award,,
508,wickssu01w,2,0,NYL,30,3,602,157,36,102,...,3,CF,E,CHA,NYL,2,1,Kim Perrot Sportsmanship Award,,
509,wickssu01w,2,0,NYL,30,3,602,157,36,102,...,3,CF,F,LAS,SAC,2,1,Kim Perrot Sportsmanship Award,,


In [530]:
# Export the cleaned dataframes to CSV files
merged.to_csv("../data/clean/merged.csv", index=False)