In [None]:
# source: https://www.manning.com/liveproject/how-to-think-about-manipulating-data

import pandas as pd

# 1. Reading in and Cleaning the Data
url = "https://en.wikipedia.org/wiki/NBA_All-Star_Game"
# pd.read_html returns a list of DataFrames parsed from the page.
table = pd.read_html(url, header=0, index_col=0, parse_dates=True)
# One value per output line, in this order (values in parentheses are what
# the author recorded when the notebook was written):
#   type(table)     (<class 'list'>)
#   len(table)      (16)
#   type(table[2])  (<class 'pandas.core.frame.DataFrame'>)
#   table[2].shape  ((74, 4))
print(type(table), len(table), type(table[2]), table[2].shape, sep="\n")
# Keep only the rows of the third table that have no empty cell. When the
# notebook was written this left 1951 - 2021: the 2022, 2023 and 2024 rows
# contained empty cells and were dropped.
df = table[2].dropna(how="any")
print(df.shape)  # recorded as (71, 4)
df

In [None]:
# 2. Keeping Only Data of Interest
def _parse_conference_score(line, conference):
    """Return the integer score that follows ``conference`` in ``line``.

    ``line`` is expected to look like ``"East 111, West 110"``: comma-separated
    segments, each made of a conference name followed by its score. The name
    is compared case-insensitively against ``conference`` (given in lower case),
    and every segment is searched, in order.

    Returns:
        The score as an ``int``, or ``None`` when ``line`` is not a string or
        when no segment starts with ``conference`` followed by a second token
        (e.g. text without a comma, an empty string, or other team names).

    Raises:
        ValueError: if the token after the conference name is not an integer.
    """
    if not isinstance(line, str):
        # Missing cells (None / NaN) have no score to extract.
        return None
    for segment in line.split(","):
        tokens = segment.split()
        # Need both the name and the score token; empty or one-word segments
        # are skipped instead of raising IndexError.
        if len(tokens) >= 2 and tokens[0].lower() == conference:
            return int(tokens[1])
    return None


def parse_east(line):
    """Return the East score found in a result string, or ``None`` if absent."""
    return _parse_conference_score(line, "east")


def parse_west(line):
    """Return the West score found in a result string, or ``None`` if absent."""
    return _parse_conference_score(line, "west")

# Turn the free-text "Result" column into one numeric score per conference.
# Cells for which the parser returns None end up as missing values.
df_east = df.loc[:, "Result"].apply(parse_east)  # pick a column, transform every cell
df_west = df.loc[:, "Result"].apply(parse_west)

# Drop the columns the analysis does not use. `drop` returns a new frame, so
# the object `df` was bound to before this statement is not modified.
df = df.drop(columns=["Result", "Host arena", "Game MVP"])
df.insert(0, "West", df_west)  # add a column
df.insert(0, "East", df_east)

# NOTE(review): an earlier, disabled step stripped everything from "[" onward
# in the index labels: df.rename(index=lambda s: s.split("[")[0]).
# The author marked it "not useful?" - confirm before reinstating.

# Remove rows without a parsed East/West score. When the notebook was written
# this dropped 5 rows: 2018 - 2021 (team names not like "east"/"west") and
# 1999 (canceled). The explicit copy makes `df` an independent frame, so the
# column assignments that follow do not trigger SettingWithCopyWarning.
df = df.dropna(how="any").copy()
# Keep only the text before the first comma of "Host city".
df["Host city"] = df["Host city"].apply(lambda s: s.strip().split(",")[0])
df  # recorded as 66 x 3

In [None]:
# 3. Getting counts
# Absolute East/West score gap, computed and stored as a new "Diff" column.
df["Diff"] = (df["East"] - df["West"]).abs()
df.to_csv("nba.csv")

# Number of games for each score difference, most frequent difference first.
# (Recorded when the notebook was written: 5.0 occurred 7 times, 43.0 once.)
dfd = df.groupby("Diff").size().sort_values(ascending=False)

## question 1. What is the maximum and minimum score difference between all games?
maxdiff = max(df["Diff"])
print(f"Max difference:{maxdiff!s}")
print(df[df["Diff"] == maxdiff])  # rows whose difference equals the maximum
mindiff = min(df["Diff"])
print()
print(f"Min difference:{mindiff!s}")
print(df[df["Diff"] == mindiff])  # rows whose difference equals the minimum
print()
dfd

In [None]:
# 4. Filtering and Grouping Data
# Number of rows per host city.
size = df.groupby("Host city").size()
# Per-city mean of the remaining columns (index: Host city), with the row
# count attached as a "Count" column.
dfg = df.groupby("Host city").mean().assign(Count=size)
# Order by count, then keep only the cities that appear more than once.
dfg = dfg.sort_values("Count").loc[lambda frame: frame["Count"] > 1]

## question 2. If a city hosted a game more than once, what was the average score 
## for the Eastern Conference team and the Western Conference team? 
print("Cities who hosted the game more than once:")
dfg

In [None]:
# end