# Data Analysis

In [None]:
from nba.games import data_view, games_clean, team_stats, home_away_count
from nba.games_plots import bar_plot_simple, home_away_plots
from nba.game_info import data_view_info, game_info_clean, day_pd, month_pd, year_pd
from nba.game_info_plots import day_chart, month_chart, line_year

# Game dataset
* This dataset contains information on NBA games played since 1946, including their results and scores.  
* We are interested in determining whether the data indicate that home teams are more likely to win a game compared to away teams.  
* We will also gather insights on the performance of individual teams.

In [None]:
# Quick look at the uncleaned data; it is included for reference only.
data_games = data_view()
data_games.head()


In [None]:
# Cleaned data, containing a marker for the winner of each game.
# We are only interested in games played in the regular season.
clean_table = games_clean(data_games)
clean_table

In [None]:
# Win and loss stats for each team.
team_data = team_stats(clean_table)
team_data

In [None]:
# The Celtics have the most wins of all NBA teams since 1946. However, we cannot
# say the Celtics are the most successful NBA team, given that teams were founded
# in different years.
# NOTE(review): iloc[0] prints the first row; it is the top team only if
# team_stats returns rows sorted by wins in descending order - confirm.
print(team_data.iloc[0])

In [None]:
# We will now chart the top 30 teams by the number of wins they have achieved.
# The Celtics have the highest number of wins, well above the others.
# The Lakers and Knicks have a similar number of wins.
# There is a relatively small gap between teams in the middle, so competition
# is more intense in the mid-tier.
bar = bar_plot_simple(team_data)


In [None]:
# We repeat the exercise for the last 10 years, as we suspect that teams which
# have been around longer will have an artificially inflated aggregate number of wins.
# NOTE(review): the cutoff year is hard-coded, so "last 10 years" drifts as new
# seasons are added to the dataset.
last10yr = clean_table[clean_table['Game Date'].dt.year >= 2013]  # keep games played in 2013 or later
team_data10yr = team_stats(last10yr)
team_data10yr.head(15)

In [None]:
# As suspected, the Celtics are not the team with the most wins in the last
# 10 years; the Golden State Warriors are.
# NOTE(review): iloc[0] is the top team only if team_stats returns rows sorted
# by wins in descending order - confirm.
print(team_data10yr.iloc[0])

In [None]:
# The competition has been very fierce. The Celtics have the third-highest
# number of wins since 2013.
bar2 = bar_plot_simple(team_data10yr)

In [None]:
# As we would expect, the data indicates that home teams have won more NBA
# games than away teams.
home_stats_data = home_away_count(clean_table)
home_stats_data

In [None]:
# We do not filter the data by year here, since we are comparing the behaviour
# of two groups rather than looking at individual trends.
# As the graph shows, home teams have a clear advantage over away teams in a
# given game.
type_chart= home_away_plots(home_stats_data)

## Game info: 
* In this part, we want to check three ideas: 
* whether attendance at an NBA event is affected by the day of the week on which it takes place.  
* whether attendance at an NBA event is affected by the month in which it takes place. 
* whether attendance has changed over time

In [None]:
# Quick look at the cleaned data. We have added columns for the day, month and
# year on which each game took place.
game_info = data_view_info()
clean_game_info = game_info_clean(game_info)
clean_game_info.head()

In [None]:
# Since games may be more likely to be scheduled on a Friday than on a Tuesday,
# we should use the mean as our aggregate measure instead of the sum.
data_agg = day_pd(clean_game_info)
print(data_agg)

# Very interestingly, the maximum average attendance occurs on a Monday.
# Look up the 'Day' value of the row with the largest 'Attendance'.
max_day = data_agg.loc[data_agg['Attendance'].idxmax(), 'Day']
print("Day with the most attendance on average is: " + str(max_day))

In [None]:
# Attendance by day:
# However, as we can see, audiences do not particularly favour one day over another.
# In particular, average attendance on Mondays and Fridays is close.
fig_d = day_chart(data_agg)

In [None]:
# The regular season runs from October to April;
# however, we can see that a small number of
# off-season games were played in the summer months, except in August.
data_agg_m = month_pd(clean_game_info)
print(data_agg_m)

# Again, very interestingly, June sees the highest attendance at games.
# This might make sense because of the summer holidays, but it is not the main season.
# Look up the 'Month' value of the row with the largest 'Attendance'.
max_month = data_agg_m.loc[data_agg_m['Attendance'].idxmax(), 'Month']
print("Month with the most attendance on average is: " + str(max_month))

In [None]:
# Attendance by month:
# June is clearly the most popular month for NBA games, followed by September and July.
fig_m = month_chart(data_agg_m)

In [None]:
# We are also interested in whether attendance has changed over the years.
# We first aggregate the data by summing attendance for each year.
# NOTE(review): the aggregation is performed inside year_pd - confirm it sums
# rather than averages.
annual_data = year_pd(clean_game_info)
annual_data

In [None]:
# We then plot the data to spot any visible trends.
# The data clearly does not follow a linear trend; it looks closer to a logistic curve.
# We can also see a sharp drop in attendance during the COVID years.
year_chart = line_year(annual_data)