# 05. Data Exploration (identify patterns, trends and use statistical analysis)

## Age vs points

### Prepare data

Let's start with a simple question and find the correlation between players' age and points for the last 40 seasons.

Import the libraries and load the .env-file:

In [1]:
# Import the standard libraries.
import os

# Import the third party libraries.
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg
import seaborn as sns
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
import sqlite3
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Import the local/project packages, modules, and fucntions.
from utils.data_exploration_p3 import (
    compare_players
)
# Set environment.
load_dotenv()

True

Create the sqlalchemy engine for the both databases to avoid the warning "pandas only supports SQLAlchemy connectable":

In [2]:
# Create the engine for PostgreSQL
postgresql_url = (
    f"postgresql+psycopg://{os.environ.get('user')}:{os.environ.get('password')}@"
    f"{os.environ.get('host')}:{os.environ.get('port')}/nba"
)
p_engine = create_engine(postgresql_url)
# Create the enging for SQLIte (sqlite can work with pandas direcly but one engine 
# does the code more readable.
s_engine = create_engine("sqlite:////Users/lex/Sync/AI/DB/NBA/nba.sqlite")

Retrieve the stasistical player data:

In [3]:
p_query = """
      WITH player_career AS (
    SELECT s.player_id,
    	   EXTRACT (YEAR FROM MIN(date)) AS from_year,
    	   EXTRACT (YEAR FROM MAX(date)) AS to_year
      FROM stats AS s
      JOIN games AS g
        ON s.game_id = g.id
      JOIN players AS p
        ON s.player_id = p.id
      JOIN teams AS t
        ON s.team_id = t.id
     GROUP BY s.player_id, p.first_name, p.last_name)
    SELECT s.player_id,
    	   p.first_name,
    	   p.last_name,
    	   t.abbreviation,
    	   t.full_name,
    	   p.position,
       	   g.season,
    	   s.ast,
    	   s.blk,
    	   s.dreb,
    	   s.fg3_pct,
    	   s.fg3a,
    	   s.fg3m,
    	   s.fg_pct,
    	   s.fga,
    	   s.fgm,
    	   s.ft_pct,
    	   s.fta,
    	   s.ftm,
    	   s.min,
    	   s.oreb,
    	   s.pf,
    	   s.pts,
    	   s.reb,
    	   s.stl,
    	   s.turnover,
    	   s.team_id,
    	   s.game_id,
    	   g.date,
    	   pc.from_year,
    	   pc.to_year
      FROM stats AS s
      JOIN games AS g
        ON s.game_id = g.id
      JOIN players AS p
        ON s.player_id = p.id
      JOIN teams AS t
        ON s.team_id = t.id
      JOIN player_career AS pc
        ON s.player_id = pc.player_id
     WHERE g.season BETWEEN '1983' AND '2023';
"""
df_stats = pd.read_sql_query(p_query, p_engine)

In [4]:
df_stats

Unnamed: 0,player_id,first_name,last_name,abbreviation,full_name,position,season,ast,blk,dreb,...,pf,pts,reb,stl,turnover,team_id,game_id,date,from_year,to_year
0,2551,Edgar,Jones,SAS,San Antonio Spurs,,1984,,,,...,,4.0,,,,27,42232,1984-12-14,1980.0,1986.0
1,2872,Eddie,Johnson,SAC,Sacramento Kings,,1984,,,,...,,27.0,,,,26,43690,1985-01-19,1981.0,1999.0
2,2872,Eddie,Johnson,SAC,Sacramento Kings,,1984,,,,...,,6.0,,,,26,40314,1985-02-07,1981.0,1999.0
3,2872,Eddie,Johnson,SAC,Sacramento Kings,,1984,,,,...,,25.0,,,,26,43975,1985-01-05,1981.0,1999.0
4,538,Mark,West,DAL,Dallas Mavericks,,1983,1.0,0.0,1.0,...,5.0,4.0,3.0,0.0,2.0,7,42212,1984-04-15,1983.0,2000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771247,416,Pascal,Siakam,TOR,Toronto Raptors,F,2022,8.0,0.0,9.0,...,2.0,25.0,14.0,0.0,0.0,28,858399,2023-03-16,2016.0,2023.0
771248,413,Collin,Sexton,UTA,Utah Jazz,G,2022,2.0,0.0,3.0,...,1.0,22.0,3.0,1.0,3.0,29,857496,2022-11-06,2018.0,2023.0
771249,416,Pascal,Siakam,TOR,Toronto Raptors,F,2022,6.0,2.0,3.0,...,2.0,28.0,4.0,0.0,2.0,28,858209,2023-02-12,2016.0,2023.0
771250,4197356,Leandro,Bolmaro,UTA,Utah Jazz,F,2022,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,29,858027,2023-01-18,2021.0,2023.0


Add the "name" column to the dataframe:

In [5]:
df_stats["name"] = df_stats["first_name"] + " " + df_stats["last_name"]
df_stats.head(5)

Unnamed: 0,player_id,first_name,last_name,abbreviation,full_name,position,season,ast,blk,dreb,...,pts,reb,stl,turnover,team_id,game_id,date,from_year,to_year,name
0,2551,Edgar,Jones,SAS,San Antonio Spurs,,1984,,,,...,4.0,,,,27,42232,1984-12-14,1980.0,1986.0,Edgar Jones
1,2872,Eddie,Johnson,SAC,Sacramento Kings,,1984,,,,...,27.0,,,,26,43690,1985-01-19,1981.0,1999.0,Eddie Johnson
2,2872,Eddie,Johnson,SAC,Sacramento Kings,,1984,,,,...,6.0,,,,26,40314,1985-02-07,1981.0,1999.0,Eddie Johnson
3,2872,Eddie,Johnson,SAC,Sacramento Kings,,1984,,,,...,25.0,,,,26,43975,1985-01-05,1981.0,1999.0,Eddie Johnson
4,538,Mark,West,DAL,Dallas Mavericks,,1983,1.0,0.0,1.0,...,4.0,3.0,0.0,2.0,7,42212,1984-04-15,1983.0,2000.0,Mark West


Create a dataframe from csv-file:

In [6]:
df_player_info = pd.read_csv("csv/basketball_reference_com/df_player_info_20230823_120400.csv", parse_dates=["birth_date"])
df_player_info = df_player_info[df_player_info["from_year"] > 1982]

Group, find duplicates, choose a few columns, and create a dataframe of duplicate players:

In [7]:
df_stats_grouped = (
    df_stats
    .groupby(["player_id"])
    .agg({"name": "first", "from_year": "first", "to_year": "first"})
    .reset_index()
)
df_stats_duplicates = df_stats_grouped[df_stats_grouped.duplicated(subset=["name"], keep=False)]
df1 = df_stats_duplicates[["player_id", "name", "from_year", "to_year"]].sort_values(by="name")
df1["from_year"] = df1["from_year"].astype(int)
df1["to_year"] = df1["to_year"].astype(int)
df1

Unnamed: 0,player_id,name,from_year,to_year
3356,18042321,Asante Gist,2021,2021
3355,18041884,Asante Gist,2021,2021
1706,1711,Bobby Jones,2006,2008
2248,2277,Bobby Jones,1976,1986
3377,24489167,Brandon Williams,2021,2022
...,...,...,...,...
2034,2040,Tony Mitchell,2013,2014
2405,2638,Walker Russell,1982,1988
1961,1967,Walker Russell,2012,2012
3354,18039569,Xavier Rathan-Mayes,2021,2021


Create a list of unique names for duplicate athletes (that have the same first and last names):

In [8]:
names = df1["name"].unique().tolist()
names

['Asante Gist',
 'Bobby Jones',
 'Brandon Williams',
 'Cameron McGriff',
 'Cedric Henderson',
 'Charles Jones',
 'Charles Smith',
 'Chris Johnson',
 'Chris Smith',
 'Chris Wright',
 'Dee Brown',
 'Deonte Burton',
 'Derrick Alston',
 'Dru Smith',
 "Ed O'Bannon",
 'Eddie Johnson',
 'George Johnson',
 'Gerald Henderson',
 'Glen Rice',
 'JT Thor',
 'Jaime Echenique',
 'Jamorko Pickett',
 'John Coker',
 'Johnny Davis',
 'Jordan Goodwin',
 'Josh Gray',
 'Justin Jackson',
 'Justin Robinson',
 'Keaton Wallace',
 'Ken Johnson',
 'Kenny Williams',
 'Loy Vaught',
 'Malcolm Hill',
 'Marcus Williams',
 'Mark Davis',
 'Mark Jones',
 'Micah Potter',
 'Michael Smith',
 'Mike Dunleavy',
 'Mike James',
 'Mike Smith',
 'Olivier Sarr',
 'Patrick Ewing',
 'Reggie Williams',
 'Robert Werdann',
 'Steven Smith',
 'Tony Mitchell',
 'Walker Russell',
 'Xavier Rathan-Mayes']

Add the person_id columns, clear the name column, choose a few column, and create df2:

In [9]:
df_player_info = df_player_info.reset_index().rename(columns={"index": "person_id"})
df_player_info["name"] = df_player_info["name"].str.rstrip("*")
df2 = df_player_info[["person_id", "name", "from_year", "to_year"]]
df2

Unnamed: 0,person_id,name,from_year,to_year
0,0,Alaa Abdelnaby,1991,1995
1,3,Mahmoud Abdul-Rauf,1991,2001
2,4,Tariq Abdul-Wahad,1998,2003
3,5,Shareef Abdur-Rahim,1997,2008
4,9,Álex Abrines,2017,2019
...,...,...,...,...
3065,5101,Stephen Zimmerman,2017,2017
3066,5102,Paul Zipser,2017,2018
3067,5103,Ante Žižić,2018,2020
3068,5104,Jim Zoet,1983,1983


Now, we are going to join the df1 and df2. These tables do not have a commond id key. So, we will use the name, to_year and from_year columns.

However, there are some problems:
- Not only two but more players can have the same first and last name.
- We use the data from different sources. In some cases, the data about from_year and to_year columns (when a player was active) are different.
- The same player can have a slighlly different names in different bases, for example, Steve and Steven.
- In some cause, when a player played in different leagues (not only NBA), he can have different ids. In this case, it is not duplicate, it is the same person.
- Some players have only a few games and information about them can be dropped for our goal.
- Information about some players can be missed. Then amount of players in one source does not equal amount of players in another source.

So, it is quite chalenging to automize this process because there is much abmbiquitious. Therefore, we will do it manually. It is not very time consuming because we have about 150 cases when we have to decide how to connect players from different sources.  

To simplify the task we will use [the compare_players function](utils/data_exploration_p3.py) (it can be copied and pasted or saved as a csv-file). Also, the additional data from [basketball-reference.com](https://www.basketball-reference.com/) can be used for comparision.

In [10]:
compare_players(df1, df2, names)

Asante Gist
Bobby Jones
[1711, 'Bobby Jones', 2006, 2008, 2319, 'Bobby Jones', 2007, 2008]
[2277, 'Bobby Jones', 1976, 1986, 2319, 'Bobby Jones', 2007, 2008]
Brandon Williams
[24489167, 'Brandon Williams', 2021, 2022, 4890, 'Brandon Williams', 1998, 2003]
[24489167, 'Brandon Williams', 2021, 2022, 4891, 'Brandon Williams', 2022, 2022]
[1145, 'Brandon Williams', 1998, 2003, 4890, 'Brandon Williams', 1998, 2003]
[1145, 'Brandon Williams', 1998, 2003, 4891, 'Brandon Williams', 2022, 2022]
Cameron McGriff
[24489241, 'Cameron McGriff', 2021, 2021, 2984, 'Cameron McGriff', 2022, 2022]
[17895707, 'Cameron McGriff', 2021, 2021, 2984, 'Cameron McGriff', 2022, 2022]
Cedric Henderson
[1110, 'Cedric Henderson', 1997, 2002, 1925, 'Cedric Henderson', 1987, 1987]
[1110, 'Cedric Henderson', 1997, 2002, 1926, 'Cedric Henderson', 1998, 2002]
[2761, 'Cedric Henderson', 1986, 1987, 1925, 'Cedric Henderson', 1987, 1987]
[2761, 'Cedric Henderson', 1986, 1987, 1926, 'Cedric Henderson', 1998, 2002]
Charles Jo

Use the name to compare data in two dataframes:

In [11]:
inp_name = "Glen Rice"

In [12]:
df1[df1["name"] == inp_name]

Unnamed: 0,player_id,name,from_year,to_year
2032,2038,Glen Rice,2013,2014
505,508,Glen Rice,1989,2004


In [13]:
df2[df2["name"] == inp_name]

Unnamed: 0,person_id,name,from_year,to_year
2276,3791,Glen Rice,1990,2004


In [14]:
(df_stats[(df_stats["first_name"] == inp_name.split()[0]) & (df_stats["last_name"] == inp_name.split()[1])]
 .groupby("player_id")
 .agg({
         "first_name": "first", 
         "last_name": "first", 
         "from_year": "first", 
         "to_year": "first", 
         "pts": "sum",
         "player_id": "size"})
)

Unnamed: 0_level_0,first_name,last_name,from_year,to_year,pts,player_id
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
508,Glen,Rice,1989.0,2004.0,12861.0,730
2038,Glen,Rice,2013.0,2014.0,35.0,23


After manual analysys, save the result as a csv-file and convert it to dataframe:

In [15]:
df_manual = pd.read_csv(
    filepath_or_buffer="csv/manually_filtered_table.csv", 
    delimiter=";", 
    usecols=list(range(1,10))
)
df_manual

Unnamed: 0,player_id,name,from_year_x,to_year_x,person_id,from_year_y,to_year_y,status,comment
0,18042321,Asante Gist,2021,2021,,,,drop,no data
1,18041884,Asante Gist,2021,2021,,,,drop,no data
2,1711,Bobby Jones,2006,2008,2319.0,2007.0,2008.0,correct,to_year are equal
3,1711,Bobby Jones,2006,2008,2318.0,1975.0,1986.0,not correct,to_year are not equal
4,2277,Bobby Jones,1976,1986,2319.0,2007.0,2008.0,not correct,to_year are not equal
...,...,...,...,...,...,...,...,...,...
152,2638,Walker Russell,1982,1988,3957.0,2012.0,2012.0,not correct,to_year are not equal
153,1967,Walker Russell,2012,2012,3956.0,1983.0,1988.0,not correct,to_year are not equal
154,1967,Walker Russell,2012,2012,3957.0,2012.0,2012.0,correct,to_year are equal
155,18039569,Xavier Rathan-Mayes,2021,2021,3732.0,2018.0,2018.0,drop,only 4 or 2 games


Filter out the players with correct status. Person_id and player_id will be use later to join tables.

In [16]:
df_manual[df_manual["status"] == "correct"].reset_index()

Unnamed: 0,index,player_id,name,from_year_x,to_year_x,person_id,from_year_y,to_year_y,status,comment
0,2,1711,Bobby Jones,2006,2008,2319.0,2007.0,2008.0,correct,to_year are equal
1,5,2277,Bobby Jones,1976,1986,2318.0,1975.0,1986.0,correct,to_year are equal
2,6,24489167,Brandon Williams,2021,2022,4891.0,2022.0,2022.0,correct,to_year are equal
3,9,1145,Brandon Williams,1998,2003,4890.0,1998.0,2003.0,correct,to_year are equal
4,12,1110,Cedric Henderson,1997,2002,1926.0,1998.0,2002.0,correct,to_year are equal
5,15,2761,Cedric Henderson,1986,1987,1925.0,1987.0,1987.0,correct,to_year are equal
6,18,592,Charles Jones,1984,1998,2322.0,1984.0,1998.0,correct,"to_year are equal, 2 x 3"
7,19,2698,Charles Jones,1984,1989,2323.0,1985.0,1989.0,correct,"to_year are equal, 2 x 3"
8,23,1128,Charles Smith,1997,2006,4193.0,1998.0,2006.0,correct,to_year are equal
9,25,2906,Charles Smith,1988,1997,4191.0,1989.0,1997.0,correct,to_year are equal


Create a table of ids, that help us to join dataframes and tables from different sources:

In [17]:
df_ids = df_manual[df_manual["status"] == "correct"][["player_id", "person_id"]].reset_index(drop=True)
df_ids["person_id"] = df_ids["person_id"].astype(int)

In [18]:
df_ids

Unnamed: 0,player_id,person_id
0,1711,2319
1,2277,2318
2,24489167,4891
3,1145,4890
4,1110,1926
5,2761,1925
6,592,2322
7,2698,2323
8,1128,4193
9,2906,4191


Drop duplicates from the player statistics df_stats:

In [19]:
player_id_to_drop = df_manual["player_id"]
player_id_to_drop

0      18042321
1      18041884
2          1711
3          1711
4          2277
         ...   
152        2638
153        1967
154        1967
155    18039569
156        2230
Name: player_id, Length: 157, dtype: int64

In [20]:
df_stats_filtered = df_stats[~df_stats["player_id"].isin(player_id_to_drop)]
df_stats_filtered

Unnamed: 0,player_id,first_name,last_name,abbreviation,full_name,position,season,ast,blk,dreb,...,pts,reb,stl,turnover,team_id,game_id,date,from_year,to_year,name
0,2551,Edgar,Jones,SAS,San Antonio Spurs,,1984,,,,...,4.0,,,,27,42232,1984-12-14,1980.0,1986.0,Edgar Jones
4,538,Mark,West,DAL,Dallas Mavericks,,1983,1.0,0.0,1.0,...,4.0,3.0,0.0,2.0,7,42212,1984-04-15,1983.0,2000.0,Mark West
6,574,Derek,Harper,DAL,Dallas Mavericks,,1983,4.0,0.0,1.0,...,17.0,2.0,5.0,1.0,7,44349,1984-04-13,1983.0,1999.0,Derek Harper
7,2247,Kareem,Abdul-Jabbar,LAL,Los Angeles Lakers,,1983,0.0,1.0,4.0,...,28.0,7.0,0.0,2.0,14,44213,1984-04-10,1969.0,1989.0,Kareem Abdul-Jabbar
8,574,Derek,Harper,DAL,Dallas Mavericks,,1983,3.0,0.0,1.0,...,0.0,1.0,1.0,1.0,7,39188,1984-04-10,1983.0,1999.0,Derek Harper
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771247,416,Pascal,Siakam,TOR,Toronto Raptors,F,2022,8.0,0.0,9.0,...,25.0,14.0,0.0,0.0,28,858399,2023-03-16,2016.0,2023.0,Pascal Siakam
771248,413,Collin,Sexton,UTA,Utah Jazz,G,2022,2.0,0.0,3.0,...,22.0,3.0,1.0,3.0,29,857496,2022-11-06,2018.0,2023.0,Collin Sexton
771249,416,Pascal,Siakam,TOR,Toronto Raptors,F,2022,6.0,2.0,3.0,...,28.0,4.0,0.0,2.0,28,858209,2023-02-12,2016.0,2023.0,Pascal Siakam
771250,4197356,Leandro,Bolmaro,UTA,Utah Jazz,F,2022,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,29,858027,2023-01-18,2021.0,2023.0,Leandro Bolmaro


Join the unique player statistics and common information about them:

In [21]:
res1 = pd.merge(df_stats_filtered, df_player_info, on=["name"], how="left")
res1

Unnamed: 0,player_id,first_name,last_name,abbreviation,full_name,position,season,ast,blk,dreb,...,to_year_x,name,person_id,from_year_y,to_year_y,pos,height,weight,birth_date,college
0,2551,Edgar,Jones,SAS,San Antonio Spurs,,1984,,,,...,1986.0,Edgar Jones,,,,,,,NaT,
1,538,Mark,West,DAL,Dallas Mavericks,,1983,1.0,0.0,1.0,...,2000.0,Mark West,4812.0,1984.0,2000.0,C-F,6-10,230.0,1960-11-05,Old Dominion
2,574,Derek,Harper,DAL,Dallas Mavericks,,1983,4.0,0.0,1.0,...,1999.0,Derek Harper,1839.0,1984.0,1999.0,G,6-4,185.0,1961-10-13,Illinois
3,2247,Kareem,Abdul-Jabbar,LAL,Los Angeles Lakers,,1983,0.0,1.0,4.0,...,1989.0,Kareem Abdul-Jabbar,,,,,,,NaT,
4,574,Derek,Harper,DAL,Dallas Mavericks,,1983,3.0,0.0,1.0,...,1999.0,Derek Harper,1839.0,1984.0,1999.0,G,6-4,185.0,1961-10-13,Illinois
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
759288,416,Pascal,Siakam,TOR,Toronto Raptors,F,2022,8.0,0.0,9.0,...,2023.0,Pascal Siakam,4118.0,2017.0,2023.0,F,6-9,230.0,1994-04-02,New Mexico State
759289,413,Collin,Sexton,UTA,Utah Jazz,G,2022,2.0,0.0,3.0,...,2023.0,Collin Sexton,4077.0,2019.0,2023.0,G,6-1,190.0,1999-01-04,Alabama
759290,416,Pascal,Siakam,TOR,Toronto Raptors,F,2022,6.0,2.0,3.0,...,2023.0,Pascal Siakam,4118.0,2017.0,2023.0,F,6-9,230.0,1994-04-02,New Mexico State
759291,4197356,Leandro,Bolmaro,UTA,Utah Jazz,F,2022,1.0,0.0,1.0,...,2023.0,Leandro Bolmaro,414.0,2022.0,2023.0,F,6-6,200.0,2000-09-11,


In [22]:
# Delete one more duplicate
res1 = res1[~((res1["name"] == "Jeff Taylor") & (res1["birth_date"] == "1989-05-23"))]

Join the duplicate player statistics and the ids for both source of data:

In [23]:
res2 = pd.merge(df_ids, df_stats, on=["player_id"], how="left")
res2

Unnamed: 0,player_id,person_id,first_name,last_name,abbreviation,full_name,position,season,ast,blk,...,pts,reb,stl,turnover,team_id,game_id,date,from_year,to_year,name
0,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,1.0,0.0,...,1.0,1.0,0.0,0.0,23,18279,2006-11-21,2006.0,2008.0,Bobby Jones
1,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,0.0,0.0,...,0.0,0.0,0.0,0.0,23,18846,2006-11-22,2006.0,2008.0,Bobby Jones
2,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,1.0,1.0,...,4.0,2.0,0.0,0.0,23,21021,2006-12-31,2006.0,2008.0,Bobby Jones
3,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,,,...,,,,,23,18985,2007-02-02,2006.0,2008.0,Bobby Jones
4,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,0.0,0.0,...,0.0,0.0,0.0,1.0,23,19788,2007-03-04,2006.0,2008.0,Bobby Jones
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10294,1967,3957,Walker,Russell,DET,Detroit Pistons,,2011,,,...,,,,,9,27114,2012-02-28,2012.0,2012.0,Walker Russell
10295,1967,3957,Walker,Russell,DET,Detroit Pistons,,2011,,,...,,,,,9,29484,2012-04-06,2012.0,2012.0,Walker Russell
10296,1967,3957,Walker,Russell,DET,Detroit Pistons,,2011,2.0,0.0,...,12.0,6.0,0.0,0.0,9,26015,2012-02-01,2012.0,2012.0,Walker Russell
10297,1967,3957,Walker,Russell,DET,Detroit Pistons,,2011,5.0,0.0,...,2.0,2.0,0.0,3.0,9,24932,2012-02-10,2012.0,2012.0,Walker Russell


In [24]:
# res2.drop("name", axis=1, inplace=True)
# res2

Join the duplicate player statistics and common information about them:

In [25]:
df_player_info.drop("name", axis=1, inplace=True)
df_player_info

Unnamed: 0,person_id,from_year,to_year,pos,height,weight,birth_date,college
0,0,1991,1995,F-C,6-10,240.0,1968-06-24,Duke
1,3,1991,2001,G,6-1,162.0,1969-03-09,LSU
2,4,1998,2003,F,6-6,223.0,1974-11-03,"Michigan, San Jose State"
3,5,1997,2008,F,6-9,225.0,1976-12-11,California
4,9,2017,2019,G-F,6-6,200.0,1993-08-01,
...,...,...,...,...,...,...,...,...
3065,5101,2017,2017,C,7-0,240.0,1996-09-09,UNLV
3066,5102,2017,2018,G-F,6-8,215.0,1994-02-18,
3067,5103,2018,2020,F-C,6-10,266.0,1997-01-04,
3068,5104,1983,1983,C,7-1,240.0,1953-12-20,Kent State University


In [26]:
res3 = pd.merge(res2, df_player_info, on=["person_id"], how="left")
res3

Unnamed: 0,player_id,person_id,first_name,last_name,abbreviation,full_name,position,season,ast,blk,...,from_year_x,to_year_x,name,from_year_y,to_year_y,pos,height,weight,birth_date,college
0,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,1.0,0.0,...,2006.0,2008.0,Bobby Jones,2007.0,2008.0,F,6-7,215.0,1984-01-09,Washington
1,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,0.0,0.0,...,2006.0,2008.0,Bobby Jones,2007.0,2008.0,F,6-7,215.0,1984-01-09,Washington
2,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,1.0,1.0,...,2006.0,2008.0,Bobby Jones,2007.0,2008.0,F,6-7,215.0,1984-01-09,Washington
3,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,,,...,2006.0,2008.0,Bobby Jones,2007.0,2008.0,F,6-7,215.0,1984-01-09,Washington
4,1711,2319,Bobby,Jones,PHI,Philadelphia 76ers,,2006,0.0,0.0,...,2006.0,2008.0,Bobby Jones,2007.0,2008.0,F,6-7,215.0,1984-01-09,Washington
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10294,1967,3957,Walker,Russell,DET,Detroit Pistons,,2011,,,...,2012.0,2012.0,Walker Russell,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
10295,1967,3957,Walker,Russell,DET,Detroit Pistons,,2011,,,...,2012.0,2012.0,Walker Russell,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
10296,1967,3957,Walker,Russell,DET,Detroit Pistons,,2011,2.0,0.0,...,2012.0,2012.0,Walker Russell,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
10297,1967,3957,Walker,Russell,DET,Detroit Pistons,,2011,5.0,0.0,...,2012.0,2012.0,Walker Russell,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University


Concatinate the data about unique and duplicate players:

In [27]:
result = pd.concat([res1, res3], axis=0, ignore_index=True)
result

Unnamed: 0,player_id,first_name,last_name,abbreviation,full_name,position,season,ast,blk,dreb,...,to_year_x,name,person_id,from_year_y,to_year_y,pos,height,weight,birth_date,college
0,2551,Edgar,Jones,SAS,San Antonio Spurs,,1984,,,,...,1986.0,Edgar Jones,,,,,,,NaT,
1,538,Mark,West,DAL,Dallas Mavericks,,1983,1.0,0.0,1.0,...,2000.0,Mark West,4812.0,1984.0,2000.0,C-F,6-10,230.0,1960-11-05,Old Dominion
2,574,Derek,Harper,DAL,Dallas Mavericks,,1983,4.0,0.0,1.0,...,1999.0,Derek Harper,1839.0,1984.0,1999.0,G,6-4,185.0,1961-10-13,Illinois
3,2247,Kareem,Abdul-Jabbar,LAL,Los Angeles Lakers,,1983,0.0,1.0,4.0,...,1989.0,Kareem Abdul-Jabbar,,,,,,,NaT,
4,574,Derek,Harper,DAL,Dallas Mavericks,,1983,3.0,0.0,1.0,...,1999.0,Derek Harper,1839.0,1984.0,1999.0,G,6-4,185.0,1961-10-13,Illinois
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769579,1967,Walker,Russell,DET,Detroit Pistons,,2011,,,,...,2012.0,Walker Russell,3957.0,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
769580,1967,Walker,Russell,DET,Detroit Pistons,,2011,,,,...,2012.0,Walker Russell,3957.0,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
769581,1967,Walker,Russell,DET,Detroit Pistons,,2011,2.0,0.0,2.0,...,2012.0,Walker Russell,3957.0,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
769582,1967,Walker,Russell,DET,Detroit Pistons,,2011,5.0,0.0,2.0,...,2012.0,Walker Russell,3957.0,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University


A sample data for one player:

In [28]:
result.iloc[5]

player_id                      2429
first_name                   Dennis
last_name                   Johnson
abbreviation                    BOS
full_name            Boston Celtics
position                       None
season                         1983
ast                             5.0
blk                             0.0
dreb                            2.0
fg3_pct                         NaN
fg3a                            0.0
fg3m                            0.0
fg_pct                        0.333
fga                             6.0
fgm                             2.0
ft_pct                          1.0
fta                             2.0
ftm                             2.0
min                              21
oreb                            0.0
pf                              1.0
pts                             6.0
reb                             2.0
stl                             1.0
turnover                        1.0
team_id                           2
game_id                     


### Visualize data


Create a dataframe that includes data about every game (points and data), player name and birthdate and drop players that do not have birthdate:

In [29]:
df_game_pts_age = result.dropna(subset=["birth_date"])[["name", "date", "birth_date", "pts"]]
df_game_pts_age

Unnamed: 0,name,date,birth_date,pts
1,Mark West,1984-04-15,1960-11-05,4.0
2,Derek Harper,1984-04-13,1961-10-13,17.0
4,Derek Harper,1984-04-10,1961-10-13,0.0
8,Terry Cummings,1984-04-06,1961-03-15,32.0
12,Terry Cummings,1984-04-01,1961-03-15,11.0
...,...,...,...,...
769579,Walker Russell,2012-02-28,1982-10-06,
769580,Walker Russell,2012-04-06,1982-10-06,
769581,Walker Russell,2012-02-01,1982-10-06,12.0
769582,Walker Russell,2012-02-10,1982-10-06,2.0


Replace NaN values with 0 in the pts column:

In [30]:
df_game_pts_age["pts"] = df_game_pts_age["pts"].fillna(0)

Rename the pts column and count the age for every game and every player using birtdates and game dates:

In [31]:
df_game_pts_age.rename(columns={"pts": "Points"}, inplace=True)
# df_game_pts_age["Age"] = df_game_pts_age.apply(lambda row: int(round(((row["date"] - row["birth_date"]).days)/365, 0)), axis=1)
df_game_pts_age["Age"] = round(((df_game_pts_age["date"] - df_game_pts_age["birth_date"]).dt.days / 365), 0).astype(int)
df_game_pts_age

Unnamed: 0,name,date,birth_date,Points,Age
1,Mark West,1984-04-15,1960-11-05,4.0,23
2,Derek Harper,1984-04-13,1961-10-13,17.0,23
4,Derek Harper,1984-04-10,1961-10-13,0.0,23
8,Terry Cummings,1984-04-06,1961-03-15,32.0,23
12,Terry Cummings,1984-04-01,1961-03-15,11.0,23
...,...,...,...,...,...
769579,Walker Russell,2012-02-28,1982-10-06,0.0,29
769580,Walker Russell,2012-04-06,1982-10-06,0.0,30
769581,Walker Russell,2012-02-01,1982-10-06,12.0,29
769582,Walker Russell,2012-02-10,1982-10-06,2.0,29


Group the df_game_pts_age by age:

In [32]:
df_avg_pts_by_age = (
    df_game_pts_age
    .rename(columns={"pts": "Points"})[["Age", "Points"]]
    .groupby("Age")
    .agg({"Points": "mean"})
    .reset_index())
df_avg_pts_by_age

Unnamed: 0,Age,Points
0,18,2.738806
1,19,6.281237
2,20,7.156145
3,21,7.887201
4,22,8.091597
5,23,7.629343
6,24,8.187304
7,25,8.858606
8,26,9.260445
9,27,9.717298


Draw the plots:

In [33]:
# Set the same color for matplotlib that use in seaborn.
deepblue = sns.color_palette("deep")[0]

Plot the average basketball points by age.

In [34]:
df_avg_pts_by_age.plot(
    x="Age", 
    y="Points",
    figsize=(10, 4), 
    marker="o", 
    linestyle="-", 
    color=deepblue,
    legend=False
)
plt.title("Average Basketball Points by Age")
plt.xlabel("Age, years")
plt.ylabel("Average Points")
plt.grid(True)
plt.tight_layout()
plt.savefig(
    "figures/5.1 Average Basketball Points by Age.png", 
    dpi=300, 
    bbox_inches="tight")
plt.close()

<div>
    <img src="figures/5.1 Average Basketball Points by Age.png" alt="Fig. 5.1. Average Basketball Points by Age." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.1. Average Basketball Points by Age.</p>
</div>

Visualize the average basketball points by age along with confidence intervals:

In [35]:
plt.figure(figsize=(10, 6))
sns.barplot(
    x="Age", 
    y="Points", 
    data=df_game_pts_age, 
    errorbar=("ci", 95), 
    color=deepblue
) 
plt.title("Average Basketball Points by Age with Confidence Intervals")
plt.xlabel("Age, years")
plt.xticks(rotation=0)
plt.ylabel("Average Points")
plt.grid(axis="y")
plt.tight_layout()
plt.savefig(
    "figures/5.2 Average Basketball Points by Age with Confidence Intervals.png", 
    dpi=300, 
    bbox_inches="tight")
plt.close()

<div>
    <img src="figures/5.2 Average Basketball Points by Age with Confidence Intervals.png" alt="Fig. 5.2. Average Basketball Points by Age with Confidence Intervals." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.2. Average Basketball Points by Age with Confidence Intervals.</p>
</div>

Create the plot: Average basketball points by age group with confidence intervals:

In [36]:
# Create bins
bins = [18, 21, 26, 31, 36, 41, 46]
labels = ["18-20", "21-25", "26-30", "31-35", "36-40", "41-45"]
df_game_pts_age["AgeBin"] = pd.cut(
    df_game_pts_age["Age"], 
    bins=bins, 
    labels=labels,
    right=False
)
# Create binned data
binned_data = df_game_pts_age[["Age", "Points", "AgeBin"]].groupby("AgeBin").mean()
# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x="AgeBin", y="Points", data=df_game_pts_age, errorbar=("ci", 95), palette="deep") 
plt.title("Average Basketball Points by Age Group with Confidence Intervals")
plt.xlabel("Age Group, years")
plt.xticks(rotation=0)
plt.ylabel("Average Points")
plt.grid(True, axis="y")
plt.tight_layout()
plt.savefig(
    "figures/5.3 Average Basketball Points by Age Group with Confidence Intervals.png", 
    dpi=300, 
    bbox_inches="tight")
plt.close()


<div>
    <img src="figures/5.3 Average Basketball Points by Age Group with Confidence Intervals.png" alt="Fig. 5.3. Average Basketball Points by Age Group with Confidence Intervals." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.3. Average Basketball Points by Age Group with Confidence Intervals.</p>
</div>

Creatу a visualization to display the distribution of basketball points across different age groups using a box plot:

In [37]:
plt.figure(figsize=(12, 6))
sns.boxplot(x="AgeBin", y="Points", data=df_game_pts_age, palette="deep")
plt.title("Box Plot of Points by Age Group")
plt.xlabel("Age Group, years")
plt.grid(True, axis="y")
plt.savefig(
    "figures/5.4 Box Plot of Points by Age Group.png", 
    dpi=300, 
    bbox_inches="tight")
plt.close()


<div>
    <img src="figures/5.4 Box Plot of Points by Age Group.png" alt="Fig. 5.4. Box Plot of Points by Age Group." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.4. Box Plot of Points by Age Group.</p>
</div>

Create a visualization to display the distribution and density of basketball points across different age groups using a violin plot:

In [38]:
plt.figure(figsize=(12, 6))
sns.violinplot(x="AgeBin", y="Points", data=df_game_pts_age, inner="box", palette="deep")
plt.title("Violin Plot of Points by Age Group")
plt.xlabel("Age Group, years")
plt.grid(True, axis="y")
plt.savefig(
    "figures/5.5 Violin Plot of Points by Age Group.png", 
    dpi=300, 
    bbox_inches="tight")
plt.close()

<div>
    <img src="figures/5.5 Violin Plot of Points by Age Group.png" alt="Fig. 5.5. Violin Plot of Points by Age Group." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.5. Violin Plot of Points by Age Group.</p>
</div>

In [39]:
df_game_pts_age.drop("AgeBin", axis=1, inplace=True)

T-test for some groups:

In [40]:
group1 = df_game_pts_age[df_game_pts_age["Age"] == 27]["Points"]
group1

3744       8.0
3758      12.0
3798      28.0
3816       9.0
3842      28.0
          ... 
769376     0.0
769378     0.0
769384     3.0
769389     0.0
769397     0.0
Name: Points, Length: 56059, dtype: float64

In [41]:
# group2 = df_game_pts_age[df_game_pts_age["Age"] == 28]["Points"]
group2 = df_game_pts_age[df_game_pts_age["Age"] == 32]["Points"]
group2

5538       0.0
5565       0.0
12739      4.0
12800     19.0
12836     14.0
          ... 
769220     0.0
769221     4.0
769222    11.0
769223     4.0
769224     0.0
Name: Points, Length: 29612, dtype: float64

In [42]:
t_stat, p_value = ttest_ind(group1, group2)
t_stat, p_value

(28.015472361098844, 6.314107470150504e-172)

In [43]:
group2 = df_game_pts_age[df_game_pts_age["Age"] == 28]["Points"]

In [44]:
t_stat, p_value = ttest_ind(group1, group2)
t_stat, p_value

(0.08273017007016818, 0.9340662496439216)

Describe the result of t-test briefly.

Generate a scatter plot to visualize the relationship between basketball points and the age of the players:

In [45]:
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df_game_pts_age, x="Age", y="Points", alpha=0.5, edgecolor=None, s=5)
plt.title("Scatter plot of Points by Age")
plt.xlabel("Age, years")
plt.ylabel("Points")
plt.grid(True)
plt.tight_layout()
plt.savefig(
    "figures/5.6 Scatter plot of Points by Age.png", 
    dpi=300, 
    bbox_inches="tight")
plt.close()

<div>
    <img src="figures/5.6 Scatter plot of Points by Age.png" alt="Fig. 5.6. Scatter plot of Points by Age." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.6. Scatter plot of Points by Age.</p>
</div>

Players that score more than 60 points:

In [46]:
df_game_pts_age[df_game_pts_age["Points"] > 60]

Unnamed: 0,name,date,birth_date,Points,Age
52985,Kobe Bryant,2005-12-20,1978-08-23,62.0,27
55301,Kobe Bryant,2006-01-22,1978-08-23,81.0,27
64364,Kobe Bryant,2009-02-02,1978-08-23,61.0,30
81241,James Harden,2019-01-23,1989-08-26,61.0,29
81557,James Harden,2019-03-22,1989-08-26,61.0,30
145993,Tracy McGrady,2004-03-10,1979-05-24,62.0,25
217365,David Robinson,1994-04-24,1965-08-06,71.0,29
356854,Stephen Curry,2021-01-03,1988-03-14,62.0,33
424641,Damian Lillard,2020-08-11,1990-07-15,61.0,30
424686,Damian Lillard,2020-01-20,1990-07-15,61.0,30


Create a heatmap of the average season points scored by basketball players who have played for more than 15 years:

In [47]:
df_hm = result.dropna(subset=["birth_date"])[["player_id", "name", "date", "birth_date", "pts", "from_year_x", "to_year_x"]]
df_hm

Unnamed: 0,player_id,name,date,birth_date,pts,from_year_x,to_year_x
1,538,Mark West,1984-04-15,1960-11-05,4.0,1983.0,2000.0
2,574,Derek Harper,1984-04-13,1961-10-13,17.0,1983.0,1999.0
4,574,Derek Harper,1984-04-10,1961-10-13,0.0,1983.0,1999.0
8,2987,Terry Cummings,1984-04-06,1961-03-15,32.0,1982.0,2000.0
12,2987,Terry Cummings,1984-04-01,1961-03-15,11.0,1982.0,2000.0
...,...,...,...,...,...,...,...
769579,1967,Walker Russell,2012-02-28,1982-10-06,,2012.0,2012.0
769580,1967,Walker Russell,2012-04-06,1982-10-06,,2012.0,2012.0
769581,1967,Walker Russell,2012-02-01,1982-10-06,12.0,2012.0,2012.0
769582,1967,Walker Russell,2012-02-10,1982-10-06,2.0,2012.0,2012.0


In [48]:
df_hm["pts"] = df_hm["pts"].fillna(0)
df_hm

Unnamed: 0,player_id,name,date,birth_date,pts,from_year_x,to_year_x
1,538,Mark West,1984-04-15,1960-11-05,4.0,1983.0,2000.0
2,574,Derek Harper,1984-04-13,1961-10-13,17.0,1983.0,1999.0
4,574,Derek Harper,1984-04-10,1961-10-13,0.0,1983.0,1999.0
8,2987,Terry Cummings,1984-04-06,1961-03-15,32.0,1982.0,2000.0
12,2987,Terry Cummings,1984-04-01,1961-03-15,11.0,1982.0,2000.0
...,...,...,...,...,...,...,...
769579,1967,Walker Russell,2012-02-28,1982-10-06,0.0,2012.0,2012.0
769580,1967,Walker Russell,2012-04-06,1982-10-06,0.0,2012.0,2012.0
769581,1967,Walker Russell,2012-02-01,1982-10-06,12.0,2012.0,2012.0
769582,1967,Walker Russell,2012-02-10,1982-10-06,2.0,2012.0,2012.0


In [49]:
# df_game_pts_age.apply(lambda row: int(round(((row["date"] - row["birth_date"]).days)/365, 0)), axis=1)

df_hm["Age"] = round(((df_hm["date"] - df_hm["birth_date"]).dt.days / 365), 0).astype(int)
df_hm

Unnamed: 0,player_id,name,date,birth_date,pts,from_year_x,to_year_x,Age
1,538,Mark West,1984-04-15,1960-11-05,4.0,1983.0,2000.0,23
2,574,Derek Harper,1984-04-13,1961-10-13,17.0,1983.0,1999.0,23
4,574,Derek Harper,1984-04-10,1961-10-13,0.0,1983.0,1999.0,23
8,2987,Terry Cummings,1984-04-06,1961-03-15,32.0,1982.0,2000.0,23
12,2987,Terry Cummings,1984-04-01,1961-03-15,11.0,1982.0,2000.0,23
...,...,...,...,...,...,...,...,...
769579,1967,Walker Russell,2012-02-28,1982-10-06,0.0,2012.0,2012.0,29
769580,1967,Walker Russell,2012-04-06,1982-10-06,0.0,2012.0,2012.0,30
769581,1967,Walker Russell,2012-02-01,1982-10-06,12.0,2012.0,2012.0,29
769582,1967,Walker Russell,2012-02-10,1982-10-06,2.0,2012.0,2012.0,29


In [50]:
df_hm["Exp"] = df_hm["to_year_x"] - df_hm["from_year_x"].astype(int)
df_hm

Unnamed: 0,player_id,name,date,birth_date,pts,from_year_x,to_year_x,Age,Exp
1,538,Mark West,1984-04-15,1960-11-05,4.0,1983.0,2000.0,23,17.0
2,574,Derek Harper,1984-04-13,1961-10-13,17.0,1983.0,1999.0,23,16.0
4,574,Derek Harper,1984-04-10,1961-10-13,0.0,1983.0,1999.0,23,16.0
8,2987,Terry Cummings,1984-04-06,1961-03-15,32.0,1982.0,2000.0,23,18.0
12,2987,Terry Cummings,1984-04-01,1961-03-15,11.0,1982.0,2000.0,23,18.0
...,...,...,...,...,...,...,...,...,...
769579,1967,Walker Russell,2012-02-28,1982-10-06,0.0,2012.0,2012.0,29,0.0
769580,1967,Walker Russell,2012-04-06,1982-10-06,0.0,2012.0,2012.0,30,0.0
769581,1967,Walker Russell,2012-02-01,1982-10-06,12.0,2012.0,2012.0,29,0.0
769582,1967,Walker Russell,2012-02-10,1982-10-06,2.0,2012.0,2012.0,29,0.0


Player with more than 15 year experince:

In [51]:
df_hm = df_hm[df_hm["Exp"] > 15]

In [52]:
df_hm.reset_index(drop=True, inplace=True)
df_hm

Unnamed: 0,player_id,name,date,birth_date,pts,from_year_x,to_year_x,Age,Exp
0,538,Mark West,1984-04-15,1960-11-05,4.0,1983.0,2000.0,23,17.0
1,574,Derek Harper,1984-04-13,1961-10-13,17.0,1983.0,1999.0,23,16.0
2,574,Derek Harper,1984-04-10,1961-10-13,0.0,1983.0,1999.0,23,16.0
3,2987,Terry Cummings,1984-04-06,1961-03-15,32.0,1982.0,2000.0,23,18.0
4,2987,Terry Cummings,1984-04-01,1961-03-15,11.0,1982.0,2000.0,23,18.0
...,...,...,...,...,...,...,...,...,...
94961,2964,Patrick Ewing,2002-01-09,1962-08-05,0.0,1985.0,2002.0,39,17.0
94962,2964,Patrick Ewing,2001-11-21,1962-08-05,0.0,1985.0,2002.0,39,17.0
94963,2964,Patrick Ewing,2001-12-13,1962-08-05,14.0,1985.0,2002.0,39,17.0
94964,2964,Patrick Ewing,2002-04-12,1962-08-05,8.0,1985.0,2002.0,40,17.0


In [53]:
df_hm = df_hm[["name", "Age", "pts" ]].groupby(["name", "Age"]).agg({"pts": "mean"}).reset_index()
df_hm

Unnamed: 0,name,Age,pts
0,A.C. Green,22,6.893617
1,A.C. Green,23,9.034483
2,A.C. Green,24,12.015625
3,A.C. Green,25,11.594203
4,A.C. Green,26,12.869565
...,...,...,...
1853,Zaza Pachulia,31,7.457627
1854,Zaza Pachulia,32,8.220339
1855,Zaza Pachulia,33,6.413043
1856,Zaza Pachulia,34,4.034884


In [54]:
df_hm = df_hm.pivot(index="name", columns="Age", values="pts").fillna(0)

In [55]:
plt.figure(figsize=(16, 20))
sns.heatmap(df_hm, cmap="coolwarm", annot=True)
plt.title("Average Season Points Heatmap by Age for Careers Over 15 Years")
plt.xlabel("Age, years")
plt.ylabel("Player name")
plt.tight_layout()
plt.savefig(
    "figures/5.7 Average Season Points Heatmap by Age for Careers Over 15 Years.png", 
    dpi=600, 
    bbox_inches="tight"
)
plt.close()

<div>
    <img src="figures/5.7 Average Season Points Heatmap by Age for Careers Over 15 Years.png" alt="Fig. 5.7. Average Season Points Heatmap by Age for Careers Over 15 Years." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.7. Average Season Points Heatmap by Age for Careers Over 15 Years.</p>
</div>

### Conclusions

Conclusions based on Fig. 5.1-5.7:

- The average points scored tend to increase from ages 18 to 27-29.
- Ages 27-29 represent the prime for most players.
- The average points scored tend to decrease from ages 30 to 45.
- The 26-30 age range is the most productive period for most players.
- Great athletes with long careers exhibit a similar tendency.

### Simple Linear Regression

We can use regression to predict the points scored by age. However, this might be a somewhat naive approach. It's more straightforward to refer to Fig. 5.1 or 5.2 and determine the average points for any age. Also, given the relatively small amount of examples, it might not be ideal for machine learning. Nevertheless, we can apply linear and polynomial regression for visualization purposes only.

In [56]:
# Prepare the data.
x = df_avg_pts_by_age["Age"].values.reshape(-1, 1)
y = df_avg_pts_by_age["Points"]
# Create and fit the model.
model = LinearRegression()
model.fit(x, y)
# Add the predicted column to df.
df_avg_pts_by_age["Predicted"] = model.predict(x)

Plot the data and regression line:

In [125]:
plt.figure(figsize=(10, 6))
plt.scatter(df_avg_pts_by_age["Age"], df_avg_pts_by_age["Points"], label="Actual Points", color="blue")
plt.plot(df_avg_pts_by_age["Age"], df_avg_pts_by_age["Predicted"], label="Regression Line", color="red")
plt.title("Average Basketball Points by Age with Regression Line")
plt.xlabel("Age, years")
plt.ylabel("Average Points")
plt.legend()
plt.grid(True)
plt.savefig(
    "figures/5.8 Average Basketball Points by Age with Regression Line.png", 
    dpi=600, 
    bbox_inches="tight")
plt.close()

<div>
    <img src="figures/5.8 Average Basketball Points by Age with Regression Line.png" alt="Fig. 5.8. Average Basketball Points by Age with Regression Line." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.8. Average Basketball Points by Age with Regression Line.</p>
</div>

From the visualization, we can see that the regression line provides a general trend of the average points scored by players as they age. But we have a non-linear relationship between points and age, so linear regression works poorly.

### Polynomial regression

Define the polynomial regression model (2nd degree polynomial)

Create the model and  plot the data and regression curves:

In [126]:
# Prepare the data.
x = df_avg_pts_by_age["Age"].values.reshape(-1, 1)
y = df_avg_pts_by_age["Points"]
# Prepare fig
plt.figure(figsize=(10, 6))
# Create and fit the model using different degrees and colors.
deg_clrs = [(2, "red"), (5, "orange"), (7, "green")]
for deg_clr in deg_clrs:
    model = make_pipeline(PolynomialFeatures(deg_clr[0]), LinearRegression())
    model.fit(x, y)
    # Add the predicted column to df.
    df_avg_pts_by_age[f"{deg_clr[0]}_degree"] = model.predict(x)
    plt.plot(
        df_avg_pts_by_age["Age"], 
        df_avg_pts_by_age[f"{deg_clr[0]}_degree"], 
        color=deg_clr[1], 
        label=f'{deg_clr[0]} Degree Polynomial Regression'
    ) 
plt.scatter(df_avg_pts_by_age["Age"], df_avg_pts_by_age["Points"], color="blue", label="Actual Data")      
plt.title("Average Points by Age with Polynomial Regression Fit")
plt.xlabel("Age, years")
plt.ylabel("Average Points")
plt.legend()
plt.grid(True)
plt.savefig(
    "figures/5.9 Average Points by Age with Polynomial Regression Fit.png", 
    dpi=600, 
    bbox_inches="tight"
)
plt.close()

<div>
    <img src="figures/5.9 Average Points by Age with Polynomial Regression Fit.png" alt="Fig. 5.9. Average Points by Age with Polynomial Regression Fit." style="display: block; margin: 0 auto;">
    <p style="text-align: center;">Fig. 5.9. Average Points by Age with Polynomial Regression Fit.</p>
</div>

From this visualization, we can see that the polynomial regression fits much better than simple regression. At the same time, we can see overfitting when the polynomial degree is too high.

In [127]:
df_game_pts_age

Unnamed: 0,name,date,birth_date,Points,Age
1,Mark West,1984-04-15,1960-11-05,4.0,23
2,Derek Harper,1984-04-13,1961-10-13,17.0,23
4,Derek Harper,1984-04-10,1961-10-13,0.0,23
8,Terry Cummings,1984-04-06,1961-03-15,32.0,23
12,Terry Cummings,1984-04-01,1961-03-15,11.0,23
...,...,...,...,...,...
769579,Walker Russell,2012-02-28,1982-10-06,0.0,29
769580,Walker Russell,2012-04-06,1982-10-06,0.0,30
769581,Walker Russell,2012-02-01,1982-10-06,12.0,29
769582,Walker Russell,2012-02-10,1982-10-06,2.0,29


In [129]:
pd.set_option("display.max_columns", None)
result

Unnamed: 0,player_id,first_name,last_name,abbreviation,full_name,position,season,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,min,oreb,pf,pts,reb,stl,turnover,team_id,game_id,date,from_year_x,to_year_x,name,person_id,from_year_y,to_year_y,pos,height,weight,birth_date,college
0,2551,Edgar,Jones,SAS,San Antonio Spurs,,1984,,,,,,,0.500,4.0,2.0,0.000,1.0,0.0,0,,,4.0,,,,27,42232,1984-12-14,1980.0,1986.0,Edgar Jones,,,,,,,NaT,
1,538,Mark,West,DAL,Dallas Mavericks,,1983,1.0,0.0,1.0,,0.0,0.0,0.333,6.0,2.0,,0.0,0.0,19,2.0,5.0,4.0,3.0,0.0,2.0,7,42212,1984-04-15,1983.0,2000.0,Mark West,4812.0,1984.0,2000.0,C-F,6-10,230.0,1960-11-05,Old Dominion
2,574,Derek,Harper,DAL,Dallas Mavericks,,1983,4.0,0.0,1.0,,0.0,0.0,0.700,10.0,7.0,0.500,6.0,3.0,30,1.0,0.0,17.0,2.0,5.0,1.0,7,44349,1984-04-13,1983.0,1999.0,Derek Harper,1839.0,1984.0,1999.0,G,6-4,185.0,1961-10-13,Illinois
3,2247,Kareem,Abdul-Jabbar,LAL,Los Angeles Lakers,,1983,0.0,1.0,4.0,,0.0,0.0,0.611,18.0,11.0,0.750,8.0,6.0,35,3.0,3.0,28.0,7.0,0.0,2.0,14,44213,1984-04-10,1969.0,1989.0,Kareem Abdul-Jabbar,,,,,,,NaT,
4,574,Derek,Harper,DAL,Dallas Mavericks,,1983,3.0,0.0,1.0,,0.0,0.0,0.000,1.0,0.0,,0.0,0.0,18,0.0,0.0,0.0,1.0,1.0,1.0,7,39188,1984-04-10,1983.0,1999.0,Derek Harper,1839.0,1984.0,1999.0,G,6-4,185.0,1961-10-13,Illinois
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769579,1967,Walker,Russell,DET,Detroit Pistons,,2011,,,,,,,,,,,,,,,,,,,,9,27114,2012-02-28,2012.0,2012.0,Walker Russell,3957.0,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
769580,1967,Walker,Russell,DET,Detroit Pistons,,2011,,,,,,,,,,,,,,,,,,,,9,29484,2012-04-06,2012.0,2012.0,Walker Russell,3957.0,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
769581,1967,Walker,Russell,DET,Detroit Pistons,,2011,2.0,0.0,2.0,0.667,3.0,2.0,0.625,8.0,5.0,,0.0,0.0,21,4.0,3.0,12.0,6.0,0.0,0.0,9,26015,2012-02-01,2012.0,2012.0,Walker Russell,3957.0,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University
769582,1967,Walker,Russell,DET,Detroit Pistons,,2011,5.0,0.0,2.0,0.000,1.0,0.0,0.200,5.0,1.0,,0.0,0.0,26,0.0,2.0,2.0,2.0,0.0,3.0,9,24932,2012-02-10,2012.0,2012.0,Walker Russell,3957.0,2012.0,2012.0,G,6-0,170.0,1982-10-06,Jacksonville State University


Save the result into the result.csv.

In [130]:
result.to_csv("csv/result.csv")