In [22]:
import kagglehub
import pandas as pd
import numpy as np
from scipy import stats
import os



In [23]:
path = kagglehub.dataset_download("jacobbaruch/basketball-players-stats-per-season-49-leagues")

print("Dataset path:", path)
print(os.listdir(path))

Dataset path: C:\Users\jeroa\.cache\kagglehub\datasets\jacobbaruch\basketball-players-stats-per-season-49-leagues\versions\10
['players_stats_by_season_full_details.csv']


In [24]:
file = os.path.join(path, os.listdir(path)[0])

df = pd.read_csv(file)

df.head()

Unnamed: 0,League,Season,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,...,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team
0,NBA,1999 - 2000,Regular_Season,Shaquille O'Neal,LAL,79,3163.0,956,1665,0,...,"Mar 6, 1972",7-1,216.0,325.0,147.0,United States,Robert G. Cole High School,1.0,1.0,Orlando Magic
1,NBA,1999 - 2000,Regular_Season,Vince Carter,TOR,82,3126.0,788,1696,95,...,"Jan 26, 1977",6-6,198.0,220.0,100.0,United States,Mainland High School,1.0,5.0,Golden State Warriors
2,NBA,1999 - 2000,Regular_Season,Karl Malone,UTA,82,2947.0,752,1476,2,...,"Jul 24, 1963",6-9,206.0,265.0,120.0,United States,Summerfield High School,1.0,13.0,Utah Jazz
3,NBA,1999 - 2000,Regular_Season,Allen Iverson,PHI,70,2853.0,729,1733,89,...,"Jun 7, 1975",6-0,183.0,165.0,75.0,United States,Bethel High School,1.0,1.0,Philadelphia Sixers
4,NBA,1999 - 2000,Regular_Season,Gary Payton,SEA,82,3425.0,747,1666,177,...,"Jul 23, 1968",6-4,193.0,180.0,82.0,United States,Skyline High School,1.0,2.0,Seattle SuperSonics


In [25]:
player_seasons = df.groupby("Player")["Season"].nunique()

most_seasons_player = player_seasons.idxmax()

print("Player with most seasons:", most_seasons_player)

Player with most seasons: Vince Carter


In [26]:
player_data = df[df["Player"] == most_seasons_player]

player_data = player_data.sort_values("Season")

player_data["three_pt_accuracy"] = player_data["3PM"] / player_data["3PA"]

player_data[["Season","3PM","3PA","three_pt_accuracy"]]

Unnamed: 0,Season,3PM,3PA,three_pt_accuracy
1,1999 - 2000,95,236,0.402542
278,1999 - 2000,1,10,0.1
509,2000 - 2001,162,397,0.40806
702,2000 - 2001,25,61,0.409836
1047,2001 - 2002,121,313,0.386581
2761,2003 - 2004,93,243,0.382716
4114,2004 - 2005,127,313,0.405751
4371,2004 - 2005,6,19,0.315789
5333,2005 - 2006,125,367,0.340599
5535,2005 - 2006,14,58,0.241379


In [27]:
# Convert seasons to numeric index
years = np.arange(len(player_data))

accuracy = player_data["three_pt_accuracy"].values

slope, intercept, r, p, std_err = stats.linregress(years, accuracy)

print("Slope:", slope)
print("Intercept:", intercept)

Slope: 0.002104635691907271
Intercept: 0.3312864825228854


In [28]:
# Predicted values from regression line
predicted = slope * years + intercept

# Integrate area under curve
area = np.trapezoid(predicted, years)

# Average accuracy from regression
avg_fit_accuracy = area / (years.max() - years.min())

# Actual average accuracy
actual_avg = accuracy.mean()

print("Average from regression:", avg_fit_accuracy)
print("Actual average:", actual_avg)

Average from regression: 0.36180370005554074
Actual average: 0.3618037000555408


In [31]:
# Add missing seasons
player_data.loc["2002-2003"] = np.nan
player_data.loc["2015-2016"] = np.nan

# Sort seasons so interpolation works correctly
player_data = player_data.sort_index()

# Interpolate the 3-point accuracy
player_data["three_pt_accuracy"] = player_data["three_pt_accuracy"].interpolate()

# Show estimated values
print("2002-2003 estimate:")
print(player_data.loc["2002-2003", "three_pt_accuracy"])

print("2015-2016 estimate:")
print(player_data.loc["2015-2016", "three_pt_accuracy"])

2002-2003 estimate:
0.38464875951563915
2015-2016 estimate:
0.5391891891891891


In [32]:
# Select columns
fgm = nba["FGM"]
fga = nba["FGA"]

# FGM statistics
print("FGM Statistics")
print("Mean:", np.mean(fgm))
print("Variance:", np.var(fgm))
print("Skew:", stats.skew(fgm))
print("Kurtosis:", stats.kurtosis(fgm))

print()

# FGA statistics
print("FGA Statistics")
print("Mean:", np.mean(fga))
print("Variance:", np.var(fga))
print("Skew:", stats.skew(fga))
print("Kurtosis:", stats.kurtosis(fga))

FGM Statistics
Mean: 205.75357002489193
Variance: 33910.08141821274
Skew: 0.9112095540871205
Kurtosis: 0.14840202159023086

FGA Statistics
Mean: 450.0568583780951
Variance: 157384.39084546792
Skew: 0.8749440546145822
Kurtosis: 0.060890495009315426


In [33]:
paired_test = stats.ttest_rel(fgm, fga)
independent_test = stats.ttest_ind(fgm, fga)

print("Relational (paired) t-test:", paired_test)
print("Regular (independent) t-test:", independent_test)

Relational (paired) t-test: TtestResult(statistic=np.float64(-98.76011434943148), pvalue=np.float64(0.0), df=np.int64(7632))
Regular (independent) t-test: TtestResult(statistic=np.float64(-48.79746746778874), pvalue=np.float64(0.0), df=np.float64(15264.0))
