In [1]:
import pandas as pd
import glob

# Name of the merged file. It is written into the same directory that is
# globbed for inputs, so it has to be filtered out of the input list below.
COMBINED_CSV = "players_2005_2025.csv"

# Collect every CSV in the working directory except the merged output itself.
# Without the filter, re-running this cell would read the previous result back
# in and append every row a second time.
csv_files = sorted(f for f in glob.glob("*.csv") if f != COMBINED_CSV)
if not csv_files:
    # pd.concat on an empty list fails with "No objects to concatenate";
    # say what is actually wrong instead.
    raise FileNotFoundError("No input CSV files found in the current directory")

# Stack all input files row-wise, renumbering the index from 0
all_data = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# Save the merged DataFrame without the synthetic row index
all_data.to_csv(COMBINED_CSV, index=False)

print(f"✅ Combined CSV saved as {COMBINED_CSV}")
print("Shape:", all_data.shape)



✅ Combined CSV saved as players_2005_2025.csv
Shape: (13313, 52)


In [3]:
import pandas as pd

# Load the merged player table from disk
players = pd.read_csv("players_2005_2025.csv")

# --- 1. Basic Info ---
print(players.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
players.info()

(13313, 52)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13313 entries, 0 to 13312
Data columns (total 52 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  13313 non-null  object 
 1   Age     13292 non-null  float64
 2   Team    13292 non-null  object 
 3   Pos     13292 non-null  object 
 4   G       13292 non-null  float64
 5   GS      13292 non-null  float64
 6   MP      13292 non-null  float64
 7   PER     13287 non-null  float64
 8   TS%     13233 non-null  float64
 9   3PAr    13230 non-null  float64
 10  FTr     13230 non-null  float64
 11  ORB%    13308 non-null  float64
 12  DRB%    13308 non-null  float64
 13  TRB%    13308 non-null  float64
 14  AST%    13308 non-null  float64
 15  STL%    13308 non-null  float64
 16  BLK%    13308 non-null  float64
 17  TOV%    13244 non-null  float64
 18  USG%    13308 non-null  float64
 19  OWS     13292 non-null  float64
 20  DWS     13292 non-null  float64
 21  WS      13292 non-null 

In [4]:
# --- 2. Missing values ---
# Null count per column, shown with the most incomplete columns first
null_counts = players.isnull().sum()
print(null_counts.sort_values(ascending=False))

Awards    12024
3P%        1423
FT%         493
2P%          94
3PAr         83
FTr          83
TS%          80
TOV%         69
eFG%         45
FG%          45
PER          26
BPM          26
OBPM         26
DBPM         26
WS/48        26
Pos          21
OWS          21
MP           21
Team         21
Age          21
2P           21
3PA          21
3P           21
2PA          21
FT           21
FTA          21
ORB          21
DRB          21
GS           21
G            21
VORP         21
MPG          21
FGA          21
DWS          21
FG           21
WS           21
TRB          21
AST          21
STL          21
BLK          21
TOV          21
PF           21
PTS          21
STL%          5
TRB%          5
USG%          5
AST%          5
BLK%          5
ORB%          5
DRB%          5
Player        0
Season        0
dtype: int64


In [5]:
# --- 3. Data types summary ---
# Number of columns stored under each dtype
dtype_counts = players.dtypes.value_counts()
print(dtype_counts)

float64    47
object      4
int64       1
Name: count, dtype: int64


In [6]:
# --- 4. Basic descriptive statistics ---
# Summary table produced by DataFrame.describe() with its default settings
summary_stats = players.describe()
print(summary_stats)

                Age             G            GS            MP           PER  \
count  13292.000000  13292.000000  13292.000000  13292.000000  13287.000000   
mean      26.410322     44.417168     20.687857   1016.826738     12.551637   
std        4.193948     25.847029     26.380506    843.289307      6.638989   
min       18.000000      1.000000      0.000000      0.000000    -54.400000   
25%       23.000000     21.000000      0.000000    251.000000      9.500000   
50%       26.000000     47.000000      6.000000    824.000000     12.600000   
75%       29.000000     68.000000     36.000000   1673.000000     15.900000   
max       44.000000     84.000000     83.000000   3424.000000    133.800000   

                TS%          3PAr          FTr          ORB%          DRB%  \
count  13233.000000  13230.000000  13230.00000  13308.000000  13308.000000   
mean       0.522514      0.302623      0.27830      5.396356     14.673843   
std        0.106789      0.230277      0.23186      5.

In [8]:
# --- 5. Duplicate check ---
# A row counts as a duplicate when all three key columns repeat an earlier row.
# The printed label is built from the same list so it cannot drift away from
# the columns that are actually checked.
dup_keys = ['Player', 'Season', 'Team']
duplicates = players.duplicated(subset=dup_keys).sum()
print(f"Duplicate {'-'.join(dup_keys)} entries: {duplicates}")

Duplicate Player-Season entries: 0


In [3]:
import pandas as pd

# Load the merged player table from disk
players = pd.read_csv("players_2005_2025.csv")

# --- 1. Basic Info ---
print(players.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
players.info()

# --- 2. Missing values ---
# Null count per column, most incomplete columns first
print(players.isnull().sum().sort_values(ascending=False))

# --- 3. Data types summary ---
# Number of columns stored under each dtype
print(players.dtypes.value_counts())

# --- 4. Basic descriptive statistics ---
print(players.describe())

# --- 5. Duplicate check ---
# Keyed on Player + Season only: any player with more than one row in the same
# season is counted here.
# NOTE(review): presumably such rows include per-team splits for players who
# changed teams mid-season — confirm before treating a non-zero count as dirty data.
duplicates = players.duplicated(subset=['Player', 'Season']).sum()
print(f"Duplicate Player-Season entries: {duplicates}")

# --- 6. Range checks (e.g. FG%, TS%) ---
# Smallest and largest observed value of each shooting-percentage column
print(players[['FG%', 'TS%']].min())
print(players[['FG%', 'TS%']].max())

# --- 7. Outlier check (e.g., PER unrealistic values) ---
# Lists every row whose PER exceeds 40; only the upper side is inspected here.
print(players[players['PER'] > 40])  # should be none ideally


(13313, 52)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13313 entries, 0 to 13312
Data columns (total 52 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  13313 non-null  object 
 1   Age     13292 non-null  float64
 2   Team    13292 non-null  object 
 3   Pos     13292 non-null  object 
 4   G       13292 non-null  float64
 5   GS      13292 non-null  float64
 6   MP      13292 non-null  float64
 7   PER     13287 non-null  float64
 8   TS%     13233 non-null  float64
 9   3PAr    13230 non-null  float64
 10  FTr     13230 non-null  float64
 11  ORB%    13308 non-null  float64
 12  DRB%    13308 non-null  float64
 13  TRB%    13308 non-null  float64
 14  AST%    13308 non-null  float64
 15  STL%    13308 non-null  float64
 16  BLK%    13308 non-null  float64
 17  TOV%    13244 non-null  float64
 18  USG%    13308 non-null  float64
 19  OWS     13292 non-null  float64
 20  DWS     13292 non-null  float64
 21  WS      13292 non-null 

In [2]:
import pandas as pd
import glob
import json

# Name of the merged file. It is written into the same directory that is
# globbed for inputs, so it has to be filtered out of the input list below.
COMBINED_JSON = "teams_2005_2025.json"


def load_json_frame(path):
    """Parse one JSON file and return its contents as a DataFrame."""
    # JSON text is UTF-8 by specification; be explicit rather than relying on
    # the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))


# Collect every JSON file in the working directory except the merged output.
# Without the filter, re-running this cell would read the previous result back
# in and append its rows to the new result.
json_files = sorted(f for f in glob.glob("*.json") if f != COMBINED_JSON)
if not json_files:
    # pd.concat on an empty list fails with "No objects to concatenate";
    # say what is actually wrong instead.
    raise FileNotFoundError("No input JSON files found in the current directory")

# Combine all JSONs into one DataFrame, renumbering the index from 0
dfs = [load_json_frame(path) for path in json_files]
all_json = pd.concat(dfs, ignore_index=True)

# Save combined file. The default column orientation stores each column as an
# {index label: value} mapping, so the index cannot be left out; passing
# index=False here is rejected by older pandas releases and ignored by newer
# ones, hence it is not passed.
all_json.to_json(COMBINED_JSON)

print(f"✅ Combined JSON saved as {COMBINED_JSON}")
print("Shape:", all_json.shape)


✅ Combined JSON saved as teams_2005_2025.json
Shape: (626, 54)


In [9]:
import json
import pandas as pd

# --- 1. Load JSON ---
teams = pd.read_json("teams_2005_2025.json")

# --- 2. Basic structure ---
print(teams.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
teams.info()

# --- 3. View column names and sample ---
print(teams.columns.tolist())
print(teams.head())

# --- 4. Check for missing values ---
# Null count per column, in column order
print(teams.isnull().sum())

# --- 5. Basic stats for numerical variables ---
print(teams.describe())

# --- 6. Duplicate check ---
# A row counts as a duplicate when its (Team, Season) pair repeats an earlier row
duplicates = teams.duplicated(subset=['Team', 'Season']).sum()
print(f"Duplicate Team-Season entries: {duplicates}")

# --- 7. Range and logic checks ---
# Smallest and largest observed value of each rating / win-loss column
print(teams[['ORtg', 'DRtg', 'NRtg', 'W', 'L']].min())
print(teams[['ORtg', 'DRtg', 'NRtg', 'W', 'L']].max())

# --- 8. Attendance validation ---
# Distribution summary of the 'Attend./G' column
print(teams['Attend./G'].describe())


(626, 54)
<class 'pandas.core.frame.DataFrame'>
Index: 626 entries, 0 to 625
Data columns (total 54 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Team          626 non-null    object 
 1   Conference    626 non-null    object 
 2   Seed          626 non-null    int64  
 3   Championship  626 non-null    int64  
 4   Season        626 non-null    int64  
 5   G             626 non-null    int64  
 6   MP            626 non-null    float64
 7   FG            626 non-null    float64
 8   FGA           626 non-null    float64
 9   FG%           626 non-null    float64
 10  3P            626 non-null    float64
 11  3PA           626 non-null    float64
 12  3P%           626 non-null    float64
 13  2P            626 non-null    float64
 14  2PA           626 non-null    float64
 15  2P%           626 non-null    float64
 16  FT            626 non-null    float64
 17  FTA           626 non-null    float64
 18  FT%           626 non-nul