In [1]:
import json
import pandas as pd

# Read the raw project export. The encoding is pinned to UTF-8 so that any
# non-ASCII text is decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open('src/data/intern_project_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build one DataFrame per top-level section of the JSON document.
# Note: the source keys mix naming styles ('game_logs' vs. 'seasonLogs').
bio_df = pd.DataFrame(data['bio'])
game_logs_df = pd.DataFrame(data['game_logs'])
season_logs_df = pd.DataFrame(data['seasonLogs'])
measurements_df = pd.DataFrame(data['measurements'])
scouting_df = pd.DataFrame(data['scoutingReports'])

In [8]:
# Preview the first five bio rows to see which columns are available
bio_df.iloc[:5]

Unnamed: 0,name,playerId,firstName,lastName,birthDate,height,weight,highSchool,highSchoolState,homeTown,homeState,homeCountry,nationality,photoUrl,currentTeam,league,leagueType
0,Cooper Flagg,170531,Cooper,Flagg,2006-12-21,81,209,Montverde,FL,New Portland,ME,USA,USA,https://www.draftexpress.com/blue/images/heads...,Duke,NCAA D-I,NCAA
1,Dylan Harper,161608,Dylan,Harper,2006-03-02,78,215,Don Bosco Preparatory HS,NJ,Franklin Lakes,NJ,USA,USA,https://www.draftexpress.com/blue/images/heads...,Rutgers,NCAA D-I,NCAA
2,Ace Bailey,161710,Ace,Bailey,2006-08-13,82,210,McEachern HS,GA,Chattanooga,TN,USA,USA,,Rutgers,NCAA D-I,NCAA
3,VJ Edgecombe,181834,VJ,Edgecombe,2005-07-30,77,188,Long Island Lutheran HS,NY,Alice Town,,Bahamas,Bahamas,https://www.draftexpress.com/blue/images/heads...,Baylor,NCAA D-I,NCAA
4,Kasparas Jakucionis,140378,Kasparas,Jakucionis,2006-05-29,78,205,,,Vilnius,,Lithuania,Lithuania,https://www.draftexpress.com/blue/images/heads...,Illinois,NCAA D-I,NCAA


In [6]:
# Row count of the bio table, used here as the total number of players
bio_df.shape[0]

60

In [7]:
# Number of missing values in each column of the measurements table
measurements_df.isna().sum()

playerId           0
heightNoShoes      1
heightShoes        0
wingspan           1
reach              1
maxVertical        9
noStepVertical     9
weight             1
bodyFat           54
handLength         3
handWidth          3
agility           12
sprint            12
shuttleLeft       56
shuttleRight      56
shuttleBest       17
dtype: int64

In [11]:
# Number of missing values in each column of the bio table
bio_df.isna().sum()

name                0
playerId            0
firstName           0
lastName            0
birthDate           0
height              0
weight              0
highSchool         15
highSchoolState    19
homeTown            0
homeState          22
homeCountry         0
nationality         0
photoUrl           12
currentTeam         0
league              0
leagueType          0
dtype: int64

In [9]:
# Game-log rows per playerId, keeping the ten most frequent ids
game_logs_df['playerId'].value_counts().iloc[:10]

playerId
99451     10
118028    10
123733    10
124543    10
129665    10
129693    10
130696    10
136466    10
138145    10
139181    10
Name: count, dtype: int64

In [10]:
# Store each report's whitespace-separated word count in a 'reportLength'
# column on scouting_df, then show the five longest reports.
scouting_df['reportLength'] = (
    scouting_df['report']
    .str.split()
    .str.len()
)
(
    scouting_df
    .sort_values(by='reportLength', ascending=False)
    .iloc[:5]
)

Unnamed: 0,scout,reportId,playerId,report,reportLength
0,Sam Vecenie,32fb9c99-733a-4f5a-8142-912c8aabb364,161710,"Lorem ipsum dolor sit amet, consectetur adipis...",69
1,Sam Vecenie,ed7bcf33-cf7b-4103-88ea-7eaadedd8a40,181834,"Lorem ipsum dolor sit amet, consectetur adipis...",69
2,Kyle Boone,6b682d97-ae9a-461a-b5e0-bb9a77b349b3,181834,"Lorem ipsum dolor sit amet, consectetur adipis...",69
3,Sam Vecenie,1c0702c5-4a24-404c-b2cc-343be16ae5a7,161852,"Lorem ipsum dolor sit amet, consectetur adipis...",69
4,Kyle Boone,03277a94-63a7-452f-a910-ea1c8c9029c0,164614,"Lorem ipsum dolor sit amet, consectetur adipis...",69


In [19]:
from collections import Counter

# Tally every team name that appears in the game logs, whether it is the
# player's own team or the opponent. Each row contributes both names, in
# that order.
team_counts = Counter()
for game in game_logs_df.to_dict(orient="records"):
    team_counts.update((game["team"], game["opponent"]))

# Tabulate the tallies with the most frequent teams first
most_common_teams = pd.DataFrame(
    team_counts.items(), columns=["Team", "Appearances"]
).sort_values("Appearances", ascending=False)

most_common_teams.iloc[:60]

Unnamed: 0,Team,Appearances
27,Duke,71
9,Arkansas,35
46,North Carolina,33
4,Connecticut,27
10,Texas Tech,26
83,Illinois,25
76,Florida,22
41,Houston,22
129,Rutgers,21
43,Arizona,20


In [20]:
# Number of players per nationality, most common first
nationality_counts = bio_df['nationality'].value_counts()

# Lay the counts out as a two-column table for easy viewing
nationality_counts_df = pd.DataFrame({
    'Nationality': nationality_counts.index,
    'Player Count': nationality_counts.values,
})

# Display the whole table (one row per nationality)
nationality_counts_df

Unnamed: 0,Nationality,Player Count
0,USA,33
1,France,5
2,Australia,4
3,Israel,2
4,Spain,2
5,Croatia,2
6,Dominican Republic,2
7,Russia,1
8,Bahamas,1
9,Canada,1
