# libraries

In [31]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
from math import log
from decimal import Decimal
import altair as alt
import kagglehub as kh

# import data

## Louisville roster data

In [36]:
lville_roster_11_15 = pd.read_csv('./data/lville-roster_11_15.csv')

In [37]:
lville_roster_11_15

Unnamed: 0,Player,#,Class,Pos,Height,Weight,Hometown,RSCI Top 100,Summary,Season
0,Kyle Kuric,14,SR,G,6-4,195,Evansville IN,,12.6 Pts 4.2 Reb 1.2 Ast,2011-12
1,Russ Smith,2,SO,G,6-0,165,Brooklyn NY,,11.5 Pts 2.5 Reb 1.9 Ast,2011-12
2,Chris Smith,5,SR,G,6-2,195,Millstone NJ,,9.7 Pts 3.6 Reb 1.9 Ast,2011-12
3,Chane Behanan,21,FR,F,6-6,250,Cincinnati OH,24 (2011),9.5 Pts 7.5 Reb 0.8 Ast,2011-12
4,Gorgui Dieng,10,SO,C,6-11,245,"Kébémer, Senegal",69 (2010),9.1 Pts 9.1 Reb 1.1 Ast,2011-12
...,...,...,...,...,...,...,...,...,...,...
56,David Levitch,23,SO,G,6-3,180,Goshen KY,,1.1 Pts 0.3 Reb 0.3 Ast,2014-15
57,Trent Gilbert,15,FR,G,5-10,180,Georgetown KY,,1.5 Pts 0.5 Reb 0.5 Ast,2014-15
58,Dillon Avare,4,FR,G,6-0,150,Lexington KY,,0.5 Pts 0.5 Reb 0.0 Ast,2014-15
59,Matz Stockman,5,FR,C,7-0,245,Oslo Norway,,1.0 Pts 2.8 Reb 0.3 Ast,2014-15


In [38]:
# create a new column that is No if RSCI Top 100 is Naan else yes
lville_roster_11_15['Ranked'] = lville_roster_11_15['RSCI Top 100'].apply(lambda x: 'No' if pd.isna(x) else 'Yes')

# fill in nans in RSCI Top 100 with Unranked
lville_roster_11_15['RSCI Top 100'] = lville_roster_11_15['RSCI Top 100'].fillna('Unranked')

#split on space before a number and create 3 new columns for Pts, Reb, Ast
lville_roster_11_15[['PPG', 'RPG', 'APG']] = lville_roster_11_15['Summary'].str.extract(r'(\d+\.?\d*) Pts (\d+\.?\d*) Reb (\d+\.?\d*) Ast').astype(float)

# remove summary column
lville_roster_11_15 = lville_roster_11_15.drop(columns=['Summary'])

# change height to cm
lville_roster_11_15['Height_cm'] = lville_roster_11_15['Height'].str.split('-').apply(lambda x: round(int(x[0]) * 30.48 + int(x[1]) * 2.54, 1))

# change hometown to 2 columns city and state splitting on space if 
lville_roster_11_15['International'] = lville_roster_11_15['Hometown'].apply(lambda x: 'No' if isinstance(x, str) and x[-2:].isupper() else 'Yes')

lville_roster_11_15


Unnamed: 0,Player,#,Class,Pos,Height,Weight,Hometown,RSCI Top 100,Season,Ranked,PPG,RPG,APG,Height_cm,International
0,Kyle Kuric,14,SR,G,6-4,195,Evansville IN,Unranked,2011-12,No,12.6,4.2,1.2,193.0,No
1,Russ Smith,2,SO,G,6-0,165,Brooklyn NY,Unranked,2011-12,No,11.5,2.5,1.9,182.9,No
2,Chris Smith,5,SR,G,6-2,195,Millstone NJ,Unranked,2011-12,No,9.7,3.6,1.9,188.0,No
3,Chane Behanan,21,FR,F,6-6,250,Cincinnati OH,24 (2011),2011-12,Yes,9.5,7.5,0.8,198.1,No
4,Gorgui Dieng,10,SO,C,6-11,245,"Kébémer, Senegal",69 (2010),2011-12,Yes,9.1,9.1,1.1,210.8,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,David Levitch,23,SO,G,6-3,180,Goshen KY,Unranked,2014-15,No,1.1,0.3,0.3,190.5,No
57,Trent Gilbert,15,FR,G,5-10,180,Georgetown KY,Unranked,2014-15,No,1.5,0.5,0.5,177.8,No
58,Dillon Avare,4,FR,G,6-0,150,Lexington KY,Unranked,2014-15,No,0.5,0.5,0.0,182.9,No
59,Matz Stockman,5,FR,C,7-0,245,Oslo Norway,Unranked,2014-15,No,1.0,2.8,0.3,213.4,Yes


In [39]:
# remove duplicate player names, taking first instance
lville_roster_11_15 = lville_roster_11_15.drop_duplicates(subset=['Player'], keep='first')

lville_roster_11_15

Unnamed: 0,Player,#,Class,Pos,Height,Weight,Hometown,RSCI Top 100,Season,Ranked,PPG,RPG,APG,Height_cm,International
0,Kyle Kuric,14,SR,G,6-4,195,Evansville IN,Unranked,2011-12,No,12.6,4.2,1.2,193.0,No
1,Russ Smith,2,SO,G,6-0,165,Brooklyn NY,Unranked,2011-12,No,11.5,2.5,1.9,182.9,No
2,Chris Smith,5,SR,G,6-2,195,Millstone NJ,Unranked,2011-12,No,9.7,3.6,1.9,188.0,No
3,Chane Behanan,21,FR,F,6-6,250,Cincinnati OH,24 (2011),2011-12,Yes,9.5,7.5,0.8,198.1,No
4,Gorgui Dieng,10,SO,C,6-11,245,"Kébémer, Senegal",69 (2010),2011-12,Yes,9.1,9.1,1.1,210.8,Yes
5,Peyton Siva,3,JR,G,6-0,185,Seattle WA,27 (2009),2011-12,Yes,9.1,3.2,5.6,182.9,No
6,Jared Swopshire,12,JR,F,6-8,210,St. Louis MO,Unranked,2011-12,No,3.3,2.8,0.4,203.2,No
7,Rakeem Buckles,5,JR,F,6-7,215,Miami FL,54 (2009),2011-12,Yes,4.0,3.8,0.3,200.7,No
8,Wayne Blackshear,25,FR,G,6-5,215,Chicago IL,26 (2011),2011-12,Yes,2.5,1.4,0.1,195.6,No
9,Angel Nunez,2,FR,F,6-8,202,Washington Heights NY,Unranked,2011-12,No,2.0,0.7,0.2,203.2,No


In [40]:
# create new column that is State if last 2 letters of Hometown are uppercase else International
lville_roster_11_15['State'] = lville_roster_11_15['Hometown'].apply(lambda x: x[-2:] if isinstance(x, str) and x[-2:].isupper() else 'International')

lville_roster_11_15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lville_roster_11_15['State'] = lville_roster_11_15['Hometown'].apply(lambda x: x[-2:] if isinstance(x, str) and x[-2:].isupper() else 'International')


Unnamed: 0,Player,#,Class,Pos,Height,Weight,Hometown,RSCI Top 100,Season,Ranked,PPG,RPG,APG,Height_cm,International,State
0,Kyle Kuric,14,SR,G,6-4,195,Evansville IN,Unranked,2011-12,No,12.6,4.2,1.2,193.0,No,IN
1,Russ Smith,2,SO,G,6-0,165,Brooklyn NY,Unranked,2011-12,No,11.5,2.5,1.9,182.9,No,NY
2,Chris Smith,5,SR,G,6-2,195,Millstone NJ,Unranked,2011-12,No,9.7,3.6,1.9,188.0,No,NJ
3,Chane Behanan,21,FR,F,6-6,250,Cincinnati OH,24 (2011),2011-12,Yes,9.5,7.5,0.8,198.1,No,OH
4,Gorgui Dieng,10,SO,C,6-11,245,"Kébémer, Senegal",69 (2010),2011-12,Yes,9.1,9.1,1.1,210.8,Yes,International
5,Peyton Siva,3,JR,G,6-0,185,Seattle WA,27 (2009),2011-12,Yes,9.1,3.2,5.6,182.9,No,WA
6,Jared Swopshire,12,JR,F,6-8,210,St. Louis MO,Unranked,2011-12,No,3.3,2.8,0.4,203.2,No,MO
7,Rakeem Buckles,5,JR,F,6-7,215,Miami FL,54 (2009),2011-12,Yes,4.0,3.8,0.3,200.7,No,FL
8,Wayne Blackshear,25,FR,G,6-5,215,Chicago IL,26 (2011),2011-12,Yes,2.5,1.4,0.1,195.6,No,IL
9,Angel Nunez,2,FR,F,6-8,202,Washington Heights NY,Unranked,2011-12,No,2.0,0.7,0.2,203.2,No,NY


In [41]:
# take only first 2 numbers in RSCI Top 100 if value starts with a number, if value = Unranked keep value
lville_roster_11_15['RSCI Top 100'] = lville_roster_11_15['RSCI Top 100'].apply(lambda x: x[:2] if isinstance(x, str) and x[0].isdigit() else x if x != 'Unranked' else x)

lville_roster_11_15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lville_roster_11_15['RSCI Top 100'] = lville_roster_11_15['RSCI Top 100'].apply(lambda x: x[:2] if isinstance(x, str) and x[0].isdigit() else x if x != 'Unranked' else x)


Unnamed: 0,Player,#,Class,Pos,Height,Weight,Hometown,RSCI Top 100,Season,Ranked,PPG,RPG,APG,Height_cm,International,State
0,Kyle Kuric,14,SR,G,6-4,195,Evansville IN,Unranked,2011-12,No,12.6,4.2,1.2,193.0,No,IN
1,Russ Smith,2,SO,G,6-0,165,Brooklyn NY,Unranked,2011-12,No,11.5,2.5,1.9,182.9,No,NY
2,Chris Smith,5,SR,G,6-2,195,Millstone NJ,Unranked,2011-12,No,9.7,3.6,1.9,188.0,No,NJ
3,Chane Behanan,21,FR,F,6-6,250,Cincinnati OH,24,2011-12,Yes,9.5,7.5,0.8,198.1,No,OH
4,Gorgui Dieng,10,SO,C,6-11,245,"Kébémer, Senegal",69,2011-12,Yes,9.1,9.1,1.1,210.8,Yes,International
5,Peyton Siva,3,JR,G,6-0,185,Seattle WA,27,2011-12,Yes,9.1,3.2,5.6,182.9,No,WA
6,Jared Swopshire,12,JR,F,6-8,210,St. Louis MO,Unranked,2011-12,No,3.3,2.8,0.4,203.2,No,MO
7,Rakeem Buckles,5,JR,F,6-7,215,Miami FL,54,2011-12,Yes,4.0,3.8,0.3,200.7,No,FL
8,Wayne Blackshear,25,FR,G,6-5,215,Chicago IL,26,2011-12,Yes,2.5,1.4,0.1,195.6,No,IL
9,Angel Nunez,2,FR,F,6-8,202,Washington Heights NY,Unranked,2011-12,No,2.0,0.7,0.2,203.2,No,NY


In [42]:
# save as a excel file
lville_roster_11_15.to_excel('lville-roster_11_15_cleaned.xlsx', index=False)

lville_roster_11_15

Unnamed: 0,Player,#,Class,Pos,Height,Weight,Hometown,RSCI Top 100,Season,Ranked,PPG,RPG,APG,Height_cm,International,State
0,Kyle Kuric,14,SR,G,6-4,195,Evansville IN,Unranked,2011-12,No,12.6,4.2,1.2,193.0,No,IN
1,Russ Smith,2,SO,G,6-0,165,Brooklyn NY,Unranked,2011-12,No,11.5,2.5,1.9,182.9,No,NY
2,Chris Smith,5,SR,G,6-2,195,Millstone NJ,Unranked,2011-12,No,9.7,3.6,1.9,188.0,No,NJ
3,Chane Behanan,21,FR,F,6-6,250,Cincinnati OH,24,2011-12,Yes,9.5,7.5,0.8,198.1,No,OH
4,Gorgui Dieng,10,SO,C,6-11,245,"Kébémer, Senegal",69,2011-12,Yes,9.1,9.1,1.1,210.8,Yes,International
5,Peyton Siva,3,JR,G,6-0,185,Seattle WA,27,2011-12,Yes,9.1,3.2,5.6,182.9,No,WA
6,Jared Swopshire,12,JR,F,6-8,210,St. Louis MO,Unranked,2011-12,No,3.3,2.8,0.4,203.2,No,MO
7,Rakeem Buckles,5,JR,F,6-7,215,Miami FL,54,2011-12,Yes,4.0,3.8,0.3,200.7,No,FL
8,Wayne Blackshear,25,FR,G,6-5,215,Chicago IL,26,2011-12,Yes,2.5,1.4,0.1,195.6,No,IL
9,Angel Nunez,2,FR,F,6-8,202,Washington Heights NY,Unranked,2011-12,No,2.0,0.7,0.2,203.2,No,NY
