In [1]:
import pandas as pd
from tqdm import tqdm
%load_ext autotime

In [2]:
def preprocess_data(data):
    # Remove page breaks
    data = data[data['Rk'] != 'Rk']
    
    # Remove players individual team stats (only keep row with total stats)
    data = data.drop_duplicates(subset=['Player'])
    
    # Remove percentage columns
    non_percentage_columns = [c for c in data.columns if '%' not in c]
    data = data[non_percentage_columns]
    
    # Only keep players who are not listed under multiple positions
    data = data[data['Pos'].isin(['PG', 'SG', 'SF', 'PF', 'C'])]
    
    return data

time: 3.62 ms


In [4]:
all_preprocessed_data = []
for year in tqdm(range(2000,2020)):
    # Read data
    raw_data = pd.read_html("https://www.basketball-reference.com/leagues/NBA_{}_totals.html".format(year))[0]
    
    raw_data.to_csv("../data/raw_data/{}.csv".format(year), index=False)
    
    preprocessed_data = preprocess_data(raw_data)
    
    preprocessed_data.to_csv("../data/preprocessed_data/{}.csv".format(year), index=False)
    
    all_preprocessed_data.append(preprocessed_data)

time: 994 µs


In [61]:
all_preprocessed_df = pd.concat(all_preprocessed_data, axis=0)
all_preprocessed_df.to_csv("../data/all_preprocessed_data.csv", index=False)

time: 284 ms


In [5]:
df = pd.read_csv("../data/all_preprocessed_data.csv")

time: 104 ms


In [7]:
# Remove players who did not play any minutes and divide all total stats by minutes played to put them on a per minute scale
df = df[df.MP != 0]
df.iloc[:,8:] = df.iloc[:,8:].div(df.MP, axis=0)

time: 87.5 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
df

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Tariq Abdul-Wahad,SG,25,TOT,61,56,1578,274,646,...,193,101,190,291,98,59,28,106,147,697
1,2,Shareef Abdur-Rahim,SF,23,VAN,82,82,3223,594,1277,...,551,218,607,825,271,89,87,249,244,1663
2,3,Cory Alexander,PG,26,DEN,29,2,329,28,98,...,22,8,34,42,58,24,2,28,39,82
3,4,Ray Allen*,SG,24,MIL,82,82,3070,642,1411,...,398,83,276,359,308,110,19,183,187,1809
4,5,Rafer Alston,PG,23,MIL,27,0,361,27,95,...,4,5,18,23,70,12,0,29,29,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9139,526,Trae Young,PG,20,ATL,81,81,2503,525,1256,...,414,64,237,301,653,72,15,308,140,1549
9140,527,Cody Zeller,C,26,CHO,49,47,1243,190,345,...,141,110,223,333,102,38,41,62,164,497
9141,528,Tyler Zeller,C,29,TOT,6,1,93,16,30,...,18,11,13,24,4,1,3,4,20,46
9142,529,Ante Žižić,C,22,CLE,59,25,1082,183,331,...,132,108,212,320,53,13,22,61,113,459


time: 99.7 ms
