In [41]:
import numpy as np
import os
import pandas as pd
import re

In [74]:
def list_dir(path):
    '''Return the entries of `path`, leaving out names that begin with a dot.

    Entries come back in the order `os.listdir` reports them.
    '''
    visible = []
    for entry in os.listdir(path):
        if entry.startswith('.'):
            continue
        visible.append(entry)
    return visible


def year2szn(year):
    '''Build the season label for the season that ends in `year`.

    The label is the previous year, a hyphen, then the last two characters
    of `year`, e.g. 2023 -> "2022-23".
    '''
    season_start = str(year - 1)
    season_end = str(year)[-2:]
    return season_start + "-" + season_end


def combine_csvs(data_folder):
    ''' Merge every season's per-team CSVs into one DataFrame.

    Each visible sub-directory of `data_folder` is treated as one season and
    its name is stored in the `Year` column. A season is used only if it
    contains szn_totals.csv; the remaining files are outer-merged onto it on
    the `Team` column, and any that are missing are skipped:
        - szn_totals.csv
        - szn_totals_against.csv
        - advanced.csv
        - shooting.csv
        - shooting_against.csv
        - szn_per100poss.csv
        - szn_per100poss_against.csv
        - standings.csv
    A column whose name already exists in the merged frame gets the source
    file name (without ".csv") appended as a suffix.

    Parameters
    ----------
    data_folder : str
        Directory holding one sub-directory per season.

    Returns
    -------
    pd.DataFrame
        All seasons stacked, sorted by `Year` then `Rk`, with a fresh index.

    Raises
    ------
    ValueError
        If no sub-directory of `data_folder` contains szn_totals.csv.
    '''
    dfs = []
    file_names = ['szn_totals.csv','szn_totals_against.csv','advanced.csv',
                  'shooting.csv','shooting_against.csv',
                  'szn_per100poss.csv','szn_per100poss_against.csv', 'standings.csv'
                 ]
    merge_column = 'Team'
    for subfolder in list_dir(data_folder):
        folderpath = os.path.join(data_folder, subfolder)
        if not os.path.isdir(folderpath):
            continue
        # szn_totals.csv is the base table; a season without it is skipped.
        csv_path = os.path.join(folderpath, file_names[0])
        if not os.path.exists(csv_path):
            continue

        merged_df = pd.read_csv(csv_path)
        # Team names may carry '*' markers; `.str.strip` leaves a missing name
        # as NaN instead of raising like a plain `x.strip('*')` would.
        merged_df[merge_column] = merged_df[merge_column].str.strip('*')

        for file_name in file_names[1:]:
            current_csv_path = os.path.join(folderpath, file_name)
            if not os.path.exists(current_csv_path):
                continue
            current_df = pd.read_csv(current_csv_path)
            # When the first data row holds the real header (it contains
            # 'Team'), promote it to the column names and discard that row.
            if not current_df.empty and 'Team' in current_df.iloc[0].values:
                current_df.columns = current_df.iloc[0]
                current_df = current_df.drop(current_df.index[0])
            if subfolder == '2022-23' and file_name == 'standings.csv':
                # NOTE(review): this season's standings names have their last
                # five characters cut off. Presumably that removes a trailing
                # seed marker; confirm it is always exactly five characters.
                current_df[merge_column] = current_df[merge_column].str[:-5]
            current_df[merge_column] = current_df[merge_column].str.strip('*')
            merged_df = pd.merge(merged_df, current_df, on=merge_column, how='outer',
                                 suffixes=('', '_' + file_name[:-4]))

        # Tag every row with the season it came from.
        merged_df['Year'] = subfolder

        # Column clean-up. Saved-index columns are removed by name ("Unnamed: ...")
        # rather than by position, so a base CSV written without an index does
        # not lose its first real column.
        merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
        merged_df = merged_df.loc[:, merged_df.columns.notna()]
        unnamed_mask = merged_df.columns.str.contains('^Unnamed:', na=False)
        merged_df = merged_df.drop(merged_df.columns[unnamed_mask], axis=1)

        dfs.append(merged_df.reset_index(drop=True))

    if not dfs:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            f"No season folder under {data_folder!r} contains {file_names[0]}"
        )

    # Stack all seasons and order them by season, then by rank.
    concatenated_df = pd.concat(dfs, ignore_index=True)
    concatenated_df = concatenated_df.sort_values(['Year', 'Rk']).reset_index(drop=True)
    return concatenated_df


def gather_years(year: int, previous_years: int) -> tuple[pd.DataFrame, pd.Series]:
    ''' Intended to return the season ending in `year` as a validation set and the
    seasons before it as training data.

    NOTE(review): this is an unfinished stub and does not produce that result yet:
      - `listdir` is not defined anywhere in the visible source (the local helper
        is `list_dir`; `os.listdir` would need the `os.` prefix).
      - the function calls itself with a single argument although it takes two,
        and that recursive call has no terminating condition.
      - the loop computes `cur_year` but never uses it.
      - `X` is assigned the `...` (Ellipsis) placeholder, not a DataFrame.
    How `X` and `y` should be built is not shown here -- TODO confirm with the author.
    '''
    year_string = year2szn(year)  # 2023 -> "2022-23"
    # FIXME: `listdir` is undefined in the visible source; see the docstring.
    csvs_val = listdir(f'./data/{year_string}')
    # FIXME: recursive self-call with one positional argument; see the docstring.
    y = gather_years(csvs_val)
    # range(1, previous_years) covers year-1 .. year-(previous_years-1).
    # NOTE(review): that is previous_years-1 seasons -- confirm this is intended.
    for i in range(1, previous_years):
        cur_year = year-i
        
    # Placeholder: the training frame has not been implemented.
    X = ...
    return X, y

In [75]:
# Build the combined per-team table from the season folders under ./data/
all_years = combine_csvs(data_folder='./data/')

In [76]:
# Show only the rows whose Year label is the 2022-23 season
all_years.loc[all_years['Year'].eq('2022-23')]

Unnamed: 0,Rk,Team,G,MP,FG,FGA,FG%,3P,3PA,3P%,...,2P_shooting_against,0-3_shooting_against,3-10_shooting_against,10-16_shooting_against,16-3P_shooting_against,3P_shooting_against,%FGA_shooting_against,Md._shooting_against,%3PA_shooting_against,3P%_shooting_against
2557,1.0,Sacramento Kings,82.0,19830.0,3573.0,7232.0,0.494,1128.0,3060.0,0.369,...,,,,,,,,,,
2558,2.0,Golden State Warriors,82.0,19830.0,3538.0,7393.0,0.479,1363.0,3540.0,0.385,...,,,,,,,,,,
2559,3.0,Atlanta Hawks,82.0,19855.0,3658.0,7574.0,0.483,882.0,2505.0,0.352,...,,,,,,,,,,
2560,4.0,Boston Celtics,82.0,19980.0,3460.0,7278.0,0.475,1315.0,3492.0,0.377,...,,,,,,,,,,
2561,5.0,Oklahoma City Thunder,82.0,19855.0,3533.0,7590.0,0.465,995.0,2797.0,0.356,...,,,,,,,,,,
2562,6.0,Los Angeles Lakers,82.0,19880.0,3516.0,7298.0,0.482,885.0,2558.0,0.346,...,,,,,,,,,,
2563,7.0,Utah Jazz,82.0,19805.0,3485.0,7365.0,0.473,1094.0,3099.0,0.353,...,0.623,0.213,0.259,0.09,0.061,0.377,0.064,418.0,0.209,0.401
2564,8.0,Milwaukee Bucks,82.0,19830.0,3504.0,7411.0,0.473,1217.0,3306.0,0.368,...,,,,,,,,,,
2565,9.0,Memphis Grizzlies,82.0,19780.0,3585.0,7551.0,0.475,985.0,2807.0,0.351,...,,,,,,,,,,
2566,10.0,Indiana Pacers,82.0,19755.0,3444.0,7345.0,0.469,1112.0,3030.0,0.367,...,0.627,0.285,0.196,0.093,0.053,0.373,0.073,459.0,0.272,0.402


In [50]:
# Print the column header of every CSV in one season folder so the schemas
# of the individual files can be compared.
season_dir = './data/2020-21/'
for csv_name in list_dir(season_dir):
    if not csv_name.endswith('.csv'):
        continue  # only CSV files can be parsed by pd.read_csv
    # nrows=0 parses just the header row, which is all this check needs
    header = pd.read_csv(os.path.join(season_dir, csv_name), nrows=0)
    print(header.columns, csv_name)

Index(['Unnamed: 0', 'Rk', 'Team', 'G', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
       '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object') szn_totals_against.csv
Index(['Unnamed: 0', 'Unnamed: 0_level_0', 'Unnamed: 1_level_0',
       'Unnamed: 2_level_0', 'Unnamed: 3_level_0', 'Unnamed: 4_level_0',
       'Unnamed: 5_level_0', 'Unnamed: 6_level_0', 'Unnamed: 7_level_0',
       'Unnamed: 8_level_0', 'Unnamed: 9_level_0', 'Unnamed: 10_level_0',
       'Unnamed: 11_level_0', 'Unnamed: 12_level_0', 'Unnamed: 13_level_0',
       'Unnamed: 14_level_0', 'Unnamed: 15_level_0', 'Unnamed: 16_level_0',
       'Unnamed: 17_level_0', 'Offense Four Factors', 'Offense Four Factors.1',
       'Offense Four Factors.2', 'Offense Four Factors.3',
       'Unnamed: 22_level_0', 'Defense Four Factors', 'Defense Four Factors.1',
       'Defense Four Factors.2', 'Defense Four Factors.3',
       'Unnamed: 27_level_0', 'Unname

In [53]:
def show_list(df) -> None:
    '''Print every column label of `df` next to its position in `df.columns`.

    One line per column, formatted as "<position> - <label>".
    '''
    for position, label in enumerate(df.columns):
        print(position, '-', label)

In [None]:
# `all_years` is a DataFrame, so calling it raises TypeError.
# NOTE(review): presumably the intent was to list its columns with the
# helper defined in the previous cell -- confirm.
show_list(all_years)

In [62]:
# Persist the combined table; the path is relative to the working directory
# and the row index is written as the first CSV column (pandas default).
all_years.to_csv(path_or_buf='./training.csv')

In [61]:
# Show the kernel's working directory; the relative './data/' and
# './training.csv' paths used in this notebook resolve against it.
# os.getcwd() works on every platform, unlike the `pwd` shell command.
os.getcwd()

/Users/alexbradshaw/Desktop/0DL/predict-perform
