In [197]:
import numpy as np
import os
import pandas as pd
import re

In [214]:
def list_dir(path):
    ''' List the entries of `path`, leaving out names that begin with a dot
        (hidden files such as .DS_Store). Order is whatever os.listdir returns.'''
    visible = []
    for entry in os.listdir(path):
        if entry.startswith('.'):
            continue
        visible.append(entry)
    return visible


def year2szn(year):
    ''' Turn a season's end year into a season label, e.g. 2023 -> "2022-23".'''
    start_year = str(year - 1)
    end_suffix = str(year)[-2:]  # last two digits of the end year
    return start_year + "-" + end_suffix


def combine_csvs(data_folder):
    ''' Merge the per-season team CSVs under `data_folder` into one DataFrame.

        Every non-hidden subfolder of `data_folder` that contains
        szn_totals.csv contributes its rows. That file is the base table;
        each of the files below is outer-merged onto it by 'Team' when it
        exists in the same subfolder:
        - advanced.csv
        - shooting.csv
        - szn_per100poss.csv
        - standings.csv
        - szn_totals_against.csv

        Columns of a merged file whose names clash with columns already
        present get a '_<file name without extension>' suffix.

        Returns the rows of all subfolders concatenated, with a 'Year'
        column holding the subfolder name, sorted by ['Year', 'Rk'].

        Raises ValueError when no subfolder contains szn_totals.csv.'''
    file_names = ['szn_totals.csv', 'advanced.csv', 'shooting.csv', 'szn_per100poss.csv', 'standings.csv', 'szn_totals_against.csv']
    merge_column = 'Team'
    dfs = []

    for subfolder in list_dir(data_folder):
        folderpath = os.path.join(data_folder, subfolder)
        if not os.path.isdir(folderpath):
            continue

        # Subfolders without the base file are skipped entirely.
        csv_path = os.path.join(folderpath, file_names[0])
        if not os.path.exists(csv_path):
            continue
        merged_df = pd.read_csv(csv_path)

        for file_name in file_names[1:]:
            current_csv_path = os.path.join(folderpath, file_name)
            if not os.path.exists(current_csv_path):
                continue
            current_df = pd.read_csv(current_csv_path)

            # Some files carry their real header in the first data row (the
            # merge column name shows up among its values); promote that row
            # to column labels. The emptiness check keeps a header-only CSV
            # from raising IndexError on iloc[0].
            if not current_df.empty and merge_column in current_df.iloc[0].values:
                current_df.columns = current_df.iloc[0]
                current_df = current_df.drop(current_df.index[0])

            suffix = '_' + os.path.splitext(file_name)[0]
            merged_df = pd.merge(merged_df, current_df, on=merge_column, how='outer', suffixes=('', suffix))

        # Tag every row with the subfolder it came from.
        merged_df['Year'] = subfolder

        # Drop the first column of the merged frame
        # (presumably a saved index column -- TODO confirm against the CSVs).
        merged_df = merged_df.drop(merged_df.columns[0], axis=1)
        # Keep the first occurrence of each column label and discard NaN labels,
        # both of which can appear after promoting a data row to the header.
        merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
        merged_df = merged_df.loc[:, merged_df.columns.notna()]
        # na=False: a non-string label counts as "not Unnamed" instead of
        # putting NaN into the boolean mask.
        unnamed_mask = merged_df.columns.str.contains('^Unnamed:', na=False)
        merged_df = merged_df.drop(merged_df.columns[unnamed_mask], axis=1)

        dfs.append(merged_df.reset_index(drop=True))

    if not dfs:
        # pd.concat([]) would raise a ValueError with an opaque message;
        # raise the same exception type but say what was being looked for.
        raise ValueError(f"No subfolder of {data_folder!r} contains {file_names[0]!r}")

    concatenated_df = pd.concat(dfs, ignore_index=True)
    concatenated_df = concatenated_df.sort_values(['Year', 'Rk']).reset_index(drop=True)
    return concatenated_df


def gather_years(year: int, previous_years: int) -> tuple[pd.DataFrame, pd.Series]:
    ''' Intended to use the season ending in `year` as the validation set and
    the preceding seasons as training data.

    NOTE(review): unfinished stub -- it cannot run as written; see the inline
    FIXME/NOTE comments.

    year: season end year; converted to a season label with year2szn.
    previous_years: presumably the number of earlier seasons to train on --
        TODO confirm.
    '''
    year_string = year2szn(year)  # 2023 -> "2022-23"
    # FIXME: `listdir` is neither defined nor imported in the cells shown;
    # `list_dir` (defined above) or `os.listdir` is presumably what was meant.
    csvs_val = listdir(f'./data/{year_string}')
    # FIXME: this calls gather_years itself with a single argument (a list of
    # names), although the signature requires `year` and `previous_years`.
    y = gather_years(csvs_val)
    # NOTE(review): range(1, previous_years) yields previous_years - 1 values,
    # and cur_year is computed but never used.
    for i in range(1, previous_years):
        cur_year = year-i
        
    # Placeholder: X is the Ellipsis object here, not a DataFrame.
    X = ...
    return X, y

In [215]:
# Merge the per-season CSVs found in the subfolders of ./data/ into one DataFrame.
all_years = combine_csvs('./data/')

In [216]:
# Preview the first rows of the combined table.
all_years.head()

Unnamed: 0,Rk,Team,G,MP,FG,FGA,FG%,3P,3PA,3P%,...,0-3,3-10,10-16,16-3P,3P_shooting,%FGA,Md.,%3PA,3P%_shooting,Att.
0,1.0,Boston Celtics*,69.0,,2065.0,5607.0,0.368,,,,...,,,,,,,,,,
1,2.0,Rochester Royals*,68.0,,2032.0,5377.0,0.378,,,,...,,,,,,,,,,
2,3.0,Tri-Cities Blackhawks,68.0,,1988.0,6041.0,0.329,,,,...,,,,,,,,,,
3,4.0,Fort Wayne Pistons*,68.0,,2002.0,5927.0,0.338,,,,...,,,,,,,,,,
4,5.0,Syracuse Nationals*,66.0,,1884.0,5365.0,0.351,,,,...,,,,,,,,,,
