In [4]:
import os
import pandas as pd

def load_and_save_mlb_data(data_directory):
    """Combine per-year MLB CSV files into one CSV file per category.

    Walks ``<data_directory>/<year>/<category>/`` for each of the five
    known categories, reads every ``*.csv`` file found there, and writes
    the row-wise concatenation of each category to
    ``<data_directory>/combined_data/<category>.csv``.

    Args:
        data_directory: Path of the root folder holding one sub-folder per
            year. The ``combined_data`` output folder is created inside it
            if it does not exist yet.

    Returns:
        None. The combined files are written to disk and the output folder
        is reported on stdout.
    """
    categories = (
        'team_pitching',
        'team_batting',
        'individual_pitching',
        'individual_batting',
        'mlb_games',
    )

    # Collect the frames first and concatenate once per category: calling
    # pd.concat inside the loop re-copies the accumulated rows for every file.
    frames = {category: [] for category in categories}

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the combined files reproducible across runs and machines.
    for year in sorted(os.listdir(data_directory)):
        year_path = os.path.join(data_directory, year)
        if not os.path.isdir(year_path):
            continue
        for category in categories:
            category_path = os.path.join(year_path, category)
            if not os.path.isdir(category_path):
                continue
            for csv_file in sorted(os.listdir(category_path)):
                csv_path = os.path.join(category_path, csv_file)
                if os.path.isfile(csv_path) and csv_path.endswith('.csv'):
                    frames[category].append(pd.read_csv(csv_path))

    # Create a subfolder for combined data
    combined_data_directory = os.path.join(data_directory, 'combined_data')
    os.makedirs(combined_data_directory, exist_ok=True)

    for category in categories:
        parts = frames[category]
        # pd.concat raises on an empty list, so fall back to an empty frame;
        # a category with no input files still gets its (empty) output file.
        combined = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
        combined.to_csv(
            os.path.join(combined_data_directory, f'{category}.csv'),
            index=False,
        )

    print(f"Combined data saved in {combined_data_directory}")

# Example usage: merge every yearly CSV under the local MLB data folder.
data_directory = 'C:/Users/jason/Projects/mlb_data'
load_and_save_mlb_data(data_directory=data_directory)


Combined data saved in C:/Users/jason/Projects/mlb_data\combined_data
