In [1]:
# Importing and reading in data (change to html later)

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import os
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

In [4]:
# Preprocessing (change to html later)

In [2]:
# Creating list of team names for looping through function
team_names = ['ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL','DET', 'HOU', 'KC', 'LAA', 'LAD', 'MIA', 
              'MIL', 'MIN', 'NYM', 'NYY', 'OAK', 'PHI', 'PIT', 'SD', 'SEA', 'SF', 'STL', 'TB', 'TEX', 'TOR', 'WAS']

In [3]:
def df_preprocessing(team_name) :
    
    batting_path = os.path.join('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team-player-data/Offense', f'{team_name}batting.csv')
    pitching_path = os.path.join('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team-player-data/Pitching', f'{team_name}pitching.csv')
    roster_path = os.path.join('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team-player-data/Current Rosters', f'{team_name}roster.csv')
    
    batting_df = pd.read_csv(batting_path)
    pitching_df = pd.read_csv(pitching_path)
    roster_df = pd.read_csv(roster_path) 
    
# Roster preprocessing
    roster_df = roster_df.drop(columns = ['Rk', 'Uni', 'Unnamed: 3', 'Unnamed: 4', 'OnActv', 'IL', 'DoB', '1stYr', 'Name-additional'])
    removable_substrings = ['*', '#', '?', ' (15-day IL)', ' (7-day IL)', ' (10-day IL)', ' (60-day IL)', ' (40-man)']
    name_pattern = '|'.join(map(re.escape, removable_substrings))
    roster_df['Name'] = roster_df['Name'].str.replace(name_pattern, '')
    roster_df.dropna(how = 'all', inplace = True)
    excluded_columns_roster = ['Name', 'B', 'T', 'Ht']
    for column in roster_df.columns :
        if column not in excluded_columns_roster :
            roster_df[column] = roster_df[column].astype(int)

# Batting preprocessing
    batting_df = batting_df.rename(columns = {'Pos' : 'Position'}).drop(columns = ['Rk', 'Name-additional'])
    removable_substrings = ['*', '#', '?', ' (15-day IL)', ' (10-day IL)', ' (60-day IL)', ' (40-man)', ' (7-day IL)', ' (DFA)']
    name_pattern = '|'.join(map(re.escape, removable_substrings))
    batting_df['Name'] = batting_df['Name'].str.replace(name_pattern, '')
    batting_df.dropna(how = 'all', inplace = True)
    excluded_columns_bat = ['Position', 'Name', 'BA', 'OBP', 'SLG', 'OPS']
    for column in batting_df.columns :
        if column not in excluded_columns_bat :
            batting_df[column] = pd.to_numeric(batting_df[column], errors = 'coerce')
            batting_df[column].fillna(0, inplace = True)
            batting_df[column] = batting_df[column].astype(int)
    batting_df = batting_df.drop(batting_df[batting_df['Position'] == 'P'].index)
    batting_df.insert(1, 'Team', team_name)

# Pitching preprocessing
    pitching_df = pitching_df.rename(columns = {'Pos' : 'Position'}).drop(columns = ['Rk', 'Name-additional'])
    removable_substrings = ['*', '#', '?', ' (15-day IL)', ' (7-day IL)', ' (10-day IL)', ' (60-day IL)', ' (40-man)', ' (DFA)']
    name_pattern = '|'.join(map(re.escape, removable_substrings))
    pitching_df['Name'] = pitching_df['Name'].str.replace(name_pattern, '')
    pitching_df.dropna(how = 'all', inplace = True)
    pitching_df.dropna(subset = ['W-L%', 'SO/W'], inplace = True)
    pitching_df['Position'].fillna('P', inplace = True)
    excluded_columns_pitch = ['Position', 'Name', 'IP', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9']
    for column in pitching_df.columns :
        if column not in excluded_columns_pitch :
            pitching_df[column] = pitching_df[column].astype(int)
    pitching_df.insert(1, 'Team', team_name)
    
    batting_df[batting_df['Name'].isin(roster_df['Name'])].reset_index(drop = True, inplace = True)
    pitching_df[pitching_df['Name'].isin(roster_df['Name'])].reset_index(drop = True, inplace = True)
    
# Saving updated dataframes

    filtered_batting_path = os.path.join('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team-player-data/Filtered_Offense', f'{team_name}batting_filtered.csv')
    filtered_pitching_path = os.path.join('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team-player-data/Filtered_Pitching', f'{team_name}pitching_filtered.csv')
    batting_df.to_csv(filtered_batting_path, index = False)
    pitching_df.to_csv(filtered_pitching_path, index = False)

In [4]:
if __name__ == '__main__' :
    num_threads = 30

    with ThreadPoolExecutor(max_workers = num_threads) as executor :
# Use the map function to apply the df_preprocessing function to each team concurrently
        try :
            executor.map(df_preprocessing, team_names)
        except Exception as e :
            print(f"Error occurred: {e}")

# Merging the filtered data for each team into a single DataFrame if needed
    batting_dfs = []
    for team_name in team_names:
        filtered_batting_path = f'/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team-player-data/Filtered_Offense/{team_name}batting_filtered.csv'
        df = pd.read_csv(filtered_batting_path)
        batting_dfs.append(df)
    batting_df = pd.concat(batting_dfs)
    
    pitching_dfs = []
    for team_name in team_names:
        filtered_pitching_path = f'/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team-player-data/Filtered_Pitching/{team_name}pitching_filtered.csv'
        df = pd.read_csv(filtered_pitching_path)
        pitching_dfs.append(df)
    pitching_df = pd.concat(pitching_dfs)

  roster_df['Name'] = roster_df['Name'].str.replace(name_pattern, '')
  roster_df['Name'] = roster_df['Name'].str.replace(name_pattern, '')
  batting_df['Name'] = batting_df['Name'].str.replace(name_pattern, '')
  batting_df['Name'] = batting_df['Name'].str.replace(name_pattern, '')
  roster_df['Name'] = roster_df['Name'].str.replace(name_pattern, '')
  batting_df['Name'] = batting_df['Name'].str.replace(name_pattern, '')
  roster_df['Name'] = roster_df['Name'].str.replace(name_pattern, '')
  roster_df['Name'] = roster_df['Name'].str.replace(name_pattern, '')
  roster_df['Name'] = roster_df['Name'].str.replace(name_pattern, '')
  roster_df['Name'] = roster_df['Name'].str.replace(name_pattern, '')
  batting_df['Name'] = batting_df['Name'].str.replace(name_pattern, '')
  batting_df['Name'] = batting_df['Name'].str.replace(name_pattern, '')
  batting_df['Name'] = batting_df['Name'].str.replace(name_pattern, '')
  batting_df['Name'] = batting_df['Name'].str.replace(name_pattern, '')
  rost

In [6]:
# Quick reset of indices for concatenated dataframes
batting_df.reset_index(drop = True, inplace = True)
pitching_df.reset_index(drop = True, inplace = True)

In [9]:
batting_df

Unnamed: 0,Position,Team,Name,Age,G,PA,AB,R,H,2B,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
0,C,ARI,Gabriel Moreno,23,72,240,218,17,60,11,...,0.321,0.367,0.688,90,80,9,1,0,5,0
1,1B,ARI,Christian Walker,32,93,397,350,52,92,28,...,0.338,0.514,0.853,132,180,9,3,0,4,2
2,2B,ARI,Ketel Marte,29,90,394,348,65,98,18,...,0.358,0.491,0.849,132,171,11,5,0,3,1
3,SS,ARI,Geraldo Perdomo,23,79,294,241,41,66,15,...,0.381,0.407,0.788,119,98,1,3,8,2,1
4,3B,ARI,Josh Rojas,29,57,210,183,23,43,13,...,0.301,0.306,0.607,69,56,2,1,2,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,C,WAS,Riley Adams,27,23,87,79,4,22,5,...,0.337,0.506,0.844,134,40,4,1,1,0,0
592,IF,WAS,Michael Chavis,27,26,54,51,8,13,1,...,0.296,0.333,0.630,78,17,1,0,0,0,0
593,CF,WAS,Derek Hill,27,13,50,47,3,8,1,...,0.220,0.191,0.411,18,9,0,0,0,0,0
594,LF,WAS,Jake Alu,26,5,15,14,0,3,0,...,0.267,0.214,0.481,38,3,0,0,0,0,0
