In [1]:
import pandas as pd
import unicodedata
import re
from difflib import get_close_matches

In [2]:
stats_2025=pd.read_csv('../batter_record_1_hit/stats/batter_stats_2025.csv', encoding='utf-8', 
            encoding_errors='replace')  # replaces invalid bytes with �


In [3]:
stats_2024 = pd.read_csv('../batter_record_1_hit/stats/batter_stats_2024.csv')
# stats_2025 = pd.read_csv('../batter_record_1_hit/stats/batter_stats_2025.csv')
def_rank_2024 = pd.read_csv('../batter_rbi_OU/def_rank_era_pitchers_2024.csv')
def_rank_2025=pd.read_csv('../scrapers/def_scraper/batter_runs_def_rank.csv')
batter_team_2024 = pd.read_csv('../batter_record_1_hit/stats/batters_team_2024.csv')
batter_team_2025 = pd.read_csv('../batter_record_1_hit/stats/batters_team_2025.csv')

In [4]:
team_mapping_5 = {
    'CHC': 'CHC',
    'LAD': 'LAD',
    'ARI': 'ARI',
    'PIT': 'PIT',
    'MIA': 'MIA',
    'TOR': 'TOR',
    'BOS': 'BOS',
    'ATH': 'ATH',
    'BAL': 'BAL',
    'SEA': 'SEA',
    'SDP': 'SDP',
    'ATL': 'ATL',
    'PHI': 'PHI',
    'TEX': 'TEX',
    'MIL': 'MIL',
    'WSN': 'WSH',
    'NYY': 'NYY',
    'STL': 'STL',
    'TBR': 'TBR',
    'DET': 'DET',
    'KCR': 'KCR',
    'CLE': 'CLE',
    'COL': 'COL',
    'NYM': 'NYM',
    'CIN': 'CIN',
    'MIN': 'MIN',
    'SFG': 'SFG',
    'HOU': 'HOU',
    'CHW': 'CHW',
    'LAAs': 'LAA'
}

batter_team_2025['team'] = batter_team_2025['team'].map(team_mapping_5)
batter_team_2024['team'] = batter_team_2024['team'].map(team_mapping_5)

In [5]:
def_rank_2025.rename(columns={'Team': 'team'}, inplace=True)

In [6]:
#dropping first half of the 2024 season
stats_2024 = stats_2024.iloc[len(stats_2024)//2:]

In [7]:
#fixing batter_team_2024
batter_team_2024['team'] = batter_team_2024['team'].replace({'WSN': 'WSH'})
batter_team_2025['team'] = batter_team_2025['team'].replace({'WSN': 'WSH'})

In [8]:
stats_2024=stats_2024[['away_hitters','away_R','home_hitters','home_R']]
stats_2025=stats_2025[['away_hitters','away_R','home_hitters','home_R']]

In [9]:
#normalizing the names, getting rid of accents
def normalize_name(name):
    if not isinstance(name, str):
        return name
    return unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8')

In [10]:
batter_team_2024['player'] = batter_team_2024['player'].apply(normalize_name)
batter_team_2025['player'] = batter_team_2025['player'].apply(normalize_name)

In [11]:
def clean_name_symbols(name):
    if not isinstance(name, str):
        return name
    return name.replace('*', '').replace('#', '')

In [12]:
#applying it to the player collumn
batter_team_2024['player'] = batter_team_2024['player'].apply(clean_name_symbols)
batter_team_2025['player'] = batter_team_2025['player'].apply(clean_name_symbols)

In [None]:
multi_team_flags = ['2TM', '3TM', '4TM', '5TM']
filtered_df = batter_team_2024[~batter_team_2024['team'].isin(multi_team_flags)].copy()
filtered_df_2 = batter_team_2025[~batter_team_2025['team'].isin(multi_team_flags)].copy()


# Step 2: Keep only the LAST team listed for each player (i.e., current team)
batter_team_2024 = filtered_df.drop_duplicates(subset='player', keep='last').reset_index(drop=True)
batter_team_2025 = filtered_df_2.drop_duplicates(subset='player', keep='last').reset_index(drop=True)

In [15]:
#matching the batter_team_2024 with the broken names in stats_2024 
# Step 1: Safely extract short names like "P. Sandoval"
def clean_name(raw):
    if not isinstance(raw, str):
        return None
    match = re.match(r"([A-Z]\.\s[A-Za-z'-]+)", raw)
    return match.group(1) if match else None

stats_2024['short_name'] = stats_2024['away_hitters'].apply(clean_name)

# Step 2: Match short name to full name in pitchers_team
def match_full_name(short_name, full_names):
    if not isinstance(short_name, str):
        return None

    last_name = short_name.split()[-1]
    full_names = [name for name in full_names if isinstance(name, str)]

    # Try to match by last name + same first initial
    possible_matches = [name for name in full_names if last_name in name]
    for name in possible_matches:
        if name[0] == short_name[0]:  # match first initial
            return name

    # Fallback to fuzzy match if nothing matched
    close = get_close_matches(short_name, full_names, n=1, cutoff=0.5)
    return close[0] if close else None

# Step 3: Apply the matching
full_name_list = batter_team_2024['player'].tolist()
stats_2024['matched_name'] = stats_2024['short_name'].apply(lambda x: match_full_name(x, full_name_list))

# Step 4: Merge the DataFrames on the matched name
merged_df = stats_2024.merge(batter_team_2024, left_on='matched_name', right_on='player', how='left')

In [16]:
#dropping the collumns we used to match to then match the other column
merged_df = merged_df.drop(columns=['short_name', 'matched_name'])

In [17]:
#matching the batter_team_2025 with the broken names in stats_2025 
# Step 1: Safely extract short names like "P. Sandoval"
def clean_name(raw):
    if not isinstance(raw, str):
        return None
    match = re.match(r"([A-Z]\.\s[A-Za-z'-]+)", raw)
    return match.group(1) if match else None

stats_2025['short_name'] = stats_2025['away_hitters'].apply(clean_name)

# Step 2: Match short name to full name in pitchers_team
def match_full_name(short_name, full_names):
    if not isinstance(short_name, str):
        return None

    last_name = short_name.split()[-1]
    full_names = [name for name in full_names if isinstance(name, str)]

    # Try to match by last name + same first initial
    possible_matches = [name for name in full_names if last_name in name]
    for name in possible_matches:
        if name[0] == short_name[0]:  # match first initial
            return name

    # Fallback to fuzzy match if nothing matched
    close = get_close_matches(short_name, full_names, n=1, cutoff=0.5)
    return close[0] if close else None

# Step 3: Apply the matching
full_name_list = batter_team_2025['player'].tolist()
stats_2025['matched_name'] = stats_2025['short_name'].apply(lambda x: match_full_name(x, full_name_list))

# Step 4: Merge the DataFrames on the matched name
merged_df_5 = stats_2025.merge(batter_team_2025, left_on='matched_name', right_on='player', how='left')

In [18]:
#dropping the collumns we used to match to then match the other column
merged_df_5 = merged_df_5.drop(columns=['short_name', 'matched_name'])

In [19]:
#now we are doing the same process except on the home players this time
# Step 1: Safely extract short names like "P. Sandoval"
def clean_name(raw):
    if not isinstance(raw, str):
        return None
    match = re.match(r"([A-Z]\.\s[A-Za-z'-]+)", raw)
    return match.group(1) if match else None

merged_df['short_name'] = merged_df['home_hitters'].apply(clean_name)

# Step 2: Match short name to full name in pitchers_team
def match_full_name(short_name, full_names):
    if not isinstance(short_name, str):
        return None

    last_name = short_name.split()[-1]
    full_names = [name for name in full_names if isinstance(name, str)]

    # Try to match by last name + same first initial
    possible_matches = [name for name in full_names if last_name in name]
    for name in possible_matches:
        if name[0] == short_name[0]:  # match first initial
            return name

    # Fallback to fuzzy match if nothing matched
    close = get_close_matches(short_name, full_names, n=1, cutoff=0.5)
    return close[0] if close else None

# Step 3: Apply the matching
full_name_list = batter_team_2024['player'].tolist()
merged_df['matched_name'] = merged_df['short_name'].apply(lambda x: match_full_name(x, full_name_list))

# Step 4: Merge the DataFrames on the matched name
merged_df_2 = merged_df.merge(batter_team_2024, left_on='matched_name', right_on='player', how='left')

In [20]:
#now we are doing the same process except on the home players this time
# Step 1: Safely extract short names like "P. Sandoval"
def clean_name(raw):
    if not isinstance(raw, str):
        return None
    match = re.match(r"([A-Z]\.\s[A-Za-z'-]+)", raw)
    return match.group(1) if match else None

merged_df_5['short_name'] = merged_df_5['home_hitters'].apply(clean_name)

# Step 2: Match short name to full name in pitchers_team
def match_full_name(short_name, full_names):
    if not isinstance(short_name, str):
        return None

    last_name = short_name.split()[-1]
    full_names = [name for name in full_names if isinstance(name, str)]

    # Try to match by last name + same first initial
    possible_matches = [name for name in full_names if last_name in name]
    for name in possible_matches:
        if name[0] == short_name[0]:  # match first initial
            return name

    # Fallback to fuzzy match if nothing matched
    close = get_close_matches(short_name, full_names, n=1, cutoff=0.5)
    return close[0] if close else None

# Step 3: Apply the matching
full_name_list = batter_team_2025['player'].tolist()
merged_df_5['matched_name'] = merged_df_5['short_name'].apply(lambda x: match_full_name(x, full_name_list))

# Step 4: Merge the DataFrames on the matched name
merged_df_6 = merged_df_5.merge(batter_team_2025, left_on='matched_name', right_on='player', how='left')

In [21]:
#dropping a lot of uneeded columns for simplicity
merged_df_2=merged_df_2[['player_x','team_x','away_R','player_y','team_y','home_R']]

In [22]:
merged_df_7=merged_df_6[['player_x','team_x','away_R','player_y','team_y','home_R']]

In [23]:
# Rename the column
def_rank_2024.rename(columns={'Rank': 'def_rank'}, inplace=True)

In [24]:
def_rank_2025.rename(columns={'Rank': 'def_rank'}, inplace=True)

In [25]:
team_mapping = {
    'Milwaukee Brewers': 'MIL',
    'Minnesota Twins': 'MIN',
    'Chicago Cubs': 'CHC',
    'Colorado Rockies': 'COL',
    'Washington Nationals': 'WSH',
    'Toronto Blue Jays': 'TOR',
    'Atlanta Braves': 'ATL',
    'Baltimore Orioles': 'BAL',
    'New York Yankees': 'NYY',
    'Philadelphia Phillies': 'PHI',
    'Pittsburgh Pirates': 'PIT',
    'Kansas City Royals': 'KCR',
    'Arizona Diamondbacks': 'ARI',
    'Boston Red Sox': 'BOS',
    'Detroit Tigers': 'DET',
    'Cleveland Guardians': 'CLE',
    'Tampa Bay Rays': 'TBR',
    'San Francisco Giants': 'SFG',
    'Los Angeles Dodgers': 'LAD',
    'Seattle Mariners': 'SEA',
    'Cincinnati Reds': 'CIN',
    'Texas Rangers': 'TEX',
    'Chicago White Sox': 'CHW',
    'Los Angeles Angels': 'LAA',
    'Miami Marlins': 'MIA',
    'New York Mets': 'NYM',
    'Athletics': 'OAK',
    'St. Louis Cardinals': 'STL',
    'San Diego Padres': 'SDP',
    'Houston Astros': 'HOU'
}

def_rank_2024['team'] = def_rank_2024['team'].map(team_mapping)
def_rank_2025['team'] = def_rank_2025['team'].map(team_mapping)

In [26]:
#making both types strings
merged_df_2.loc[:, 'team_x'] = merged_df_2['team_x'].astype(str)
def_rank_2024.loc[:, 'def_rank'] = def_rank_2024['def_rank'].astype(str)
def_rank_2025.loc[:, 'def_rank'] = def_rank_2025['def_rank'].astype(str)
merged_df_7.loc[:, 'team_x'] = merged_df_7['team_x'].astype(str)

 '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  def_rank_2024.loc[:, 'def_rank'] = def_rank_2024['def_rank'].astype(str)
 '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  def_rank_2025.loc[:, 'def_rank'] = def_rank_2025['def_rank'].astype(str)


In [27]:
#merge df_rank_2024 into merged_df_2 to bring in the def_rank 
merged_df_2 = merged_df_2.merge(
    def_rank_2024,
    how='left',
    left_on='team_x',
    right_on='team'
)
merged_df_2 = merged_df_2.drop(columns=['team'])

In [28]:
#merge df_rank_2025 into merged_df_7 to bring in the def_rank 
merged_df_7 = merged_df_7.merge(
    def_rank_2025,
    how='left',
    left_on='team_x',
    right_on='team'
)
merged_df_7 = merged_df_7.drop(columns=['team'])

In [29]:
#merging for home players def rank
merged_df_3 = merged_df_2.merge(
    def_rank_2024,
    how='left',
    left_on='team_y',
    right_on='team'
)
merged_df_3 = merged_df_3.drop(columns=['team'])

In [30]:
#merging for home players def rank
merged_df_8 = merged_df_7.merge(
    def_rank_2025,
    how='left',
    left_on='team_y',
    right_on='team'
)
merged_df_8 = merged_df_8.drop(columns=['team'])

In [31]:
#dropping uneeded created collumns from merging and making new df so we arent using slices of a df
df=merged_df_3[['player_x','team_x','away_R','def_rank_y']]
df2=merged_df_3[['player_y','team_y','home_R','def_rank_x']]
#2025
df3=merged_df_8[['player_x','team_x','away_R','def_rank_y']]
df4=merged_df_8[['player_y','team_y','home_R','def_rank_x']]

In [32]:
#dropping all the nan corows
df.dropna()
df2.dropna()
df3.dropna()
df4.dropna()

Unnamed: 0,player_y,team_y,home_R,def_rank_x
0,Austin Wells,NYY,1.0,21
1,Aaron Judge,NYY,0.0,21
2,Cody Bellinger,NYY,0.0,15
3,Paul Goldschmidt,NYY,0.0,21
4,Jazz Chisholm Jr.,NYY,0.0,21
...,...,...,...,...
4917,Jose Ramirez,CLE,0.0,13
4918,Carlos Santana,CLE,0.0,13
4919,Kyle Manzardo,CLE,0.0,13
4920,Bo Naylor,CLE,0.0,13


In [33]:
#renaming the columns of the away players 
df = df.rename(columns={
    'player_x': 'player',
    'team_x': 'team',
    'away_R': 'runs',
    'def_rank_y': 'def_rank'
})

In [34]:
#renaming the columns of the away players 
df3 = df3.rename(columns={
    'player_x': 'player',
    'team_x': 'team',
    'away_R': 'runs',
    'def_rank_y': 'def_rank'
})

In [35]:
#renaming the columns of the home players 

df2 = df2.rename(columns={
    'player_y': 'player',
    'team_y': 'team',
    'home_R': 'runs',
    'def_rank_x': 'def_rank'
})


In [36]:
#renaming the columns of the home players 

df4 = df4.rename(columns={
    'player_y': 'player',
    'team_y': 'team',
    'home_R': 'runs',
    'def_rank_x': 'def_rank'
})

In [37]:
#converting types to do math on it
# Convert 'hits' to numeric (errors='coerce' turns bad values into NaN)
df['runs'] = pd.to_numeric(df['runs'], errors='coerce')
df2['runs'] = pd.to_numeric(df2['runs'], errors='coerce')
df3['runs'] = pd.to_numeric(df3['runs'], errors='coerce')
df4['runs'] = pd.to_numeric(df4['runs'], errors='coerce')

In [38]:
#putting 1s in to indicate if they got 1 or 2 hits based off the column
df['1_runs_count'] = (df['runs'] > 0).astype(int)
df['2_runs_count'] = (df['runs'] > 1.5).astype(int)
df['3_runs_count'] = (df['runs'] > 2.5).astype(int)
df['4_runs_count'] = (df['runs'] > 3.5).astype(int)


df2['1_runs_count'] = (df2['runs'] > 0).astype(int)
df2['2_runs_count'] = (df2['runs'] > 1.5).astype(int)
df2['3_runs_count'] = (df2['runs'] > 2.5).astype(int)
df2['4_runs_count'] = (df2['runs'] > 3.5).astype(int)


#2025
df3['1_runs_count'] = (df3['runs'] > 0).astype(int)
df3['2_runs_count'] = (df3['runs'] > 1.5).astype(int)
df3['3_runs_count'] = (df3['runs'] > 2.5).astype(int)
df3['4_runs_count'] = (df3['runs'] > 3.5).astype(int)


df4['1_runs_count'] = (df4['runs'] > 0).astype(int)
df4['2_runs_count'] = (df4['runs'] > 1.5).astype(int)
df4['3_runs_count'] = (df4['runs'] > 2.5).astype(int)
df4['4_runs_count'] = (df4['runs'] > 3.5).astype(int)

In [39]:
df.dropna()
df2.dropna()
df3.dropna()
df4.dropna()

Unnamed: 0,player,team,runs,def_rank,1_runs_count,2_runs_count,3_runs_count,4_runs_count
0,Austin Wells,NYY,1.0,21,1,0,0,0
1,Aaron Judge,NYY,0.0,21,0,0,0,0
2,Cody Bellinger,NYY,0.0,15,0,0,0,0
3,Paul Goldschmidt,NYY,0.0,21,0,0,0,0
4,Jazz Chisholm Jr.,NYY,0.0,21,0,0,0,0
...,...,...,...,...,...,...,...,...
4917,Jose Ramirez,CLE,0.0,13,0,0,0,0
4918,Carlos Santana,CLE,0.0,13,0,0,0,0
4919,Kyle Manzardo,CLE,0.0,13,0,0,0,0
4920,Bo Naylor,CLE,0.0,13,0,0,0,0


In [40]:
#merging one row from df and then the next row from df2 and back and fourth
# Make sure both DataFrames have the same columns
df = df.reset_index(drop=True)
df2 = df2.reset_index(drop=True)
df3 = df3.reset_index(drop=True)
df4 = df4.reset_index(drop=True)
# Stack them row by row
interleaved = pd.concat([df, df2]).sort_index(kind='merge').reset_index(drop=True)

In [41]:
interleaved_2 = pd.concat([df3, df4]).sort_index(kind='merge').reset_index(drop=True)

In [42]:
interleaved_2.dropna()

Unnamed: 0,player,team,runs,def_rank,1_runs_count,2_runs_count,3_runs_count,4_runs_count
0,Jackson Chourio,MIL,0.0,10,0,0,0,0
1,Austin Wells,NYY,1.0,21,1,0,0,0
2,Christian Yelich,MIL,0.0,10,0,0,0,0
3,Aaron Judge,NYY,0.0,21,0,0,0,0
4,Willson Contreras,STL,0.0,10,0,0,0,0
...,...,...,...,...,...,...,...,...
9839,Kyle Manzardo,CLE,0.0,13,0,0,0,0
9840,Max Kepler,PHI,0.0,22,0,0,0,0
9841,Bo Naylor,CLE,0.0,13,0,0,0,0
9844,Alec Bohm,PHI,0.0,22,0,0,0,0


In [43]:
interleaved.dropna()

Unnamed: 0,player,team,runs,def_rank,1_runs_count,2_runs_count,3_runs_count,4_runs_count
0,Shea Langeliers,,0.0,27,0,0,0,0
2,Tyler Soderstrom,,0.0,27,0,0,0,0
4,Daz Cameron,,0.0,25,0,0,0,0
6,Armando Alvarez,,0.0,27,0,0,0,0
8,Zack Gelof,,0.0,27,0,0,0,0
...,...,...,...,...,...,...,...,...
34349,Drew Millas,WSH,0.0,11,0,0,0,0
34350,Edmundo Sosa,PHI,1.0,23,1,0,0,0
34351,Jacob Young,WSH,1.0,11,1,0,0,0
34352,Garrett Stubbs,PHI,1.0,23,1,0,0,0


In [44]:
#making def_rank column numeric to use to split data
interleaved['def_rank'] = pd.to_numeric(interleaved['def_rank'], errors='coerce')
interleaved_2['def_rank'] = pd.to_numeric(interleaved_2['def_rank'], errors='coerce')

In [45]:
#unders are only against top 15 teams and overs are only against bot 15 teams, so we split then to get the overs and unders df
unders = interleaved[interleaved['def_rank'] < 16].reset_index(drop=True)
overs = interleaved[interleaved['def_rank'] > 15].reset_index(drop=True)
#2025
unders_2025 = interleaved_2[interleaved_2['def_rank'] < 16].reset_index(drop=True)
overs_2025 = interleaved_2[interleaved_2['def_rank'] > 15].reset_index(drop=True)

In [46]:
#dropping nans
unders.dropna()
overs.dropna()
unders_2025.dropna()
overs_2025.dropna()

Unnamed: 0,player,team,runs,def_rank,1_runs_count,2_runs_count,3_runs_count,4_runs_count
0,Austin Wells,NYY,1.0,21.0,1,0,0,0
1,Aaron Judge,NYY,0.0,21.0,0,0,0,0
2,Paul Goldschmidt,NYY,0.0,21.0,0,0,0,0
3,Jazz Chisholm Jr.,NYY,0.0,21.0,0,0,0,0
4,Jasson Dominguez,NYY,0.0,21.0,0,0,0,0
...,...,...,...,...,...,...,...,...
3113,Bryce Harper,PHI,0.0,22.0,0,0,0,0
3114,Kyle Schwarber,PHI,2.0,22.0,1,1,0,0
3115,Nick Castellanos,PHI,0.0,22.0,0,0,0,0
3116,Max Kepler,PHI,0.0,22.0,0,0,0,0


In [47]:
#stacking overs on top of overs_2025 to make sure they are in order to count freq
stacked_overs = pd.concat([overs, overs_2025], ignore_index=True)

In [48]:
#stacking unders on top of unders_2025 to make sure they are in order to count freq
stacked_unders = pd.concat([unders, unders_2025], ignore_index=True)

In [49]:
#making games collumn
stacked_overs['games_played'] = 1
stacked_unders['games_played'] = 1

In [50]:
stacked_overs.dropna()
stacked_unders.dropna()

Unnamed: 0,player,team,runs,def_rank,1_runs_count,2_runs_count,3_runs_count,4_runs_count,games_played
0,Andrew McCutchen,PIT,0.0,2.0,0,0,0,0,1
1,Bryan Reynolds,PIT,0.0,2.0,0,0,0,0,1
2,Connor Joe,PIT,0.0,2.0,0,0,0,0,1
4,Edward Olivares,PIT,0.0,2.0,0,0,0,0,1
5,Nick Gonzales,PIT,0.0,2.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
15929,Jose Ramirez,CLE,0.0,13.0,0,0,0,0,1
15930,Carlos Santana,CLE,0.0,13.0,0,0,0,0,1
15931,Kyle Manzardo,CLE,0.0,13.0,0,0,0,0,1
15932,Bo Naylor,CLE,0.0,13.0,0,0,0,0,1


In [51]:
#making a new df that sums the 1_hit_count and 2_hit_count and games_played and then creates the season frequency for overs
overs_freq_2 = stacked_overs.groupby('player', as_index=False).agg({
    '1_runs_count': 'sum',
    '2_runs_count': 'sum',
    '3_runs_count': 'sum',
    '4_runs_count': 'sum',
    'games_played': 'sum'
})

# Add a column for hit rate aka freq
overs_freq_2['1_runs_rate'] = overs_freq_2['1_runs_count'] / overs_freq_2['games_played']
overs_freq_2['2_runs_rate'] = overs_freq_2['2_runs_count'] / overs_freq_2['games_played']
overs_freq_2['3_runs_rate'] = overs_freq_2['3_runs_count'] / overs_freq_2['games_played']
overs_freq_2['4_runs_rate'] = overs_freq_2['4_runs_count'] / overs_freq_2['games_played']

In [52]:
#dropping all players that do not have at least 15 games played in this category
overs_freq_2 = overs_freq_2[overs_freq_2['games_played'] > 14].reset_index(drop=True)

In [53]:
#making a new df that sums the 1_hit_count and 2_hit_count and games_played and then creates the season frequency for unders 
unders_freq_2 = stacked_unders.groupby('player', as_index=False).agg({
    '1_runs_count': 'sum',
    '2_runs_count': 'sum',
    '3_runs_count': 'sum',
    '4_runs_count': 'sum',
    'games_played': 'sum'
})

# Add a column for hit rate
unders_freq_2['1_runs_rate'] = unders_freq_2['1_runs_count'] / unders_freq_2['games_played']
unders_freq_2['2_runs_rate'] = unders_freq_2['2_runs_count'] / unders_freq_2['games_played']
unders_freq_2['3_runs_rate'] = unders_freq_2['3_runs_count'] / unders_freq_2['games_played']
unders_freq_2['4_runs_rate'] = unders_freq_2['4_runs_count'] / unders_freq_2['games_played']


In [54]:
#dropping all players that do not have at least 10 games played in this category
unders_freq_2 = unders_freq_2[unders_freq_2['games_played'] > 14].reset_index(drop=True)

In [55]:
#checking if distrobition is correct, merging them on name to make sure they average less hits against better pitchers and vise versa
final_freq4 = overs_freq_2.merge(
    unders_freq_2,
    on='player',
    how='inner',
    suffixes=('_over', '_under')
)

In [56]:
#dropping all rows that do not meet the creteria: avg less hits against better teams, and more against worse teams
final_freq4 = final_freq4[final_freq4['1_runs_rate_under'] <= final_freq4['1_runs_rate_over']].reset_index(drop=True)

In [57]:
#making the final dataframes before sending them to the final script with odds
final_over_df=final_freq4[['player','1_runs_rate_over','2_runs_rate_over','3_runs_rate_over','4_runs_rate_over']]
final_under_df=final_freq4[['player','1_runs_rate_under','2_runs_rate_under','3_runs_rate_under','4_runs_rate_under']]

In [58]:
final_over_df

Unnamed: 0,player,1_runs_rate_over,2_runs_rate_over,3_runs_rate_over,4_runs_rate_over
0,Aaron Judge,0.591837,0.265306,0.040816,0.020408
1,Adam Frazier,0.259259,0.074074,0.000000,0.000000
2,Adley Rutschman,0.440000,0.100000,0.000000,0.000000
3,Adolis Garcia,0.404762,0.047619,0.000000,0.000000
4,Alec Bohm,0.432432,0.027027,0.000000,0.000000
...,...,...,...,...,...
225,Yandy Diaz,0.372093,0.093023,0.011628,0.000000
226,Yordan Alvarez,0.409091,0.090909,0.022727,0.000000
227,Zach McKinstry,0.480000,0.000000,0.000000,0.000000
228,Zach Neto,0.400000,0.100000,0.000000,0.000000


In [59]:
final_under_df

Unnamed: 0,player,1_runs_rate_under,2_runs_rate_under,3_runs_rate_under,4_runs_rate_under
0,Aaron Judge,0.525000,0.150000,0.025000,0.025
1,Adam Frazier,0.150000,0.000000,0.000000,0.000
2,Adley Rutschman,0.264706,0.000000,0.000000,0.000
3,Adolis Garcia,0.212121,0.060606,0.000000,0.000
4,Alec Bohm,0.268293,0.073171,0.000000,0.000
...,...,...,...,...,...
225,Yandy Diaz,0.289855,0.043478,0.000000,0.000
226,Yordan Alvarez,0.303030,0.151515,0.030303,0.000
227,Zach McKinstry,0.388889,0.000000,0.000000,0.000
228,Zach Neto,0.387097,0.064516,0.000000,0.000


In [60]:
final_over_df.to_csv('over_runs_rate.csv')

In [61]:
final_under_df.to_csv('under_runs_rate.csv')

In [62]:
print('done')

done
