In [None]:
import pandas as pd
from connections import AWS

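# Epidemiology: Create Final Cohort with Matches

Filters the matched injured-pitcher cohort down to the elbow-surgery injury types listed below and uploads the resulting matches to S3.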

In [None]:
""" INITIALIZE AWS CONNECTION """
# `AWS` is the project-local helper imported from `connections`; the `aws`
# handle created here is used by the later cells for the S3 loads, the S3
# upload, and the final `aws.close()`.
aws = AWS()
# presumably opens the underlying connection/session — confirm in `connections.AWS`
aws.connect()

In [None]:
# load cohort of matches
cohort_matches = aws.load_s3_object('epidemiology/cohorts/injured/pitcher_info/matches_0825.csv')

# load injury metadata; `season` is taken from the first four characters of
# `injury_date` (assumes the date strings start with a 4-digit year — confirm)
cohort_metadata = aws.load_s3_object('epidemiology/cohorts/injured/pitcher_info/pitchers_0825.csv')
cohort_metadata['season'] = cohort_metadata['injury_date'].str[0:4].astype(int)

# join injury metadata to the matches on (pitcher, season).
# `season` is kept in the result so the filter below can distinguish several
# injuries that belong to the same pitcher in different seasons.
injury_info = cohort_metadata.merge(
    cohort_matches,
    left_on=['mlbamid', 'season'],
    right_on=['mlbamid_injured', 'season'],
    how='inner',
)[['mlbamid', 'season', 'injury_type']]

# filter out non-tissue injuries
#   NOTE(review): the count of 209 remaining injuries was recorded with an
#   id-only filter; re-verify it now that the filter is season-aware.
valid_injuries = [
    'tommy_john_surgery',
    'elbow_surgery_(internal_brace)',
    'elbow_surgery',
    'arthroscopic_elbow_surgery',
    'elbow_surgery_(ucl)',
    'tommy_john_surgery_(internal_brace)',
]
valid_injury_info = injury_info[injury_info['injury_type'].isin(valid_injuries)].reset_index(drop=True)

# Keep only matches whose (injured pitcher, season) pair has a valid injury.
# Filtering on `mlbamid_injured` alone would also keep matches from a pitcher's
# other seasons, where the injury on record is not one of `valid_injuries`.
# The keys are de-duplicated first so the inner merge acts as a pure filter
# (semi-join) and never multiplies rows of `cohort_matches`.
valid_keys = (
    valid_injury_info[['mlbamid', 'season']]
    .drop_duplicates()
    .rename(columns={'mlbamid': 'mlbamid_injured'})
)
cohort_matches_final = cohort_matches.merge(
    valid_keys,
    on=['mlbamid_injured', 'season'],
    how='inner',
).reset_index(drop=True)

# upload final cohort matches to S3
aws.upload_to_s3(cohort_matches_final, 'epidemiology/ml/datasets/full/cohort_matches_final.csv')

In [None]:
# close AWS connection
# Keep this as the final cell: the earlier cells use `aws` for the S3 loads
# and the upload, so the handle should only be closed once they have run.
aws.close()