In [5]:
import ast
import pandas as pd
from connections import AWS
from sklearn.model_selection import train_test_split

$\textbf{Epidemiology: Clinical Train/Test Splits}$

Separate the cohort of matched pitchers into train and test sets for model development.

In [3]:
# setup AWS connection
    # `AWS` is the project-local helper imported from `connections`; the `aws` handle
    # created here is reused by the later cells for the S3 read/upload calls and is
    # closed in the final cell
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


In [6]:
# pull the matched-cohort table from S3
    # each row carries an injured pitcher ID, a season, and a stringified list of
    # the non-injured pitcher IDs matched to that pitcher
n_matches = 5
cohort_matches = aws.load_s3_object(f'epidemiology/cohorts/injured/pitcher_info/matches_{n_matches}_per_pitcher.csv')

# flatten to one record per pitcher-season, labelled by injury status
    # order: the injured pitcher first, then that row's non-injured matches
cohort_records = []
match_columns = zip(
    cohort_matches['mlbamid_injured'],
    cohort_matches['season'],
    cohort_matches['mlbamid_noninjured']
)
for injured_id, season, noninjured_ids in match_columns:
    cohort_records.append({'pitcher': injured_id, 'season': season, 'injured': 1})

    # the non-injured IDs are stored as a list literal in a string column
    cohort_records.extend(
        {'pitcher': noninjured_id, 'season': season, 'injured': 0}
        for noninjured_id in ast.literal_eval(noninjured_ids)
    )

# build the frame once from the accumulated records
cohort_info = pd.DataFrame(cohort_records)

In [9]:
# get train/test split (test size: 25% of matched sets)
    # NOTE: the split is drawn over matched sets (one injured pitcher + their
    # non-injured matches), so every set lands wholly in train or wholly in test.
    # A row-level split stratified on `injured` only balances the class ratio; it
    # scatters the members of a matched set across both splits.
    # NOTE(review): a pitcher who appears in more than one matched set can still
    # end up in both splits -- confirm whether that matters downstream.

# tag each cohort_info row w/ the position of the cohort_matches row it was expanded from
    # (cohort_info holds one injured row followed by that match's non-injured rows,
    # in cohort_matches order)
match_sizes = [
    1 + len(ast.literal_eval(noninjured_ids))
    for noninjured_ids in cohort_matches['mlbamid_noninjured']
]
assert sum(match_sizes) == len(cohort_info), 'cohort_info rows do not line up with cohort_matches'
match_ids = pd.Series(
    [match_id for match_id, size in enumerate(match_sizes) for _ in range(size)],
    index=cohort_info.index
)

# sample whole matched sets; the injured/non-injured ratio in each split then
# follows from the composition of the sets themselves
_, test_matches = train_test_split(
    list(range(len(match_sizes))),
    test_size=0.25,
    random_state=22
)

# route every row to the split its matched set was assigned to
in_test = match_ids.isin(test_matches)
train_ids = cohort_info[~in_test]
test_ids = cohort_info[in_test]


In [14]:
# push both split tables to S3 under a shared prefix
path_stem = 'epidemiology/ml/datasets/full'
split_uploads = [
    (train_ids, f'{path_stem}/cohort_train_ids.csv'),
    (test_ids, f'{path_stem}/cohort_test_ids.csv'),
]

# train first, then test
for split_frame, s3_key in split_uploads:
    aws.upload_to_s3(split_frame, s3_key)

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/full/cohort_train_ids.csv
[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/full/cohort_test_ids.csv


In [15]:
# close AWS connection
    # last use of the `aws` handle in this notebook; the logged output reports the
    # database connection closing and the SSH tunnel stopping
aws.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
