# New version validations
This notebook checks the subject IDs in a new version of a study against the previous version. The goal is to confirm that subject IDs have not changed between versions, because changed IDs could cause problems with genomic data or incorrect patient mapping in PIC-SURE.

### Prerequisites
- Access to the S3 bucket
- Files from the new study version, downloaded via the "Pull raw data from gen3" Jenkins job

In [None]:
import os

import pandas as pd

from check_version_utils import check_new_version, check_new_df

In [None]:
# Set these to the directories that contain the files to compare.
# Both paths end with '/'; later cells append file names to them.
old_dir = '/home/ec2-user/SageMaker/studies/ALL-avillach-73-bdcatalyst-etl/whi/rawDataOld/' # old version files
new_dir = '/home/ec2-user/SageMaker/studies/ALL-avillach-73-bdcatalyst-etl/whi/rawData/' # newly downloaded file versions

### Comparing Subject_MULTI files

In [None]:
# Check all columns of the subject_multi file, except those listed in exclude_cols

In [None]:
# Candidate subject ID column names; passed to check_new_version for the
# Subject.MULTI comparison and reused when building include_cols for the
# Sample.MULTI comparison.
subject_cols = ['INDIVIDUAL_ID', 'SUBJID']
# Passed to check_new_df as exclude_cols for the Subject.MULTI comparison.
exclude_cols = ['DBGAP_SUBJECT_ID']

In [None]:
# Full paths to the old and new Subject.MULTI files.
# os.path.join is used instead of '+' so the paths stay valid even when
# old_dir / new_dir are edited to a value without a trailing '/'.
old_sub_multi = os.path.join(old_dir, 'phs000200.v11.pht000982.v8.p3.WHI_Subject.MULTI.txt')
new_sub_multi = os.path.join(new_dir, 'phs000200.v12.pht000982.v8.p3.WHI_Subject.MULTI.txt')

In [None]:
# Run check_new_version on the old and new Subject.MULTI files using the
# subject ID columns.
# NOTE(review): the two return values presumably hold the IDs that differ on
# the old and new side respectively — confirm in check_version_utils.
old_diffs, new_diffs = check_new_version(
    old_sub_multi,
    new_sub_multi,
    subject_cols,
)

In [None]:
# Run check_new_df on the Subject.MULTI files, forwarding the diffs found by
# check_new_version above.
# NOTE(review): include_cols=None presumably means "use every column that is
# not in exclude_cols" — confirm in check_version_utils.
old_data, new_data = check_new_df(
    old_sub_multi,
    new_sub_multi,
    include_cols=None,
    exclude_cols=exclude_cols,
    old_diffs=old_diffs,
    new_diffs=new_diffs,
)

In [None]:
# Manual inspection of dataframes
#old = pd.read_csv(old_sub_multi, sep = '\t', skiprows=10)
#new = pd.read_csv(new_sub_multi, sep = '\t', skiprows=10)
#old
#new

### Comparing Sample_MULTI files

In [None]:
# Compare the subject ID and sample ID columns: both should match for each row across the old and new versions

In [None]:
# Candidate sample ID column names; passed to check_new_version for the
# Sample.MULTI comparison.
sample_cols = ['SAMPID', 'SAMPLE_ID', 'SAMPLEID']
# Sample ID columns followed by the subject ID columns; passed to
# check_new_df as include_cols.
include_cols = [*sample_cols, *subject_cols]

In [None]:
# Full paths to the old and new Sample.MULTI files.
# os.path.join is used instead of '+' so the paths stay valid even when
# old_dir / new_dir are edited to a value without a trailing '/'.
old_sam_multi = os.path.join(old_dir, 'phs000200.v11.pht001032.v8.p3.WHI_Sample.MULTI.txt')
new_sam_multi = os.path.join(new_dir, 'phs000200.v12.pht001032.v9.p3.WHI_Sample.MULTI.txt')

In [None]:
# Run check_new_version on the old and new Sample.MULTI files using the
# sample ID columns. This rebinds old_diffs / new_diffs from the subject
# comparison earlier in the notebook.
old_diffs, new_diffs = check_new_version(
    old_sam_multi,
    new_sam_multi,
    sample_cols,
)

In [None]:
# Display the second value returned by check_new_version for manual review
# (presumably the sample IDs that differ in the new file — confirm in
# check_version_utils).
new_diffs

In [None]:
# Run check_new_df on the Sample.MULTI files, passing the sample and subject
# ID columns as include_cols and forwarding the diffs found by
# check_new_version for the sample files.
old_data, new_data = check_new_df(
    old_sam_multi,
    new_sam_multi,
    include_cols=include_cols,
    exclude_cols=None,
    old_diffs=old_diffs,
    new_diffs=new_diffs,
)