# New version validations
The purpose of this notebook is to compare the subject and sample IDs in a new version of a study against the previous version. This ensures that the IDs have not changed between versions, which could cause problems with genomic data or incorrect patient mapping in PIC-SURE.

### Prerequisites
- Access to the S3 bucket
- Files from new study version downloaded via "Pull raw data from gen3" Jenkins job

In [None]:
import pandas as pd
from check_version_utils import check_new_version

In [None]:
# Set these to the directories that hold the old and new versions of the study files.
# Keep the trailing slash: file paths below are built by plain string concatenation.
old_dir = '/home/ec2-user/SageMaker/studies/ALL-avillach-73-bdcatalyst-etl/chs/rawDataOld/' # previous version's files
new_dir = '/home/ec2-user/SageMaker/studies/ALL-avillach-73-bdcatalyst-etl/chs/rawData/' # newly downloaded version's files

### Comparing Subject_MULTI files

In [None]:
# Candidate names (upper case) for the subject ID column; file column names are
# upper-cased before being matched against this list.
potential_cols = ['INDIVIDUAL_ID']

In [None]:
# Full paths to the Subject MULTI file in the old (v6) and new (v7) study versions
old_sub_multi = old_dir+'phs000287.v6.pht001447.v4.p1.CHS_Subject.MULTI.txt'
new_sub_multi = new_dir+'phs000287.v7.pht001447.v4.p1.CHS_Subject.MULTI.txt'

In [None]:
# Run the packaged comparison from check_version_utils on the two Subject files.
# NOTE(review): presumably returns the rows found only in the old file and only in
# the new file, respectively — confirm against check_version_utils.
old_diffs, new_diffs = check_new_version(old_sub_multi, new_sub_multi, potential_cols)

In [None]:
# Load both Subject files as tab-separated tables, skipping the first 10 lines.
# NOTE(review): presumably those 10 lines are the file's header/comment block —
# confirm the count still holds for each newly downloaded version.
osm = pd.read_csv(old_sub_multi, sep = '\t', skiprows=10)
nsm = pd.read_csv(new_sub_multi, sep = '\t', skiprows=10)

In [None]:
# Display the old-version Subject table for visual inspection
osm

In [None]:
# Display the new-version Subject table for visual inspection
nsm

In [None]:
# Two synthetic rows shaped like the old Subject table.
# NOTE(review): presumably test scaffolding so the comparison below has something
# to report as "in old not in new" — confirm whether it should remain when
# validating a real study version.
# Each row is a run of consecutive integers (starting at 5 and at 7) sized to the
# table's column count, so construction no longer fails when the file does not
# have exactly five columns. With five columns the values are unchanged:
# [5, 6, 7, 8, 9] and [7, 8, 9, 10, 11].
n_cols = len(osm.columns)
fudge_data = pd.DataFrame(
    [list(range(5, 5 + n_cols)), list(range(7, 7 + n_cols))],
    columns=osm.columns,
)
fudge_data

In [None]:
# Add the synthetic rows to the end of the old Subject table and renumber the index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell appends the rows again; the ID comparison below is
# set-based, so the duplicates do not change its result.
osm = pd.concat([osm, fudge_data], ignore_index=True)
osm

In [None]:
# Choose the ID column to compare: a column of the old table whose upper-cased
# name appears in potential_cols. If several match, the last one wins.
matching_cols = [col for col in osm.columns if col.upper() in potential_cols]

# Always (re)assign compare_col so that "no match" is an explicit None rather
# than a NameError on a fresh kernel or a stale value left by an earlier run.
compare_col = matching_cols[-1] if matching_cols else None

# The chosen column must also exist under the same name in the new table.
if compare_col is None or compare_col not in nsm.columns:
    print("Manual inspection of columns needed")
else:
    print(compare_col)

In [None]:
# Collect the distinct IDs of each version, then keep the ones that appear in
# only one of the two.
old_subject_ids = set(osm[compare_col])
new_subject_ids = set(nsm[compare_col])
in_new_not_old = list(new_subject_ids.difference(old_subject_ids))
in_old_not_new = list(old_subject_ids.difference(new_subject_ids))

In [None]:
# Report, for each direction, either a full match or the number of unmatched IDs.
n_only_in_new = len(in_new_not_old)
n_only_in_old = len(in_old_not_new)

if n_only_in_new:
    print("There are", n_only_in_new, "IDs in new version not in old")
else:
    print("All", compare_col, "in new version are in old")

if n_only_in_old:
    print("There are", n_only_in_old, "IDs in old version not in new")
else:
    print("All", compare_col, "in old version are in new")

In [None]:
# Pull the full rows behind every ID that differs between the two versions and
# print them (nothing is printed for a direction with no differences).
only_in_old_mask = osm[compare_col].isin(in_old_not_new)
old_diffs = osm[only_in_old_mask]
if len(old_diffs) > 0:
    print("Rows in old not in new:\n")
    print(old_diffs)

only_in_new_mask = nsm[compare_col].isin(in_new_not_old)
new_diffs = nsm[only_in_new_mask]
if len(new_diffs) > 0:
    print("Rows in new not in old:\n")
    print(new_diffs)

### Comparing Sample_MULTI files

In [None]:
# Candidate names (upper case) for the sample ID column; file column names are
# upper-cased before being matched against this list.
potential_cols = ['SAMPID', 'SAMPLE_ID', 'SAMPLEID']

In [None]:
# Full paths to the Sample MULTI file in the old (v6) and new (v7) study versions
old_sam_multi = old_dir+'phs000287.v6.pht001448.v6.p1.CHS_Sample.MULTI.txt'
new_sam_multi = new_dir+'phs000287.v7.pht001448.v7.p1.CHS_Sample.MULTI.txt'

In [None]:
# Run the packaged comparison from check_version_utils on the two Sample files.
# NOTE(review): presumably returns the rows found only in the old file and only in
# the new file, respectively — confirm against check_version_utils.
old_diffs, new_diffs = check_new_version(old_sam_multi, new_sam_multi, potential_cols)

In [None]:
# Load both Sample files as tab-separated tables, skipping the first 10 lines.
# NOTE(review): presumably those 10 lines are the file's header/comment block —
# confirm the count still holds for each newly downloaded version.
oam = pd.read_csv(old_sam_multi, sep = '\t', skiprows=10)
nam = pd.read_csv(new_sam_multi, sep = '\t', skiprows=10)

In [None]:
# Choose the ID column to compare: a column of the old Sample table whose
# upper-cased name appears in potential_cols. If several match, the last one wins.
matching_cols = [col for col in oam.columns if col.upper() in potential_cols]

# Always (re)assign compare_col: without this, a Sample file with no matching
# column would silently keep the column name chosen in the Subject section.
compare_col = matching_cols[-1] if matching_cols else None

# The chosen column must also exist under the same name in the new table.
if compare_col is None or compare_col not in nam.columns:
    print("Manual inspection of columns needed")
else:
    print(compare_col)

In [None]:
# Collect the distinct IDs of each version, then keep the ones that appear in
# only one of the two.
old_sample_ids = set(oam[compare_col])
new_sample_ids = set(nam[compare_col])
in_new_not_old = list(new_sample_ids.difference(old_sample_ids))
in_old_not_new = list(old_sample_ids.difference(new_sample_ids))

In [None]:
# Report, for each direction, either a full match or the number of unmatched IDs.
n_only_in_new = len(in_new_not_old)
n_only_in_old = len(in_old_not_new)

if n_only_in_new:
    print("There are", n_only_in_new, "IDs in new version not in old")
else:
    print("All", compare_col, "in new version are in old")

if n_only_in_old:
    print("There are", n_only_in_old, "IDs in old version not in new")
else:
    print("All", compare_col, "in old version are in new")

In [None]:
# Pull the full rows behind every ID that differs between the two versions and
# print them (nothing is printed for a direction with no differences).
only_in_old_mask = oam[compare_col].isin(in_old_not_new)
old_diffs = oam[only_in_old_mask]
if len(old_diffs) > 0:
    print("Rows in old not in new:\n")
    print(old_diffs)

only_in_new_mask = nam[compare_col].isin(in_new_not_old)
new_diffs = nam[only_in_new_mask]
if len(new_diffs) > 0:
    print("Rows in new not in old:\n")
    print(new_diffs)