# load packages

In [None]:
import pandas as pd

# make liftover input

## read in sumstats

In [None]:
# Load the space-delimited summary statistics table, then show its
# dimensions and the first rows as a quick sanity check.
sumstats_path = 'formatted_invnormTSH_overall_130421_invvar1.txt-QCfiltered_GC.txt.gz'
sumstats = pd.read_csv(sumstats_path, sep=' ')
print(sumstats.shape)
sumstats.head()

## clean sumstats
- add chromosome and position columns
- remove SNP column
- make allele columns upper case

In [None]:
# MarkerName is colon-delimited; its three fields become CHR, POS and a
# temporary SNP column (the assignment requires exactly three fields).
marker_fields = sumstats['MarkerName'].str.split(':', expand=True)
sumstats[['CHR', 'POS', 'SNP']] = marker_fields

# Normalise both allele columns to upper case.
for allele_col in ('Allele1', 'Allele2'):
    sumstats[allele_col] = sumstats[allele_col].str.upper()

# The third MarkerName field is not needed downstream.
sumstats = sumstats.drop(columns=['SNP'])
sumstats.head()

## format to liftover input

In [None]:
# Build the three-column liftover input: 'chr'-prefixed chromosome,
# position, and position + 1.
# Take an explicit copy so the column assignments below act on an
# independent frame rather than on a selection of `sumstats`; without it
# pandas emits SettingWithCopyWarning on each assignment.
liftover_input = sumstats[['CHR', 'POS']].copy()
# POS comes from a string split, so it must be cast before arithmetic.
liftover_input['POS_2'] = liftover_input['POS'].astype(int) + 1
liftover_input['CHR'] = 'chr' + liftover_input['CHR']
liftover_input.head()

## export sumstats and liftover input

In [None]:
# Write the cleaned summary statistics as a gzip-compressed, tab-separated
# file. `index` is documented as a bool, so pass False explicitly instead
# of relying on None being treated as falsy.
clean_sumstats_path = 'formatted_invnormTSH_overall_130421_invvar1.txt-QCfiltered_GC.clean.txt.gz'
sumstats.to_csv(clean_sumstats_path, sep='\t', compression='gzip', index=False)

In [None]:
# Write the liftover input as a space-separated file with no header row and
# no row index. `index` is documented as a bool, so pass False explicitly
# instead of relying on None being treated as falsy.
liftover_input_path = 'formatted_invnormTSH_overall_130421_invvar1.txt-QCfiltered_GC.liftover_input.txt'
liftover_input.to_csv(liftover_input_path, sep=' ', header=False, index=False)

# clean liftover output

## read in liftover output

In [None]:
# Load the lifted-over coordinates: tab-separated, no header row, so the
# columns are labelled 0, 1, 2, ... by pandas.
liftover_output_path = 'formatted_invnormTSH_overall_130421_invvar1.txt-QCfiltered_GC.liftover_output.txt'
liftover_output = pd.read_csv(liftover_output_path, sep='\t', header=None)
liftover_output.head()

In [None]:
# Load the records that could not be lifted over: tab-separated, no header
# row; anything from a '#' to the end of the line is treated as a comment
# and skipped.
liftover_failed_path = 'formatted_invnormTSH_overall_130421_invvar1.txt-QCfiltered_GC.liftover_failed.txt'
liftover_failed = pd.read_csv(
    liftover_failed_path,
    sep='\t',
    comment='#',
    header=None,
)
liftover_failed.head()

## clean liftover failed output

In [None]:
# Put the failed-record coordinates in the same form as the sumstats
# CHR/POS columns: no 'chr' prefix, no surrounding whitespace, and
# positions held as strings.
failed_chrom = liftover_failed[0].str.replace('chr', '').str.strip()
failed_pos = liftover_failed[1].astype(str).str.strip()
liftover_failed[0] = failed_chrom
liftover_failed[1] = failed_pos
liftover_failed.head()

## remove extra whitespace from sumstats

In [None]:
# Trim stray whitespace from both coordinate columns so they compare equal
# to the cleaned coordinates of the failed liftover records.
for coord_col in ('CHR', 'POS'):
    sumstats[coord_col] = sumstats[coord_col].str.strip()

## set index

In [None]:
# Index by (CHR, POS) while keeping both as regular columns (drop=False),
# since they are still referenced by name further down.
coord_keys = ['CHR', 'POS']
sumstats = sumstats.set_index(coord_keys, drop=False)
sumstats.head()

In [None]:
# Index the failed records by their first two columns (chromosome and
# position) so they can be matched against the sumstats index.
failed_keys = [0, 1]
liftover_failed = liftover_failed.set_index(failed_keys)
liftover_failed.head()

## filter out variants that failed liftover

In [None]:
# Keep only the rows whose (CHR, POS) index entry is absent from the failed
# liftover records, then report the row counts of all three tables.
failed_mask = sumstats.index.isin(liftover_failed.index)
sumstats_b38 = sumstats[~failed_mask]
for frame in (sumstats, sumstats_b38, liftover_failed):
    print(len(frame.index))

## add new coordinates

In [None]:
# The liftover input written earlier holds only coordinates, so the lifted
# rows can be paired with the surviving sumstats rows by row order alone.
# NOTE(review): this assumes the liftover tool keeps the input row order - confirm.
n_kept = len(sumstats_b38.index)
n_lifted = len(liftover_output.index)
print(n_kept)

# A positional concat pads the shorter frame with NaN instead of failing, which
# would silently attach coordinates to the wrong variants. Stop here instead.
if n_kept != n_lifted:
    raise ValueError(
        f'row count mismatch: {n_kept} sumstats rows after removing failed '
        f'variants vs {n_lifted} liftover output rows'
    )

# Both indexes are reset so the frames line up on 0..n-1.
sumstats_b38 = pd.concat(
    [sumstats_b38.reset_index(drop=True), liftover_output.reset_index(drop=True)],
    axis=1,
)
print(len(sumstats_b38.index))
sumstats_b38.head()

In [None]:
# Drop the pre-liftover coordinates, the third liftover column and MarkerName,
# then promote liftover columns 0 and 1 to the new CHR and POS.
superseded_columns = ['CHR', 'POS', 2, 'MarkerName']
sumstats_b38_clean = (
    sumstats_b38
    .drop(columns=superseded_columns)
    .rename(columns={0: 'CHR', 1: 'POS'})
)
sumstats_b38_clean['CHR'] = sumstats_b38_clean['CHR'].str.replace('chr', '')

# Discard rows whose chromosome name contains 'KI', 'GL' or 'Un'.
# NOTE(review): contig names without these fragments pass the filter; check
# the unique values printed below for anything unexpected.
on_excluded_contig = sumstats_b38_clean['CHR'].str.contains('KI|GL|Un')
sumstats_b38_clean = sumstats_b38_clean[~on_excluded_contig]
print(sumstats_b38_clean['CHR'].unique())
print(len(sumstats_b38_clean.index))
sumstats_b38_clean.head()

## export b38 sumstats

In [None]:
# Write the lifted-over summary statistics as a gzip-compressed,
# tab-separated file. `index` is documented as a bool, so pass False
# explicitly instead of relying on None being treated as falsy.
b38_sumstats_path = 'formatted_invnormTSH_overall_130421_invvar1.txt-QCfiltered_GC.clean.b38.txt.gz'
sumstats_b38_clean.to_csv(b38_sumstats_path, sep='\t', compression='gzip', index=False)