# Derive specific genotypes and a genetic risk score

Created on 2019/03/04   
By Hirotaka Iwaki    

The specific genotypes and the summarized score are derived from the most recent GWAS meta-analysis (M. Nalls et al., https://doi.org/10.1101/388165).

##  Genetic risk score
### Create a folder in /lscratch and set it as the working directory

In [1]:
import datetime
import glob
import io
import os
import shlex
import subprocess
import warnings
from functools import partial
from os import chdir

import numpy as np
import pandas as pd
from IPython.display import FileLinks, FileLink

# Per-job working directory under /lscratch, keyed by the SLURM job id.
# Every relative path used later in the notebook resolves inside this folder.
tmp = '/lscratch/{}/GRS'.format(os.environ['SLURM_JOB_ID'])
if not os.path.exists(tmp):
    os.mkdir(tmp)
    print('Make', tmp)
else:
    print(tmp, 'exists')

chdir(tmp)
    
def submitTerminal(command, printing=False):
    """Run a command line without a shell and hand back what it wrote to stdout.

    Parameters
    ----------
    command : str
        The command line to execute. It is tokenised with ``shlex.split``, so
        runs of spaces are ignored and quoted arguments stay in one piece.
        No shell is involved: pipes, redirects and globs are not interpreted.
    printing : bool, default False
        When True the captured stdout is printed and nothing is returned;
        when False it is returned as a string.

    Returns
    -------
    str or None
        UTF-8 decoded stdout, or None when ``printing`` is True.

    Warns
    -----
    RuntimeWarning
        If the command exits with a non-zero status. The output is still
        returned/printed so existing callers keep working.
    """
    # command.split(' ') produced empty argv entries for consecutive spaces and
    # could not express quoted arguments; shlex.split handles both.
    res = subprocess.run(shlex.split(command), stdout=subprocess.PIPE)
    if res.returncode != 0:
        # Surface failures instead of letting later cells read stale or missing files.
        warnings.warn(
            'Command exited with status {}: {}'.format(res.returncode, command),
            RuntimeWarning,
        )
    output = res.stdout.decode('utf-8')
    if printing:
        print(output)
        return None
    return output
def read_vcf(path):
    """Load a VCF text file into a DataFrame.

    Lines starting with '##' (meta-information) are skipped; the single
    '#CHROM ...' header line supplies the column names. The eight fixed
    columns get explicit dtypes (POS as int, the rest as str) and the
    '#CHROM' column is renamed to 'CHROM'. Any further columns are left to
    pandas' type inference.

    Parameters
    ----------
    path : str
        Path of an uncompressed VCF file.

    Returns
    -------
    pandas.DataFrame
    """
    fixed_column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    with open(path, 'r') as handle:
        body = ''.join(line for line in handle if not line.startswith('##'))
    table = pd.read_csv(io.StringIO(body), sep='\t', dtype=fixed_column_types)
    return table.rename(columns={'#CHROM': 'CHROM'})

# Rebind both IPython link helpers so they always get url_prefix=tmp,
# i.e. the links they render are prefixed with the scratch working directory.
FileLink, FileLinks = (
    partial(link_helper, url_prefix=tmp) for link_helper in (FileLink, FileLinks)
)

/lscratch/21965786/GRS exists


In [2]:
# Build the scoring file: one row per variant that passed final filtering/QC,
# holding the SNP id, the effect allele and the beta from all studies.
df = pd.read_csv("/data/LNG/iwakih2/tool/meta5/Meta5new.csv")
passed_qc = df['Failed final filtering and QC'] == 0
# .copy() makes risk90 an independent frame; assigning a column onto a filtered
# slice of df would otherwise raise SettingWithCopyWarning.
risk90 = df.loc[passed_qc, ['SNP', 'Effect allele', 'Beta, all studies']].copy()
# Upper-case the effect allele letters before writing them out.
risk90['Effect allele'] = risk90['Effect allele'].str.upper()
# Written without header or index: three tab-separated columns per line.
risk90.to_csv('score.txt', header=False, index=False, sep='\t')

In [3]:
# Score every sample in the PPMI WGS fileset against score.txt.
# The run uses the output prefix 'score'; a later cell reads 'score.profile'.
sPlinkScore = 'plink --bfile {B} --score {S} --out {O}'
score_args = dict(
    B="/data/LNG/PPMI_WGS/july_2018/PPMI_july2018",
    S='score.txt',
    O='score',
)
COMMAND = sPlinkScore.format(**score_args)
t = submitTerminal(COMMAND)

In [4]:
# create genotypes
# Step 1: subset the WGS fileset to the variants named in score.txt and write
# the result as a new binary fileset with the prefix 'risk'.
sPlinkVariants = 'plink --bfile {B} --extract {E} --make-bed --out {O}'
COMMAND = sPlinkVariants.format(B="/data/LNG/PPMI_WGS/july_2018/PPMI_july2018",
                  E = 'score.txt',
                  O = 'risk')
# This command has to be submitted before COMMAND is reused below; previously it
# was overwritten without ever running, so the 'risk' fileset that step 2 reads
# was never created on a fresh run.
t = submitTerminal(COMMAND)

# Step 2: recode the 'risk' fileset as VCF (output prefix 'recode'); a later
# cell reads 'recode.vcf'.
sPlinkRecode = 'plink --bfile {B} --recode {R} --out {O}'
COMMAND = sPlinkRecode.format(B="risk", R = 'vcf', O = 'recode')
t = submitTerminal(COMMAND)

## Create report file 1
Derive the necessary information from the files created above.
### Score

In [5]:
# Read the whitespace-delimited score.profile table written by the scoring run.
# sep=r'\s+' is the supported equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv('score.profile', sep=r'\s+')

In [6]:
# Reshape the profile into report rows: sample id -> PATNO, score -> TESTVALUE,
# with every row labelled as the 'GRS' test.
score = (
    df[['IID', 'SCORE']]
    .rename(columns={'IID': 'PATNO', 'SCORE': 'TESTVALUE'})
    .assign(TESTNAME='GRS')
)
score.to_csv('score.csv', index=False)

### Genotype

In [7]:
# Keep only the variant identity (ID, REF, ALT) and the per-sample columns;
# the remaining fixed VCF fields are not needed for the report.
unused_vcf_fields = ['CHROM', 'POS', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
snps = read_vcf('recode.vcf').drop(columns=unused_vcf_fields)

In [8]:
# Wide -> long: one row per (variant, sample) pair.
genotypes_by_variant = snps.set_index(['ID', 'REF', 'ALT'])
df = genotypes_by_variant.stack().reset_index()
# Positional relabelling: the variant ID becomes TESTNAME, the former sample
# column label becomes ID, and the stacked cell value becomes GENOTYPE.
df.columns = ['TESTNAME', 'REF', 'ALT', 'ID', 'GENOTYPE']


In [9]:
def convertResult(x):
    """Translate a numeric genotype call into its allele letters.

    ``x`` is a row-like object exposing ``GENOTYPE``, ``REF`` and ``ALT``.
    '0/0' -> 'REF/REF', '0/1' -> 'REF/ALT', '1/1' -> 'ALT/ALT'; any other
    GENOTYPE value yields NaN.
    """
    # Which attribute supplies each side of the call, per recognised genotype.
    allele_fields = {
        '0/0': ('REF', 'REF'),
        '0/1': ('REF', 'ALT'),
        '1/1': ('ALT', 'ALT'),
    }
    fields = allele_fields.get(x.GENOTYPE)
    if fields is None:
        return np.nan
    first, second = (getattr(x, field) for field in fields)
    return first + '/' + second
# Row-wise translation of each numeric call into allele letters.
allele_calls = df.apply(convertResult, axis=1)
# PATNO is the text before the first '_' of the sample column label
# (presumably the FID part of an FID_IID label — verify against the VCF header).
id_parts = df.ID.str.split('_', expand=True)
df['TESTVALUE'] = allele_calls
df['PATNO'] = id_parts[0]

In [10]:
# Persist only the three report columns, in report order.
genotype_columns = ['PATNO', 'TESTNAME', 'TESTVALUE']
df.to_csv('genotype.csv', columns=genotype_columns, index=False)

### Combine two datasets

In [11]:
# Stack the per-variant genotype rows and the GRS rows into one long table,
# ordered by PATNO, and save it for the barcode-matching step.
df1, df2 = (pd.read_csv(name) for name in ('genotype.csv', 'score.csv'))
df = pd.concat([df1, df2], ignore_index=True, sort=True).sort_values(['PATNO'])
df.to_csv('resuls.csv', index=False)

**Then combine with the barcode table**    
The data should contain the barcode, not PATNO.    
The matching table is created in another script.

In [13]:
df1 = pd.read_csv('resuls.csv')
# Strip the literal 'PPMISI' prefix so the index lines up with the matching
# table's PATNO index. regex=False makes this a plain-text replacement on every
# pandas version instead of relying on the version-dependent default.
# NOTE(review): the join only matches if both indexes hold the same dtype — confirm
# that MatchingTable.csv's PATNO column is read as strings.
df1.index = df1['PATNO'].str.replace('PPMISI', '', regex=False) # change index for join

df2 = pd.read_csv('/data/LNG/iwakih2/dataset/PPMI/WGS/Shipping/MatchingTable.csv', index_col='PATNO')
# Delete unshipped rows (keep those where Unshipped is null) and only use the
# Reporting_ID column.
df2 = df2.loc[df2['Unshipped'].isna(), ['Reporting_ID']]

df = df1.join(df2, how='inner')
# .copy() detaches the filtered frame, so the column assignments below do not
# raise SettingWithCopyWarning.
df = df[df['Reporting_ID'].notna()].copy()
df['PROJECTID'] = np.nan
df['UNITS'] = np.nan
df['SAMPLEID'] = df['Reporting_ID']
# Run date is stamped as day/month/year at execution time.
df['RUNDATE'] = datetime.datetime.now().strftime("%d/%m/%Y")

In [14]:
df.shape

(122486, 8)

In [17]:
# Final report: fixed column set and order, no index column.
report_columns = ['PROJECTID', 'SAMPLEID', 'TESTNAME', 'TESTVALUE', 'UNITS', 'RUNDATE']
df.to_csv('WGS_report.csv', columns=report_columns, index=False)

In [18]:
# Shell step: put WGS_report.csv into WGS_report.zip in the current working directory.
!zip WGS_report.zip WGS_report.csv

  adding: WGS_report.csv (deflated 89%)


In [19]:
# Shell step: copy the archive out of the working directory into the reports
# folder under /data (hard-coded destination path).
!cp WGS_report.zip /data/LNG/iwakih2/dataset/PPMI/WGS/reports/

In [16]:
# Show links to the files in the current working directory, using the
# url_prefix-bound FileLinks defined in the first cell.
FileLinks('.')