In [1]:
import pandas as pd
import argparse as args
import numpy as np

#%load_ext rpy2.ipython

# Format required for LD-Score Regression:

GWAS data should be whitespace-delimited text, with one row per SNP and with a header row. 

#### Required columns:
1. `SNP` -- SNP identifier (e.g., rs number)
2. `N` -- sample size (which may vary from SNP to SNP).
3. `Z` -- z-score. Sign with respect to A1 (warning, possible gotcha)
4. `A1` -- first allele (effect allele)
5. `A2` -- second allele (other allele)


Column order does not matter. 

Note that `ldsc` filters out all variants that are not SNPs and strand-ambiguous SNPs.

# Educational attainment formatting:

#### Reading the data:

In [2]:
education = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/education_okbay", 
                delim_whitespace=True, header=0, na_values='NA'))

education

Unnamed: 0,rsID,Chr,BP,Effect_allele,Other_allele,EAF_HRC,Beta,SE,SE_unadj,P,P_unadj
0,rs667647,5,29439275,T,C,0.376548,-0.00032,0.00179,0.00167,0.86000,0.85040
1,rs113534962,5,85928892,T,C,0.074700,-0.00105,0.00359,0.00336,0.76960,0.75420
2,rs559397866,2,170966953,T,C,0.989375,0.00724,0.00939,0.00879,0.44090,0.41000
3,rs2366866,10,128341232,T,C,0.464660,0.00336,0.00174,0.00163,0.05336,0.03888
4,rs540077909,5,46391045,A,C,0.998845,0.01236,0.02326,0.02176,0.59520,0.57000
...,...,...,...,...,...,...,...,...,...,...,...
10985942,rs2042186,5,95076854,A,G,0.936280,0.00199,0.00351,0.00328,0.56980,0.54350
10985943,rs148163863,2,101784065,A,G,0.995842,-0.00067,0.01499,0.01402,0.96430,0.96180
10985944,rs144940089,9,134128772,A,G,0.005282,-0.01762,0.01279,0.01196,0.16830,0.14080
10985945,rs9666475,11,99622387,T,C,0.780875,-0.00477,0.00223,0.00209,0.03255,0.02230


Z and N columns are missing.

#### Adding Z and N column:

In [4]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "BETA", "SE", "SE_unadj", "P", "P_unadj", "Z", "N" } NR>2 { print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12 = ($7/$8), $13 = (765283) }' ~/alzheimersproject/1_raw_data/education_okbay > education_z_n


#### Removing potential non-rsIDs in the SNP column:

In [5]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' education_z_n > education_onlyrs

#### Removing possible duplicates i the file:

In [6]:
%%bash

awk '!seen[$1]++' education_onlyrs > education_onlyrs_uniq

#### Checking if it worked:

In [7]:
education = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/education_onlyrs_uniq", 
                delim_whitespace=True, header=0, na_values='NA'))

education

Unnamed: 0,SNP,CHR,BP,A1,A2,FREQ,BETA,SE,SE_unadj,P,P_unadj,Z,N
0,rs113534962,5,85928892,T,C,0.074700,-0.00105,0.00359,0.00336,0.76960,0.75420,-0.292479,765283
1,rs559397866,2,170966953,T,C,0.989375,0.00724,0.00939,0.00879,0.44090,0.41000,0.771033,765283
2,rs2366866,10,128341232,T,C,0.464660,0.00336,0.00174,0.00163,0.05336,0.03888,1.931030,765283
3,rs540077909,5,46391045,A,C,0.998845,0.01236,0.02326,0.02176,0.59520,0.57000,0.531384,765283
4,rs472303,3,62707519,T,C,0.061056,-0.00748,0.00372,0.00348,0.04447,0.03168,-2.010750,765283
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10985941,rs2042186,5,95076854,A,G,0.936280,0.00199,0.00351,0.00328,0.56980,0.54350,0.566952,765283
10985942,rs148163863,2,101784065,A,G,0.995842,-0.00067,0.01499,0.01402,0.96430,0.96180,-0.044697,765283
10985943,rs144940089,9,134128772,A,G,0.005282,-0.01762,0.01279,0.01196,0.16830,0.14080,-1.377640,765283
10985944,rs9666475,11,99622387,T,C,0.780875,-0.00477,0.00223,0.00209,0.03255,0.02230,-2.139010,765283


#### Checking for NaN, NA, inf:

In [8]:
# 1. Checking for NaN:
print(education.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(education.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((education.isin([np.inf, -np.inf])).sum())

SNP         0
CHR         0
BP          0
A1          0
A2          0
FREQ        0
BETA        0
SE          0
SE_unadj    0
P           0
P_unadj     0
Z           0
N           0
dtype: int64

------------------------------

SNP         0
CHR         0
BP          0
A1          0
A2          0
FREQ        0
BETA        0
SE          0
SE_unadj    0
P           0
P_unadj     0
Z           0
N           0
dtype: int64

------------------------------

SNP         0
CHR         0
BP          0
A1          0
A2          0
FREQ        0
BETA        0
SE          0
SE_unadj    0
P           0
P_unadj     0
Z           0
N           0
dtype: int64


In [9]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $10 == "0.0" || $10 == "0"' education_onlyrs_uniq | head

SNP	CHR	BP	A1	A2	FREQ	BETA	SE	SE_unadj	P	P_unadj	Z	N


#### Rearranging columns:

In [2]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR!=1 { print $1, $2, $3, $4, $5, $12, $10, $13, $6, $7, $8 }' education_onlyrs_uniq > education_formatted

#### Rearranging columns for ldsc:

In [10]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "P", "N", "Z", "BETA", "SE" } NR!=1 { print $1, $2, $3, $4, $5, $6, $10, $13, $12 }' education_onlyrs_uniq > ~/alzheimersproject/ldsc_formatted/education_ldsc_form

#### Rearranging columns for gsmr:

In [11]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "A1", "A2", "freq", "b", "se", "p", "N" } NR!=1 { print $1, $4, $5, $6, $7, $8, $10, $13 }' education_onlyrs_uniq > ~/alzheimersproject/gsmr_formatted/education_gsmr_form