In [7]:
import pandas as pd
import argparse as args
import numpy as np

#%load_ext rpy2.ipython

# Asthma formatting:

#### Reading data:

In [8]:
asthma = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/asthma_han", 
                delim_whitespace=True, header=0, na_values='NA'))

asthma

Unnamed: 0,SNP,CHR,BP,A1,A2,EAF,INFO,OR,OR_95L,OR_95U,P,N
0,1:692794_CA_C,1,692794,CA,C,0.881938,0.824483,1.000328,0.980323,1.020740,0.97,393859
1,rs12238997,1,693731,A,G,0.880549,0.875969,0.997675,0.978621,1.017101,0.81,393859
2,rs371890604,1,707522,G,C,0.900387,0.803693,0.994999,0.973548,1.016921,0.65,393859
3,rs149887893,1,714596,T,C,0.966844,0.844433,1.011821,0.975761,1.049213,0.53,393859
4,rs12184267,1,715265,C,T,0.963456,0.926915,1.020120,0.986930,1.054426,0.24,393859
...,...,...,...,...,...,...,...,...,...,...,...,...
9572551,rs9616985,22,51229805,T,C,0.926416,0.988928,0.982657,0.960667,1.005150,0.13,393859
9572552,rs368226325,22,51231220,A,G,0.945612,0.868414,0.999810,0.972428,1.027962,0.99,393859
9572553,rs200507571,22,51236013,A,AT,0.753879,0.803616,0.988319,0.973373,1.003495,0.13,393859
9572554,rs3896457,22,51237063,T,C,0.706810,0.852165,1.010189,0.996142,1.024434,0.16,393859


Z and log(OR) column is missing. 

#### Add Z and log(OR) column to data:

In [9]:
%%bash

# To add a Z column, log(OR), log(OR_95L), SE of OR columns are needed:
awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "INFO", "OR", "OR_95L", "OR_95U", "P", "N", "log_OR", "log_OR_95L", "SE_OR", "Z" } NR>2 { print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 = (log($8)), $14 = (log($9)), $15 = (($13-$14)/1.96), $16 = ($13/$15) }' ~/alzheimersproject/1_raw_data/asthma_han > asthma_Z_logOR


#### Counting number of SNPs without rsID after the merge:

In [10]:
%%bash

grep "rs" asthma_Z_logOR | wc -l
wc -l asthma_Z_logOR

9104952
9572557 asthma_Z_logOR


#### Removing SNPs that do not contain rsID:

In [11]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' asthma_Z_logOR > asthma_Z_logOR_onlyrs


#### Removing possible duplicates:

In [12]:
%%bash

awk '!seen[$1]++' asthma_Z_logOR_onlyrs > asthma_Z_logOR_onlyrs_uniq

#### Checking file:

In [14]:
asthma = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/asthma_Z_logOR_onlyrs_uniq", 
                delim_whitespace=True, header=0, na_values='NA'))
asthma

Unnamed: 0,SNP,CHR,BP,A1,A2,FREQ,INFO,OR,OR_95L,OR_95U,P,N,log_OR,log_OR_95L,SE_OR,Z
0,rs12238997,1,693731,A,G,0.880549,0.875969,0.997675,0.978621,1.017101,0.81,393859,-0.002328,-0.021611,0.009839,-0.236567
1,rs371890604,1,707522,G,C,0.900387,0.803693,0.994999,0.973548,1.016921,0.65,393859,-0.005014,-0.026808,0.011119,-0.450934
2,rs149887893,1,714596,T,C,0.966844,0.844433,1.011821,0.975761,1.049213,0.53,393859,0.011751,-0.024538,0.018515,0.634699
3,rs12184267,1,715265,C,T,0.963456,0.926915,1.020120,0.986930,1.054426,0.24,393859,0.019920,-0.013156,0.016876,1.180420
4,rs12184277,1,715367,A,G,0.963328,0.931148,1.018288,0.985286,1.052396,0.28,393859,0.018123,-0.014823,0.016810,1.078150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9097107,rs9616985,22,51229805,T,C,0.926416,0.988928,0.982657,0.960667,1.005150,0.13,393859,-0.017495,-0.040128,0.011547,-1.515120
9097108,rs368226325,22,51231220,A,G,0.945612,0.868414,0.999810,0.972428,1.027962,0.99,393859,-0.000191,-0.027959,0.014167,-0.013447
9097109,rs200507571,22,51236013,A,AT,0.753879,0.803616,0.988319,0.973373,1.003495,0.13,393859,-0.011750,-0.026988,0.007775,-1.511300
9097110,rs3896457,22,51237063,T,C,0.706810,0.852165,1.010189,0.996142,1.024434,0.16,393859,0.010138,-0.003865,0.007144,1.419000


#### Checking for NaN, NA, and Inf:

In [15]:
#1. Checking for NaN:
print(asthma.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(asthma.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((asthma.isin([np.inf, -np.inf])).sum())


SNP           0
CHR           0
BP            0
A1            0
A2            0
FREQ          0
INFO          0
OR            0
OR_95L        0
OR_95U        0
P             0
N             0
log_OR        0
log_OR_95L    0
SE_OR         0
Z             0
dtype: int64

------------------------------

SNP           0
CHR           0
BP            0
A1            0
A2            0
FREQ          0
INFO          0
OR            0
OR_95L        0
OR_95U        0
P             0
N             0
log_OR        0
log_OR_95L    0
SE_OR         0
Z             0
dtype: int64

------------------------------

SNP           0
CHR           0
BP            0
A1            0
A2            0
FREQ          0
INFO          0
OR            0
OR_95L        0
OR_95U        0
P             0
N             0
log_OR        0
log_OR_95L    0
SE_OR         0
Z             0
dtype: int64


#### Checking for P=0:

In [16]:
%%bash

awk -v OFS='\t' 'NR == 1 || $11 == "0.0" || $11 == "0"' asthma_Z_logOR_onlyrs_uniq | head

SNP	CHR	BP	A1	A2	FREQ	INFO	OR	OR_95L	OR_95U	P	N	log_OR	log_OR_95L	SE_OR	Z


#### Rearranging columns:

In [1]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR>2 { print $1, $2, $3, $4, $5, $16, $11, $12, $6, $13, $15 }' asthma_Z_logOR_onlyrs_uniq > asthma_formatted
head asthma_formatted


SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
rs371890604	1	707522	G	C	-0.450934	0.65	393859	0.900387	-0.00501404	0.0111192
rs149887893	1	714596	T	C	0.634699	0.53	393859	0.966844	0.0117512	0.0185147
rs12184267	1	715265	C	T	1.18042	0.24	393859	0.963456	0.0199204	0.0168757
rs12184277	1	715367	A	G	1.07815	0.28	393859	0.963328	0.0181232	0.0168095
rs12184279	1	717485	C	A	1.2327	0.22	393859	0.963636	0.0208697	0.01693
rs144155419	1	717587	G	A	-0.41018	0.68	393859	0.983881	-0.0108421	0.0264325
rs116801199	1	720381	G	T	1.19997	0.23	393859	0.963014	0.0200738	0.0167287
rs12565286	1	721290	G	C	1.09084	0.28	393859	0.96291	0.0181835	0.0166693
rs2977670	1	723891	G	C	-1.55187	0.12	393859	0.049277	-0.0237197	0.0152846


#### Rearranging columns for ldsc:

In [17]:
%%bash

# To add a Z column, log(OR), log(OR_95L), SE of OR columns are needed:
awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "P", "N", "Z" } NR>2 { print $1, $2, $3, $4, $5, $6, $11, $12, $16 }' asthma_Z_logOR_onlyrs_uniq > ~/alzheimersproject/ldsc_formatted/asthma_ldsc_form


#### Rearranging columns for gsmr:

In [18]:
%%bash

# To add a Z column, log(OR), log(OR_95L), SE of OR columns are needed:
awk -v OFS='\t' 'BEGIN { print "SNP", "A1", "A2", "freq", "b", "se", "p", "N" } NR>2 { print $1, $4, $5, $6, $13, $15, $11, $12 }' asthma_Z_logOR_onlyrs_uniq > ~/alzheimersproject/gsmr_formatted/asthma_gsmr_form
