In [1]:
import pandas as pd
import argparse as args
import numpy as np

#%load_ext rpy2.ipython

# High Density Lipoprotein (HDL) formatting:

#### Reading the data:

In [2]:
hdl = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/HDL_graham", 
                delim_whitespace=True, header=0, na_values='NA'))

hdl

Unnamed: 0,rsID,CHROM,POS_b37,REF,ALT,N,N_studies,POOLED_ALT_AF,EFFECT_SIZE,SE,pvalue_neg_log10,pvalue,pvalue_neg_log10_GC,pvalue_GC
0,rs367896724,1,10177,A,AC,8160,5,0.349000,0.001989,0.027490,0.025804,0.942,0.019549,0.956
1,rs145072688,1,10352,T,TA,8160,5,0.410000,-0.010981,0.026816,0.166092,0.682,0.122170,0.755
2,rs534229142,1,10511,G,A,361237,2,0.001300,0.007658,0.055868,0.050135,0.891,0.037757,0.917
3,rs537182016,1,10539,C,A,16026,5,0.001150,-0.149708,0.175335,0.405393,0.393,0.288296,0.515
4,rs376342519,1,10616,C,CCGCCGTTGCAAAGGCGCGCCG,372627,7,0.005320,0.022585,0.025871,0.417185,0.383,0.296288,0.505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46150903,rs562711702,22,51241101,A,T,360129,2,0.000153,0.092548,0.185569,0.209032,0.618,0.152645,0.704
46150904,rs533336397,22,51241102,T,C,360129,2,0.000153,0.092548,0.185569,0.209032,0.618,0.152645,0.704
46150905,rs7287738,22,51241285,T,G,369744,3,0.000514,-0.086517,0.169284,0.215170,0.609,0.156973,0.697
46150906,rs568168135,22,51241386,C,G,383118,9,0.009140,0.027518,0.020777,0.731987,0.185,0.505293,0.312


From the table above, Z is the only missing column.

#### Adding an N column:

In [3]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR>2 { print $1, $2, $3, $5, $4, $15 = ($9/$10), $12, $6, $8, $9, $10 }' ~/alzheimersproject/1_raw_data/HDL_graham > hdl_all_cols


#### Removing potential non-rsIDs in the SNP column:

In [4]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' hdl_all_cols > hdl_onlyrs

#### Removing possible duplicates i the file:

In [1]:
%%bash

awk '!seen[$1]++' hdl_onlyrs > hdl_formatted

#### Checking the file:

In [6]:
hdl = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/hdl_formatted", 
                delim_whitespace=True, header=0, na_values='NA'))

hdl

Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs145072688,1,10352,TA,T,-0.409471,0.682,8160,0.410000,-0.010981,0.026816
1,rs534229142,1,10511,A,G,0.137071,0.891,361237,0.001300,0.007658,0.055868
2,rs537182016,1,10539,A,C,-0.853840,0.393,16026,0.001150,-0.149708,0.175335
3,rs376342519,1,10616,CCGCCGTTGCAAAGGCGCGCCG,C,0.873003,0.383,372627,0.005320,0.022585,0.025871
4,rs558604819,1,10642,A,G,1.043700,0.297,361266,0.000216,0.148159,0.141956
...,...,...,...,...,...,...,...,...,...,...,...
44562532,rs562711702,22,51241101,T,A,0.498728,0.618,360129,0.000153,0.092548,0.185569
44562533,rs533336397,22,51241102,C,T,0.498728,0.618,360129,0.000153,0.092548,0.185569
44562534,rs7287738,22,51241285,G,T,-0.511075,0.609,369744,0.000514,-0.086517,0.169284
44562535,rs568168135,22,51241386,G,C,1.324430,0.185,383118,0.009140,0.027518,0.020777


#### Checking for NaN, NA, inf:

In [7]:
# 1. Checking for NaN:
print(hdl.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(hdl.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((hdl.isin([np.inf, -np.inf])).sum())

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


In [2]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' hdl_formatted | head

SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE


#### Rearranging columns for ldsc:

In [9]:
%%bash
awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "P", "N", "Z" } NR!=1 { print $1, $2, $3, $4, $5, $9, $7, $8, $6 }' hdl_onlyrs_uniq > ~/alzheimersproject/ldsc_formatted/hdl_ldsc_form

#### Rearranging columns for gsmr:

In [10]:
%%bash
awk -v OFS='\t' 'BEGIN { print "SNP", "A1", "A2", "freq", "b", "se", "p", "N" } NR!=1 { print $1, $4, $5, $9, $10, $11, $7, $8 }' hdl_onlyrs_uniq > ~/alzheimersproject/gsmr_formatted/hdl_gsmr_form