In [1]:
import pandas as pd
import argparse as args
import numpy as np

#%load_ext rpy2.ipython

# Hypertension formatting:

#### Reading the data:

In [2]:
# Load the raw hypertension GWAS summary statistics.
# The file is whitespace-delimited, its first line is the header, and the
# literal string "NA" marks a missing value.
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is
# needed; sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True.
hypertension = pd.read_csv(
    "~/alzheimersproject/1_raw_data/hypertension_zhu",
    sep=r"\s+",
    header=0,
    na_values="NA",
)

hypertension

Unnamed: 0,SNP,CHR,BP,A1,A2,FREQ,HWEP,INFO,BETA,SE,P
0,11:1373806_C_A,11,1373806,C,A,0.02781,0.043880,0.894853,0.008090,0.011217,0.48
1,11:1686076_A_G,11,1686076,A,G,0.11230,0.331000,0.956264,0.007756,0.005943,0.32
2,11:26315102_A_G,11,26315102,A,G,0.01282,0.013860,0.883669,0.003526,0.015519,0.86
3,11:402546_A_C,11,402546,A,C,0.11530,0.004139,0.837212,0.006674,0.005884,0.21
4,11:43891845_G_T,11,43891845,G,T,0.03922,0.952800,0.816099,-0.009298,0.009063,0.33
...,...,...,...,...,...,...,...,...,...,...,...
5265183,rs9999981,4,139575905,A,G,0.38520,0.037470,0.976159,0.001749,0.003908,0.74
5265184,rs9999982,4,122776933,G,A,0.27180,0.322000,0.996441,0.004001,0.004224,0.34
5265185,rs9999987,4,4936161,C,T,0.04687,0.000092,0.958540,-0.008885,0.008824,0.30
5265186,rs9999992,4,122902084,G,A,0.08992,0.077970,0.843684,0.001482,0.006331,0.78


It seems that only the N and Z columns are missing.

#### Checking for NaN, NA, inf:

In [4]:
# Per-column counts of missing and infinite values.
# Note: DataFrame.isnull() and DataFrame.isna() are aliases, so the first two
# reports are identical by construction.
SEPARATOR = '\n------------------------------\n'

nan_counts = hypertension.isnull().sum()                   # 1. NaN
na_counts = hypertension.isna().sum()                      # 2. NA
inf_counts = hypertension.isin([np.inf, -np.inf]).sum()    # 3. +/- inf

reports = (nan_counts, na_counts, inf_counts)
print(reports[0])
for counts in reports[1:]:
    print(SEPARATOR)
    print(counts)

SNP     0
CHR     0
BP      0
A1      0
A2      0
FREQ    0
HWEP    0
INFO    0
BETA    0
SE      2
P       0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
FREQ    0
HWEP    0
INFO    0
BETA    0
SE      2
P       0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
FREQ    0
HWEP    0
INFO    0
BETA    0
SE      0
P       0
dtype: int64


#### Checking for SE=nan and P=0:

In [5]:
%%bash

# Rows whose SE (column 10) is missing, written as "nan" or "NaN".
# The header line (NR == 1) is always printed for context.
awk -v OFS='\t' 'NR == 1 || $10 == "nan" || $10 == "NaN"' ~/alzheimersproject/1_raw_data/hypertension_zhu | more

echo '--------------------------------'

# Rows whose P (column 11) is zero.
# P is stored in scientific notation (e.g. 1.0E+00), so a zero would appear as
# something like 0.0E+00; compare numerically instead of against the literal
# strings "0" / "0.0". The regex guard keeps non-numeric tokens from being
# coerced to 0 and reported as false positives.
awk -v OFS='\t' 'NR == 1 || ($11 ~ /^[-+]?[0-9.]+([eE][-+]?[0-9]+)?$/ && $11 + 0 == 0)' ~/alzheimersproject/1_raw_data/hypertension_zhu | more

SNP	CHR	BP	A1	A2	FREQ	HWEP	INFO	BETA	SE	P
rs4673080	2	224840096	A	C	0.4404	1	0.85667	3.99785e+10	nan	1.0E+00
rs4697790	4	10841021	A	C	0.4732	1	0.860489	1.94924e+10	nan	1.0E+00
--------------------------------
SNP	CHR	BP	A1	A2	FREQ	HWEP	INFO	BETA	SE	P


#### Deleting variants with SE=nan:

In [6]:
%%bash

# Drop every row whose SE (column 10) is nan, in any letter case, so the filter
# agrees with the check that looks for both "nan" and "NaN".
# The header survives because tolower("SE") is not "nan". No field is modified,
# so each kept line is written exactly as it appears in the input.
awk -v OFS='\t' 'tolower($10) != "nan"' ~/alzheimersproject/1_raw_data/hypertension_zhu > hypertension_nonan

#### Adding N and Z columns:

In [7]:
%%bash

# Add the Z and N columns and write a tab-separated file with a new header.
# Input columns (hypertension_nonan):
#   1 SNP, 2 CHR, 3 BP, 4 A1, 5 A2, 6 FREQ, 7 HWEP, 8 INFO, 9 BETA, 10 SE, 11 P
# Z is computed as BETA / SE; N is the constant sample size passed in below.
# The input has exactly one header line, so data starts at NR == 2: use
# NR > 1 (NR > 2 would silently drop the first variant).
awk -v OFS='\t' -v N=458554 '
    BEGIN  { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" }
    NR > 1 { print $1, $2, $3, $4, $5, $9 / $10, $11, N, $6, $9, $10 }
' hypertension_nonan > hypertension_all_cols


#### Removing potential non-rsIDs in the SNP column:

In [8]:
%%bash

# Keep the header plus rows whose SNP ID (column 1) starts with "rs".
# The pattern is anchored so that IDs merely containing "rs" somewhere in the
# middle are not mistaken for rsIDs.
awk -v OFS='\t' 'NR == 1 || $1 ~ /^rs/' hypertension_all_cols > hypertension_onlyrs

#### Removing possible duplicates in the file:

In [1]:
%%bash

# Keep only the first line seen for each value of column 1 (the SNP ID);
# any later line with the same ID is dropped.
awk '!($1 in first_seen) { first_seen[$1] = 1; print }' hypertension_onlyrs > hypertension_formatted

In [1]:
%%bash

# Preview the header and the first nine variants of the formatted file.
head -n 10 hypertension_formatted

SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
rs10	7	92383888	A	C	0.79532	3.4E-01	458554	0.05004	0.00657193	0.00826325
rs1000000	12	126890980	G	A	1.80028	5.6E-02	458554	0.2237	0.00810902	0.0045043
rs10000003	4	57561647	A	G	-0.0680976	9.5E-01	458554	0.294	-0.000280695	0.00412195
rs10000010	4	21618674	T	C	0.509005	5.9E-01	458554	0.4866	0.00193139	0.00379444
rs10000012	4	1357325	C	G	0.395443	7.3E-01	458554	0.1405	0.00214978	0.00543638
rs10000013	4	37225069	C	A	-0.793294	4.1E-01	458554	0.2113	-0.00364394	0.00459343
rs10000014	4	178447148	C	T	-0.691843	6.1E-01	458554	0.3497	-0.00272813	0.00394328
rs10000017	4	84778125	C	T	-1.66339	7.4E-02	458554	0.2137	-0.00757298	0.00455273
rs10000018	4	100458448	A	G	-0.623255	4.8E-01	458554	0.3007	-0.00257929	0.00413842


#### Rearranging columns for ldsc:

In [11]:
%%bash
# Reorder columns for ldsc.
# Input columns: 1 SNP, 2 CHR, 3 BP, 4 A1, 5 A2, 6 Z, 7 P, 8 N, 9 FREQ, 10 BETA, 11 SE
# Read hypertension_formatted, the de-duplicated file written by the
# duplicate-removal step; no cell in this notebook creates
# hypertension_onlyrs_uniq, so reading it fails (or uses a stale file) on a
# fresh run.
awk -v OFS='\t' '
    BEGIN   { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "P", "N", "Z" }
    NR != 1 { print $1, $2, $3, $4, $5, $9, $7, $8, $6 }
' hypertension_formatted > ~/alzheimersproject/ldsc_formatted/hypertension_ldsc_form

#### Rearranging columns for gsmr:

In [12]:
%%bash
# Reorder columns for gsmr.
# Input columns: 1 SNP, 2 CHR, 3 BP, 4 A1, 5 A2, 6 Z, 7 P, 8 N, 9 FREQ, 10 BETA, 11 SE
# Read hypertension_formatted, the de-duplicated file written by the
# duplicate-removal step; no cell in this notebook creates
# hypertension_onlyrs_uniq, so reading it fails (or uses a stale file) on a
# fresh run.
awk -v OFS='\t' '
    BEGIN   { print "SNP", "A1", "A2", "freq", "b", "se", "p", "N" }
    NR != 1 { print $1, $4, $5, $9, $10, $11, $7, $8 }
' hypertension_formatted > ~/alzheimersproject/gsmr_formatted/hypertension_gsmr_form