In [1]:
import pandas as pd
import argparse as args
import numpy as np

#%load_ext rpy2.ipython

# Low Density Lipoprotein (LDL) formatting:

#### Reading the data:

In [2]:
ldl = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/LDL_graham", 
                delim_whitespace=True, header=0, na_values='NA'))

ldl

Unnamed: 0,variant_id,chromosome,base_pair_location,other_allele,effect_allele,n,N_studies,effect_allele_frequency,beta,standard_error,p_value
0,rs367896724,1,10177,A,AC,7977,6,0.349000,-0.036944,0.027669,0.182
1,rs145072688,1,10352,T,TA,7977,6,0.409000,-0.025439,0.026996,0.346
2,rs534229142,1,10511,G,A,393514,2,0.001290,0.087901,0.053697,0.102
3,rs537182016,1,10539,C,A,15524,5,0.001130,-0.115317,0.179981,0.522
4,rs376342519,1,10616,C,CCGCCGTTGCAAAGGCGCGCCG,404744,8,0.005280,-0.019321,0.024885,0.438
...,...,...,...,...,...,...,...,...,...,...,...
47006478,rs562711702,22,51241101,A,T,392404,2,0.000154,-0.240122,0.178803,0.179
47006479,rs533336397,22,51241102,T,C,392404,2,0.000154,-0.240122,0.178803,0.179
47006480,rs7287738,22,51241285,T,G,401560,3,0.000511,0.214583,0.167206,0.199
47006481,rs568168135,22,51241386,C,G,415128,10,0.009190,-0.016995,0.019978,0.395


From the table above, Z is the only missing columns.

#### Adding an Z column:

In [3]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR>2 { print $1, $2, $3, $5, $4, $12 = ($9/$10), $11, $6, $8, $9, $10  }' ~/alzheimersproject/1_raw_data/LDL_graham > ldl_all_cols


#### Removing potential non-rsIDs in the SNP column:

In [4]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' ldl_all_cols > ldl_onlyrs

#### Removing possible duplicates in the file:

In [1]:
%%bash

awk '!seen[$1]++' ldl_onlyrs > ldl_formatted

#### Checking the file:

In [6]:
ldl = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/ldl_formatted", 
                delim_whitespace=True, header=0, na_values='NA'))

ldl

Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs145072688,1,10352,TA,T,-0.942311,0.346,7977,0.409000,-0.025439,0.026996
1,rs534229142,1,10511,A,G,1.636980,0.102,393514,0.001290,0.087901,0.053697
2,rs537182016,1,10539,A,C,-0.640718,0.522,15524,0.001130,-0.115317,0.179981
3,rs376342519,1,10616,CCGCCGTTGCAAAGGCGCGCCG,C,-0.776418,0.438,404744,0.005280,-0.019321,0.024885
4,rs558604819,1,10642,A,G,-0.655884,0.512,393543,0.000219,-0.087914,0.134039
...,...,...,...,...,...,...,...,...,...,...,...
44473869,rs562711702,22,51241101,T,A,-1.342940,0.179,392404,0.000154,-0.240122,0.178803
44473870,rs533336397,22,51241102,C,T,-1.342940,0.179,392404,0.000154,-0.240122,0.178803
44473871,rs7287738,22,51241285,G,T,1.283350,0.199,401560,0.000511,0.214583,0.167206
44473872,rs568168135,22,51241386,G,C,-0.850689,0.395,415128,0.009190,-0.016995,0.019978


#### Checking for NaN, NA, inf:

In [7]:
# 1. Checking for NaN:
print(ldl.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(ldl.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((ldl.isin([np.inf, -np.inf])).sum())

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


#### Checking for P=0:

In [2]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' ldl_formatted | head


SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE


#### Rearranging columns for ldsc:

In [9]:
%%bash
awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "P", "N", "Z" } NR!=1 { print $1, $2, $3, $4, $5, $9, $7, $8, $6 }' ldl_onlyrs_uniq > ~/alzheimersproject/ldsc_formatted/ldl_ldsc_form

#### Rearranging columns for gsmr:

In [10]:
%%bash
awk -v OFS='\t' 'BEGIN { print "SNP", "A1", "A2", "freq", "b", "se", "p", "N" } NR!=1 { print $1, $4, $5, $9, $10, $11, $7, $8 }' ldl_onlyrs_uniq > ~/alzheimersproject/gsmr_formatted/ldl_gsmr_form