In [1]:
import pandas as pd
import argparse as args
import numpy as np

#%load_ext rpy2.ipython

# Body Mass Index (BMI) formatting:

#### Reading data:

In [9]:
# Load the raw BMI summary statistics: whitespace-delimited text with the
# column names on the first line; the literal string "NA" is read as missing.
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is
# needed, and sep=r"\s+" is the non-deprecated equivalent of
# delim_whitespace=True.
bmi = pd.read_csv("~/alzheimersproject/1_raw_data/bmi_pulit",
                  sep=r"\s+", header=0, na_values='NA')

bmi

Unnamed: 0,CHR,BP,SNP,A1,A2,Freq_Tested_Allele,BETA,SE,P,N,INFO
0,1.0,54353.0,rs140052487:C:A,A,C,0.0006,-0.1194,0.0638,0.061310,484680.0,0.339663
1,1.0,54564.0,rs558796213:G:T,T,G,0.0005,0.0626,0.0527,0.234200,484680.0,0.705411
2,1.0,54591.0,rs561234294:A:G,A,G,0.9999,0.0472,0.1430,0.741500,484680.0,0.317431
3,1.0,54676.0,rs2462492:C:T,T,C,0.3941,0.0022,0.0033,0.507800,484680.0,0.340158
4,1.0,54763.0,rs548455890:T:G,T,G,0.9999,-0.0310,0.1073,0.772600,484680.0,0.667992
...,...,...,...,...,...,...,...,...,...,...,...
27381297,,,rs9923231,A,G,0.0000,-0.0203,0.0051,0.000069,,
27381298,,,rs9926577,A,G,0.0000,-0.0069,0.0057,0.226100,,
27381299,,,rs9927770,T,C,0.1667,-0.0022,0.0056,0.694400,,
27381300,,,rs9937553,T,C,0.7417,0.0070,0.0049,0.153100,,


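Of the columns needed downstream, only Z is missing; it is computed below as BETA/SE. Note also that the last rows of the table have no CHR, BP, N, or INFO values.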

#### Removing :A1:A2 from the SNP column:

In [2]:
%%bash

# Strip the first ":<allele>:<allele>" suffix on each line, turning IDs such
# as rs140052487:C:A into rs140052487. Alleles may be longer than one base
# (indels), so each allele is matched with [A-Z]+ rather than a single letter;
# a single-letter pattern would leave IDs like rs123:AT:A untouched.
sed -E 's/:[A-Z]+:[A-Z]+//' ~/alzheimersproject/1_raw_data/bmi_pulit > bmi_rs

#### Removing potential non-rsIDs in the SNP column:

In [3]:
%%bash

# Keep the header line plus every row whose SNP column (field 3) starts with
# "rs". The pattern is anchored so that IDs merely containing "rs" somewhere
# in the middle are not kept. Rows are printed unmodified, so no OFS is set.
awk 'NR == 1 || $3 ~ /^rs/' bmi_rs > bmi_onlyrs

#### Removing possible duplicates in the file:

In [8]:
%%bash

# Keep only the first row seen for each distinct value of field 3 (the SNP
# column); any later row repeating that value is dropped. The header line is
# kept because its field-3 text is seen first.
awk '!($3 in seen) { seen[$3] = 1; print }' bmi_onlyrs > bmi_onlyrs_uniq

#### Adding CHR, BP, and N to BMI data when NaN values are present:

In [9]:
%%bash

# Repair rows of bmi_onlyrs_uniq that have "NA" in CHR, BP or N by looking the
# SNP up in the hg19 reference file, then append the repaired rows to the rows
# that were already complete.
# Field positions in bmi_onlyrs_uniq (per the header shown above):
#   $1=CHR $2=BP $3=SNP $4=A1 $5=A2 $6=Freq_Tested_Allele $7=BETA $8=SE $9=P $10=N $11=INFO

# 1. Extracting rows where CHR, BP or N columns contain NAs:
#    (the header line contains no "NA", so it is not part of this subset)
awk '($2=="NA" || $1=="NA" || $10=="NA")' bmi_onlyrs_uniq > bmi_onlyrs_uniq_NA

# 2. Adding CHR from reference to bmi file based on rsID:
#    First pass (NR==FNR) builds a lookup from field 3 to field 1 of hg19_ref
#    (assumed to be rsID -> CHR; confirm against the reference file layout).
#    Second pass prepends the looked-up value to each NA row, so every original
#    field shifts right by one ($4 is now SNP). Rows whose SNP is absent from
#    the reference are silently dropped here.
awk 'NR==FNR { FILE1[$3]=$1; next} ($3 in FILE1) {print FILE1[$3], $0}' ~/alzheimersproject/1_raw_data/hg19_ref bmi_onlyrs_uniq_NA > bmi_onlyrs_uniq_chr

# 3. Adding BP from reference to bmi file:
#    Same lookup pattern, now field 3 -> field 2 of hg19_ref (assumed rsID -> BP),
#    keyed on $4 because SNP moved right in step 2. After this step:
#    $1=BP(ref) $2=CHR(ref) $3=CHR(orig) $4=BP(orig) $5=SNP $6=A1 $7=A2
#    $8=Freq $9=BETA $10=SE $11=P $12=N $13=INFO
awk -v OFS='\t' 'NR==FNR { FILE1[$3]=$2; next} ($4 in FILE1) {print FILE1[$4], $0}' ~/alzheimersproject/1_raw_data/hg19_ref bmi_onlyrs_uniq_chr > bmi_onlyrs_uniq_chr_bp

# 4. Manually setting N column to sample size instead of NA:
#    $12 is N after the two prepended columns. The value is written to every
#    row in this file, including rows whose N was not "NA".
#    NOTE(review): 806834 is a hard-coded constant; confirm its source.
awk '{$12 = 806834; print}' bmi_onlyrs_uniq_chr_bp > bmi_onlyrs_uniq_chr_bp_n

# 4A. Rearranging columns:
#     Emits reference CHR and BP first and skips the original CHR/BP fields
#     ($3, $4), restoring the 11-column order of bmi_onlyrs_uniq (tab-separated).
awk -v OFS='\t' '{ print $2, $1, $5, $6, $7, $8, $9, $10, $11, $12, $13}' bmi_onlyrs_uniq_chr_bp_n > bmi_onlyrs_uniq_chr_bp_n_cols

# 5. Removing rows that contain NA in CHR, BP, and N column in bmi file:
#    (exact complement of the filter in step 1; the header line is kept here)
awk '($2!="NA" && $1!="NA" && $10!="NA")' bmi_onlyrs_uniq > bmi_onlyrs_uniq_noNA

# 6. Adding the rows that now contain CHR, BP and N column:
#    Complete rows (with header) come first, repaired rows are appended at the
#    end. The two inputs may use different whitespace delimiters; the later
#    cells split on any whitespace when reading this file.
cat bmi_onlyrs_uniq_noNA bmi_onlyrs_uniq_chr_bp_n_cols > bmi_uniq_all_cols

In [13]:
%%bash

# Report the number of lines in each intermediate file of the NA-repair step,
# followed by the grand total that wc prints for multiple files.
wc -l \
    bmi_onlyrs_uniq_NA \
    bmi_onlyrs_uniq_chr \
    bmi_onlyrs_uniq_chr_bp \
    bmi_onlyrs_uniq_chr_bp_n \
    bmi_onlyrs_uniq_noNA \
    bmi_uniq_all_cols

       604 bmi_onlyrs_uniq_NA
       368 bmi_onlyrs_uniq_chr
       368 bmi_onlyrs_uniq_chr_bp
       368 bmi_onlyrs_uniq_chr_bp_n
  27301333 bmi_onlyrs_uniq_noNA
  27301701 bmi_uniq_all_cols
  54604742 total


#### Checking if it worked:

In [14]:
# Reload the merged file to inspect the result of the NA-repair step. The file
# is whitespace-delimited with the column names on the first line; the literal
# string "NA" is read as missing.
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is
# needed, and sep=r"\s+" is the non-deprecated equivalent of
# delim_whitespace=True.
bmi = pd.read_csv("~/alzheimersproject/2_formatting/bmi_uniq_all_cols",
                  sep=r"\s+", header=0, na_values='NA')
bmi

Unnamed: 0,CHR,BP,SNP,A1,A2,Freq_Tested_Allele,BETA,SE,P,N,INFO
0,1,54353,rs140052487,A,C,0.0006,-0.1194,0.0638,0.06131,484680.0,0.339663
1,1,54564,rs558796213,T,G,0.0005,0.0626,0.0527,0.23420,484680.0,0.705411
2,1,54591,rs561234294,A,G,0.9999,0.0472,0.1430,0.74150,484680.0,0.317431
3,1,54676,rs2462492,T,C,0.3941,0.0022,0.0033,0.50780,484680.0,0.340158
4,1,54763,rs548455890,T,G,0.9999,-0.0310,0.1073,0.77260,484680.0,0.667992
...,...,...,...,...,...,...,...,...,...,...,...
27301695,17,68313649,rs9890823,A,C,0.9917,-0.0118,0.0288,0.68200,806834.0,
27301696,17,21565133,rs9907248,T,G,0.2667,0.0030,0.0070,0.66820,806834.0,
27301697,16,7864532,rs9927770,T,C,0.1667,-0.0022,0.0056,0.69440,806834.0,
27301698,16,23248436,rs9937553,T,C,0.7417,0.0070,0.0049,0.15310,806834.0,


#### Checking for NaN, NA, and inf.:

In [15]:
# Per-column sanity checks on `bmi`: missing values and infinite values.
divider = '\n------------------------------\n'

# 1. Count of missing values (NaN) in each column:
missing_per_column = bmi.isnull().sum()
print(missing_per_column)

print(divider)

# 2. Same count via isna(); pandas documents isnull as an alias of isna,
#    so this is expected to match step 1:
na_per_column = bmi.isna().sum()
print(na_per_column)

print(divider)

# 3. Count of +inf / -inf entries in each column:
infinite_values = [np.inf, -np.inf]
inf_per_column = bmi.isin(infinite_values).sum()
print(inf_per_column)


CHR                       0
BP                        0
SNP                       0
A1                        0
A2                        0
Freq_Tested_Allele        0
BETA                      0
SE                        0
P                         0
N                         0
INFO                  22375
dtype: int64

------------------------------

CHR                       0
BP                        0
SNP                       0
A1                        0
A2                        0
Freq_Tested_Allele        0
BETA                      0
SE                        0
P                         0
N                         0
INFO                  22375
dtype: int64

------------------------------

CHR                   0
BP                    0
SNP                   0
A1                    0
A2                    0
Freq_Tested_Allele    0
BETA                  0
SE                    0
P                     0
N                     0
INFO                  0
dtype: int64


#### Checking for P=0:

In [16]:
%%bash

# Show the header plus the first rows whose P value (field 9) is numerically
# zero. The field must look like a number and evaluate to 0, so every spelling
# of zero (0, 0.0, 0.00, 0e+00, ...) is caught, not only the literal strings
# "0" and "0.0". Non-numeric text such as "NA" is not reported.
# Rows are printed unmodified, so no OFS is set.
awk 'NR == 1 || ($9 ~ /^[0-9.eE+-]+$/ && $9 + 0 == 0)' bmi_uniq_all_cols | head

CHR BP SNP A1 A2 Freq_Tested_Allele BETA SE P N INFO
16 53799507 rs9937053 A G 0.4372 0.0721 0.0016 0 805790 0.999538
16 53799905 rs9928094 A G 0.5634 -0.0719 0.0017 0 805753 0.999458
16 53799977 rs9930333 T G 0.5618 -0.0719 0.0017 0 789701 0.999781
16 53800568 rs9939973 A G 0.4367 0.072 0.0016 0 805864 0.999897
16 53800629 rs9940646 C G 0.5618 -0.0719 0.0017 0 805568 0.999902
16 53800754 rs9940128 A G 0.4367 0.072 0.0016 0 805661 1
16 53800954 rs1421085 T C 0.59 -0.0751 0.0016 0 802862 1
16 53801549 rs9923147 T C 0.4366 0.072 0.0016 0 805825 0.999882
16 53801985 rs9923544 T C 0.4367 0.072 0.0016 0 805792 0.999786


#### Setting P=0 to 1e-269:

In [1]:
%%bash

# Replace every P value (field 9) that is numerically zero with 1e-269 and
# write the result to bmi_uniq_all_cols_p. The test is numeric, so "0.0" and
# other spellings of zero are replaced too, not only the literal string "0".
# The header line is skipped (NR > 1), and the field must look like a number,
# so text such as "NA" is left alone.
# Only rows that are modified are re-joined with tabs; all other rows pass
# through unchanged (the trailing `1` prints every line).
awk -v OFS='\t' 'NR > 1 && $9 ~ /^[0-9.eE+-]+$/ && $9 + 0 == 0 { $9 = "1e-269" } 1' bmi_uniq_all_cols > bmi_uniq_all_cols_p
head bmi_uniq_all_cols_p

CHR BP SNP A1 A2 Freq_Tested_Allele BETA SE P N INFO
1 54353 rs140052487 A C 6e-04 -0.1194 0.0638 0.06131 484680 0.339663
1 54564 rs558796213 T G 5e-04 0.0626 0.0527 0.2342 484680 0.705411
1 54591 rs561234294 A G 0.9999 0.0472 0.143 0.7415 484680 0.317431
1 54676 rs2462492 T C 0.3941 0.0022 0.0033 0.5078 484680 0.340158
1 54763 rs548455890 T G 0.9999 -0.031 0.1073 0.7726 484680 0.667992
1 55326 rs3107975 T C 0.991 -9e-04 0.0177 0.9613 484680 0.324228
1 55351 rs531766459 A T 5e-04 0.0708 0.0562 0.2079 484680 0.605186
1 55405 rs372455836 T C 3e-04 -0.2055 0.0899 0.02226 484680 0.36667
1 55416 rs193242050 A G 3e-04 0.0721 0.0843 0.3927 484680 0.465027


#### Adding Z column:

In [1]:
%%bash
# Build bmi_formatted from bmi_uniq_all_cols_p:
#   input fields : $1=CHR $2=BP $3=SNP $4=A1 $5=A2 $6=Freq $7=BETA $8=SE $9=P $10=N $11=INFO
#   output fields: SNP CHR BP A1 A2 Z P N FREQ BETA SE (tab-separated, new header)
# Z is computed as BETA / SE. CHR, BP and N are truncated to integers with int().
# The input header line is skipped (NR != 1) and replaced by the BEGIN header.
# A row whose SE is zero (or not a number) gets Z = "NA" instead of aborting
# awk with a division-by-zero error; the number of such rows is reported on
# stderr.
awk -v OFS='\t' '
BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" }
NR != 1 {
    if ($8 + 0 == 0) { z = "NA"; bad_se++ } else { z = $7 / $8 }
    print $3, int($1), int($2), $4, $5, z, $9, int($10), $6, $7, $8
}
END { if (bad_se > 0) print "warning: " bad_se " rows with SE == 0; Z set to NA" > "/dev/stderr" }
' bmi_uniq_all_cols_p > bmi_formatted
head bmi_formatted

SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
rs140052487	1	54353	A	C	-1.87147	0.06131	484680	6e-04	-0.1194	0.0638
rs558796213	1	54564	T	G	1.18786	0.2342	484680	5e-04	0.0626	0.0527
rs561234294	1	54591	A	G	0.33007	0.7415	484680	0.9999	0.0472	0.143
rs2462492	1	54676	T	C	0.666667	0.5078	484680	0.3941	0.0022	0.0033
rs548455890	1	54763	T	G	-0.28891	0.7726	484680	0.9999	-0.031	0.1073
rs3107975	1	55326	T	C	-0.0508475	0.9613	484680	0.991	-9e-04	0.0177
rs531766459	1	55351	A	T	1.25979	0.2079	484680	5e-04	0.0708	0.0562
rs372455836	1	55405	T	C	-2.28587	0.02226	484680	3e-04	-0.2055	0.0899
rs193242050	1	55416	A	G	0.855279	0.3927	484680	3e-04	0.0721	0.0843


#### Rearranging columns for ldsc:

In [2]:
%%bash
# Write the ldsc input file with columns SNP CHR BP A1 A2 FREQ P N Z BETA SE.
# The input is bmi_formatted, the file written by the "Adding Z column" cell,
# whose fields are:
#   $1=SNP $2=CHR $3=BP $4=A1 $5=A2 $6=Z $7=P $8=N $9=FREQ $10=BETA $11=SE
# This cell previously read a file named bmi_form, which no cell in this
# notebook creates; reading bmi_formatted lets the notebook run from a clean
# state. The field indices below are mapped to the bmi_formatted layout.
# NOTE(review): if bmi_form was an intentionally different file, confirm that
# bmi_formatted is the right replacement.
awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "P", "N", "Z", "BETA", "SE" } NR!=1 { print $1, $2, $3, $4, $5, $9, $7, $8, $6, $10, $11 }' bmi_formatted > ~/alzheimersproject/ldsc_formatted/bmi_ldsc_form

#### Rearranging columns for gsmr:

In [3]:
%%bash
# Write the gsmr input file with columns SNP A1 A2 freq b se p N.
# The input is bmi_formatted, the file written by the "Adding Z column" cell,
# whose fields are:
#   $1=SNP $2=CHR $3=BP $4=A1 $5=A2 $6=Z $7=P $8=N $9=FREQ $10=BETA $11=SE
# This cell previously read a file named bmi_form, which no cell in this
# notebook creates; reading bmi_formatted lets the notebook run from a clean
# state. The field indices below are mapped to the bmi_formatted layout.
# NOTE(review): if bmi_form was an intentionally different file, confirm that
# bmi_formatted is the right replacement.
awk -v OFS='\t' 'BEGIN { print "SNP", "A1", "A2", "freq", "b", "se", "p", "N" } NR!=1 { print $1, $4, $5, $9, $10, $11, $7, $8 }' bmi_formatted > ~/alzheimersproject/gsmr_formatted/bmi_gsmr_form