In [1]:
import pandas as pd
import argparse as args
import numpy as np

#%load_ext rpy2.ipython

# Type 2 Diabetes (T2D) formatting:

In [2]:
t2d = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/T2D_mahajan", 
                delim_whitespace=True, header=0, na_values='NA'))

t2d

Unnamed: 0,chromosome(b37),position(b37),chrposID,rsID,effect_allele,other_allele,effect_allele_frequency,Fixed-effects_beta,Fixed-effects_SE,Fixed-effects_p-value
0,1,693731,chr1:693731,rs12238997,a,g,0.8747,0.0044,0.0119,0.71240
1,1,706368,chr1:706368,rs55727773,a,g,0.4964,0.0118,0.0099,0.23550
2,1,721290,chr1:721290,rs12565286,c,g,0.0359,-0.0176,0.0262,0.50130
3,1,729679,chr1:729679,rs4951859,c,g,0.1656,-0.0043,0.0105,0.68130
4,1,730087,chr1:730087,rs148120343,t,c,0.9409,0.0001,0.0172,0.99540
...,...,...,...,...,...,...,...,...,...,...
10454870,22,51221731,chr22:51221731,rs115055839,t,c,0.9288,-0.0032,0.0142,0.82220
10454871,22,51222100,chr22:51222100,rs114553188,t,g,0.0570,0.0100,0.0151,0.50720
10454872,22,51229805,chr22:51229805,rs9616985,t,c,0.9289,-0.0011,0.0143,0.93890
10454873,22,51234048,chr22:51234048,rs141330630,t,c,0.9975,-0.2358,0.1322,0.07456


From the table above, Z and N columns are the only missing columns. 

#### Adding an N and Z column:

In [3]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE"  } NR>2 { print $4, $1, $2, toupper($5), toupper($6), $12 = ($8/$9), $10, $11 = (936700), $7, $8, $9 }' ~/alzheimersproject/1_raw_data/T2D_mahajan > t2d_all_cols


#### Removing potential non-rsIDs in the SNP column:

In [4]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' t2d_all_cols > t2d_onlyrs

#### Removing possible duplicates in the file:

In [1]:
%%bash

awk '!seen[$1]++' t2d_onlyrs > t2d_formatted

#### Checking the file:

In [6]:
t2d = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/t2d_formatted", 
                delim_whitespace=True, header=0, na_values='NA'))

t2d

Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs55727773,1,706368,A,G,1.191920,0.23550,936700,0.4964,0.0118,0.0099
1,rs12565286,1,721290,C,G,-0.671756,0.50130,936700,0.0359,-0.0176,0.0262
2,rs4951859,1,729679,C,G,-0.409524,0.68130,936700,0.1656,-0.0043,0.0105
3,rs148120343,1,730087,T,C,0.005814,0.99540,936700,0.9409,0.0001,0.0172
4,rs142557973,1,731718,T,C,0.792793,0.42780,936700,0.8694,0.0088,0.0111
...,...,...,...,...,...,...,...,...,...,...,...
10454795,rs115055839,22,51221731,T,C,-0.225352,0.82220,936700,0.9288,-0.0032,0.0142
10454796,rs114553188,22,51222100,T,G,0.662252,0.50720,936700,0.0570,0.0100,0.0151
10454797,rs9616985,22,51229805,T,C,-0.076923,0.93890,936700,0.9289,-0.0011,0.0143
10454798,rs141330630,22,51234048,T,C,-1.783660,0.07456,936700,0.9975,-0.2358,0.1322


#### Checking for NaN, NA, inf:

In [7]:
# 1. Checking for NaN:
print(t2d.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(t2d.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((t2d.isin([np.inf, -np.inf])).sum())

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


#### Checking P=0:

In [2]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' t2d_formatted | head


SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE


#### Rearranging columns for ldsc:

In [9]:
%%bash
awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "FREQ", "P", "N", "Z" } NR!=1 { print $1, $2, $3, $4, $5, $9, $7, $8, $6 }' t2d_onlyrs_uniq > ~/alzheimersproject/ldsc_formatted/t2d_ldsc_form

#### Rearranging columns for gsmr:

In [10]:
%%bash
awk -v OFS='\t' 'BEGIN { print "SNP", "A1", "A2", "freq", "b", "se", "p", "N" } NR!=1 { print $1, $4, $5, $9, $10, $11, $7, $8 }' t2d_onlyrs_uniq > ~/alzheimersproject/gsmr_formatted/t2d_gsmr_form