In [1]:
import pandas as pd
import argparse as args
import numpy as np


# Depression formatting:

#### Reading data:

In [2]:
depression = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/depression_howard", 
                delim_whitespace=True, header=0, na_values='NA'))

depression

Unnamed: 0,MarkerName,A1,A2,Freq,LogOR,StdErrLogOR,P
0,rs2326918,a,g,0.8452,0.0106,0.0060,0.07561
1,rs7929618,c,g,0.1314,-0.0224,0.0064,0.00048
2,rs66941928,t,c,0.8031,0.0003,0.0055,0.95020
3,rs7190157,a,c,0.3517,0.0024,0.0045,0.59920
4,rs12364336,a,g,0.8685,0.0075,0.0064,0.24500
...,...,...,...,...,...,...,...
8483296,rs2414744,t,c,0.3555,0.0021,0.0046,0.64240
8483297,rs7262834,t,g,0.3896,0.0099,0.0044,0.02492
8483298,rs4140461,t,c,0.8505,0.0045,0.0061,0.46270
8483299,rs117241566,t,c,0.9387,-0.0096,0.0089,0.28410


Z, N, CHR, and BP columns are missing.

#### Adding Z and N column:

In [3]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "A1", "A2", "FREQ", "log_OR", "SE_OR", "P", "Z", "N" } NR>2 { print $1, toupper($2), toupper($3), $4, $5, $6, $7, $8 = ($5/$6), $9 = (807553) }' ~/alzheimersproject/1_raw_data/depression_howard > depression_Z_N


#### Adding CHR and BP:

In [4]:
%%bash

#Adding BP from reference to depression file based on rsID:
awk 'NR==FNR { FILE1[$3]=$2; next} ($1 in FILE1) {print FILE1[$1], $0}' ~/alzheimersproject/1_raw_data/hg19_ref depression_Z_N > depression_Z_N_BP

#Adding BP from reference to depression file:
awk -v OFS='\t' 'NR==FNR { FILE1[$3]=$1; next} ($2 in FILE1) {print FILE1[$2], $0}' ~/alzheimersproject/1_raw_data/hg19_ref depression_Z_N_BP > depression_Z_N_BP_CHR


#### Removing potential non-rsIDs in the SNP column:

In [10]:
%%bash

awk -v OFS='\t' 'NR == 1 || $3 ~ "rs"' depression_Z_N_BP_CHR > depression_onlyrs

#### Removing possible duplicates i the file:

In [11]:
%%bash

awk '!seen[$3]++' depression_onlyrs > depression_onlyrs_uniq

#### Adding column names again:

In [13]:
%%bash

awk -v OFS='\t' 'BEGIN { print "CHR", "BP", "SNP", "A1", "A2", "FREQ", "log_OR", "SE_OR", "P", "Z", "N" } NR>2 { print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11 }' depression_onlyrs_uniq > depression_all_cols


#### Checking if it worked:

In [14]:
depression = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/depression_all_cols", 
                delim_whitespace=True, header=0, na_values='NA'))

depression

Unnamed: 0,CHR,BP,SNP,A1,A2,FREQ,log_OR,SE_OR,P,Z,N
0,16,8600861,rs7190157,A,C,0.3517,0.0024,0.0045,0.59920,0.533333,807553
1,11,100009976,rs12364336,A,G,0.8685,0.0075,0.0064,0.24500,1.171880,807553
2,7,145771806,rs6977693,T,C,0.8544,0.0089,0.0061,0.14420,1.459020,807553
3,1,166367755,rs12562373,A,G,0.7495,0.0074,0.0050,0.13540,1.480000,807553
4,12,3840048,rs4766166,A,G,0.5951,0.0014,0.0044,0.74690,0.318182,807553
...,...,...,...,...,...,...,...,...,...,...,...
8481284,15,62030949,rs2414744,T,C,0.3555,0.0021,0.0046,0.64240,0.456522,807553
8481285,20,33400913,rs7262834,T,G,0.3896,0.0099,0.0044,0.02492,2.250000,807553
8481286,1,84241498,rs4140461,T,C,0.8505,0.0045,0.0061,0.46270,0.737705,807553
8481287,12,44074208,rs117241566,T,C,0.9387,-0.0096,0.0089,0.28410,-1.078650,807553


#### Checking for NaN, NA, inf:

In [15]:
# 1. Checking for NaN:
print(depression.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(depression.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((depression.isin([np.inf, -np.inf])).sum())

CHR       0
BP        0
SNP       0
A1        0
A2        0
FREQ      0
log_OR    0
SE_OR     0
P         0
Z         0
N         0
dtype: int64

------------------------------

CHR       0
BP        0
SNP       0
A1        0
A2        0
FREQ      0
log_OR    0
SE_OR     0
P         0
Z         0
N         0
dtype: int64

------------------------------

CHR       0
BP        0
SNP       0
A1        0
A2        0
FREQ      0
log_OR    0
SE_OR     0
P         0
Z         0
N         0
dtype: int64


#### Checking for P=0:

In [16]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $9 == "0.0" || $9 == "0"' depression_all_cols | head

CHR	BP	SNP	A1	A2	FREQ	log_OR	SE_OR	P	Z	N


#### Rearranging columns:

In [1]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR>2 { print $3, $1, $2, $4, $5, $10, $9, $11, $6, $7, $8 }' depression_all_cols > depression_formatted