In [1]:
import pandas as pd
import argparse as args
import numpy as np

#%load_ext rpy2.ipython

# Vitamin D formatting:

In [2]:
vitaD = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/vitaD_revez", 
                delim_whitespace=True, header=0, na_values='NA'))

vitaD

Unnamed: 0,CHR,SNP,POS,A1,A2,N,AF1,BETA,SE,P
0,1,rs144155419,717587,G,A,408891,0.989511,-0.017281,0.009871,0.080006
1,1,rs58276399,731718,T,C,399209,0.888795,-0.002208,0.003238,0.495386
2,1,rs141242758,734349,T,C,400370,0.888887,-0.002534,0.003235,0.433407
3,1,rs28544273,751343,T,A,410389,0.878942,-0.001288,0.003076,0.675419
4,1,rs28527770,751756,T,C,410670,0.878657,-0.001370,0.003072,0.655701
...,...,...,...,...,...,...,...,...,...,...
8806775,25,rs73237067,155223923,T,C,416560,0.974100,0.000100,0.006200,0.988100
8806776,25,rs3093533,155227118,C,G,408994,0.015400,-0.006000,0.008100,0.459700
8806777,25,rs3093457,155227607,T,G,416560,0.707500,-0.002100,0.002200,0.327600
8806778,25,rs1883079,155228901,A,G,416560,0.059200,-0.000500,0.004200,0.913300


From the table above, Z is the only missing column. 

#### Adding a Z column:

In [3]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE"  } NR>2 { print $2, $1, $3, $4, $5, $11 = ($8/$9), $10, $6, $7, $8, $9 }' ~/alzheimersproject/1_raw_data/vitaD_revez > vita_d_all_cols


#### Removing potential non-rsIDs in the SNP column and chromosome 25:

In [17]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' vita_d_all_cols > vita_d_onlyrs
awk -v OFS='\t' 'NR == 1 || $2 != "25"' vita_d_onlyrs > vita_d_onlyrs_no25
awk -v OFS='\t' 'NR == 1 || $2 != "23"' vita_d_onlyrs_no25 > vita_d_onlyrs_chr

#### Removing possible duplicates in the file:

In [18]:
%%bash

awk '!seen[$1]++' vita_d_onlyrs_chr > vita_d_onlyrs_uniq

#### Checking the file:

In [19]:
vitaD = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/vita_d_onlyrs_uniq", 
                delim_whitespace=True, header=0, na_values='NA'))

vitaD

Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs58276399,1,731718,T,C,-0.681766,0.495386,399209,0.888795,-0.002208,0.003238
1,rs141242758,1,734349,T,C,-0.783376,0.433407,400370,0.888887,-0.002534,0.003235
2,rs28544273,1,751343,T,A,-0.418724,0.675419,410389,0.878942,-0.001288,0.003076
3,rs28527770,1,751756,T,C,-0.445857,0.655701,410670,0.878657,-0.001370,0.003072
4,rs3115860,1,753405,C,A,0.500818,0.616498,414911,0.127924,0.001497,0.002989
...,...,...,...,...,...,...,...,...,...,...,...
8210087,rs564838851,22,51229656,G,A,0.099266,0.920927,408150,0.930374,0.000393,0.003963
8210088,rs9616985,22,51229805,T,C,0.014857,0.988146,416448,0.927727,0.000057,0.003853
8210089,rs368226325,22,51231220,A,G,0.744332,0.456676,399839,0.957549,0.003765,0.005058
8210090,rs374914422,22,51231754,C,T,-2.246940,0.024644,398875,0.984488,-0.018544,0.008253


#### Checking for NaN, NA, inf:

In [20]:
# 1. Checking for NaN:
print(vitaD.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(vitaD.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((vitaD.isin([np.inf, -np.inf])).sum())

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


#### Checking for P=0:

In [21]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' vita_d_onlyrs_uniq | head


SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
rs4694423	4	72554159	C	A	49.8205	0	416480	0.583561	0.100742	0.0020221
rs4694424	4	72554228	A	G	49.8035	0	417580	0.583028	0.100616	0.00202026
rs36046344	4	72557020	G	A	51.194	0	407099	0.581061	0.104571	0.00204264
rs11733890	4	72558519	T	A	61.6867	0	406630	0.663608	0.131756	0.00213589
rs11732044	4	72566972	C	T	63.769	0	411774	0.671204	0.136062	0.00213367
rs62303864	4	72567423	C	T	63.9885	0	412987	0.670692	0.136293	0.00212996
rs12648331	4	72567704	C	T	63.9997	0	413108	0.670606	0.136295	0.00212962
rs34186014	4	72567837	T	C	64.0121	0	413281	0.670373	0.136267	0.00212877
rs56003670	4	72572154	A	C	64.4137	0	413114	0.669074	0.137004	0.00212694


#### Setting P=0 to 1e-269:

In [1]:
%%bash

awk -v OFS='\t' '$7 == "0"{$7=1e-269} 1' vita_d_onlyrs_uniq > vita_d_formatted