In [1]:
import pandas as pd
import argparse as args
import numpy as np


# Smoke initiation formatting:

In [2]:
smoke = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/smoke_init_saunders", 
                delim_whitespace=True, header=0, na_values='NA'))

smoke

Unnamed: 0,CHR,POS,RSID,EFFECT_ALLELE,OTHER_ALLELE,AF_1000G,BETA,SE,P,N
0,chr10,100000235,rs11596870,T,C,0.314115,-0.00172,0.002,0.307956,805431.0
1,chr10,100000943,rs11190359,A,G,0.099404,0.00100,0.003,0.698626,805431.0
2,chr10,100000954,rs112887542,T,C,0.005964,-0.00103,0.010,0.915726,731450.0
3,chr10,100000979,rs11190360,C,T,0.054672,-0.00012,0.003,0.972102,805431.0
4,chr10,100001168,rs182851655,C,T,0.004970,-0.00807,0.012,0.513453,771446.0
...,...,...,...,...,...,...,...,...,...,...
13595214,chr9,99999166,rs4495492,C,G,0.544732,0.00289,0.002,0.162000,475758.0
13595215,chr9,99999232,rs76345285,C,T,0.018887,-0.00602,0.005,0.262000,805383.0
13595216,chr9,99999321,rs1881752,C,G,0.261431,0.00195,0.002,0.273000,805431.0
13595217,chr9,99999465,rs180967347,G,A,0.002982,0.02650,0.022,0.221000,131719.0


From the table above, Z is the only missing column. However, the data is build from hg38. 

#### Converting from hg38 to hg19:

In [4]:
%%bash

# 1. Adding CHR from reference to smoke file based on rsID:
awk 'NR==FNR { FILE1[$3]=$1; next} ($3 in FILE1) {print FILE1[$3], $0}' ~/alzheimersproject/1_raw_data/hg19_ref ~/alzheimersproject/1_raw_data/smoke_init_saunders > smoke_hg19_chr

# 1A. Removing the column that corresponds to CHR column with build38:
#awk -v OFS='\t' '{ print $1, $3, $4, $5, $6, $7, $8, $9, $10, $11 }' smoke_hg19_chr > smoke_hg19_chr_temp

# 2. Adding BP from reference to smoke file:
awk -v OFS='\t' 'NR==FNR { FILE1[$3]=$2; next} ($4 in FILE1) {print FILE1[$4], $0}' ~/alzheimersproject/1_raw_data/hg19_ref smoke_hg19_chr > smoke_hg19_chr_bp

# 2A. Removing the column that corresponds to BP column with build38: 
awk -v OFS='\t' 'BEGIN { print "CHR", "BP", "SNP", "A1", "A2", "FREQ", "BETA", "SE", "P", "N" } NR!=1 { print $2, $1, $5, $6, $7, $8, $9, $10, $11, $12}' smoke_hg19_chr_bp > smoke_hg19


In [5]:
%%bash

head smoke_hg19

CHR	BP	SNP	A1	A2	FREQ	BETA	SE	P	N
10	101760700	rs11190359	A	G	0.0994036	0.001	0.003	0.698626	805431
10	101760711	rs112887542	T	C	0.00596421	-0.00103	0.01	0.915726	731450
10	101760736	rs11190360	C	T	0.054672	-0.00012	0.003	0.972102	805431
10	101760925	rs182851655	C	T	0.00497018	-0.00807	0.012	0.513453	771446
10	101761769	rs11190362	C	T	0.0397614	0.00248	0.004	0.53783	805431
10	101761795	rs192480913	A	G	0.00994036	0.00251	0.007	0.732309	804537
10	101762057	rs111354488	G	GA	0.0397614	0.00519	0.005	0.250103	630626
10	101762087	rs147431732	A	AAAAG	0.0685885	-0.00246	0.003	0.437632	697342
10	101762093	rs201159427	A	AAGAG	0.00497018	-0.00266	0.004	0.520335	629732


#### Adding an Z column:

In [6]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR>2 { print $3, $1, $2, $4, $5, $11 = ($7/$8), $9, $10, $6, $7, $8 }' smoke_hg19 > smoke_hg19_all_cols


#### Removing potential non-rsIDs in the SNP column:

In [7]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' smoke_hg19_all_cols > smoke_onlyrs

#### Removing possible duplicates in the file:

In [1]:
%%bash

awk '!seen[$1]++' smoke_onlyrs > smoke_formatted

#### Checking the file:

In [9]:
smoke = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/smoke_formatted", 
                delim_whitespace=True, header=0, na_values='NA'))

smoke

  smoke = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/smoke_onlyrs_uniq",


Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs112887542,10,101760711,T,C,-0.103000,0.915726,731450.0,0.005964,-0.00103,0.010
1,rs11190360,10,101760736,C,T,-0.040000,0.972102,805431.0,0.054672,-0.00012,0.003
2,rs182851655,10,101760925,C,T,-0.672500,0.513453,771446.0,0.004970,-0.00807,0.012
3,rs11190362,10,101761769,C,T,0.620000,0.537830,805431.0,0.039761,0.00248,0.004
4,rs192480913,10,101761795,A,G,0.358571,0.732309,804537.0,0.009940,0.00251,0.007
...,...,...,...,...,...,...,...,...,...,...,...
13519799,rs4495492,9,102761448,C,G,1.445000,0.162000,475758.0,0.544732,0.00289,0.002
13519800,rs76345285,9,102761514,C,T,-1.204000,0.262000,805383.0,0.018887,-0.00602,0.005
13519801,rs1881752,9,102761603,C,G,0.975000,0.273000,805431.0,0.261431,0.00195,0.002
13519802,rs180967347,9,102761747,G,A,1.204550,0.221000,131719.0,0.002982,0.02650,0.022


#### Checking for NaN, NA, inf:

In [10]:
# 1. Checking for NaN:
print(smoke.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(smoke.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((smoke.isin([np.inf, -np.inf])).sum())

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


#### Checking for P=0:

In [2]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' smoke_formatted | head


SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
