In [1]:
import pandas as pd
import argparse as args
import numpy as np


# Epilepsy formatting:

#### Reading the data:

In [2]:
epilepsy = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/epilepsy_ILAE", 
                delim_whitespace=True, header=0, na_values='NA'))

epilepsy

Unnamed: 0,CHR,BP,MarkerName,Allele1,Allele2,Freq1,FreqSE,Effective_N,Z-score,P-value,Direction,Beta,SE
0,6,130840091,rs2326918,a,g,0.8493,0.0066,64549.2,-1.321,0.186600,--+---+??,-0.010277,0.007779
1,7,145771806,rs6977693,t,c,0.8541,0.0040,63985.7,-1.294,0.195600,----+-+??,-0.010247,0.007919
2,11,100009976,rs12364336,a,g,0.8720,0.0020,64942.1,0.185,0.853200,-----++??,0.001536,0.008305
3,1,166367755,rs12562373,a,g,0.7572,0.0050,62937.9,1.379,0.168000,+-?--++??,0.009065,0.006573
4,14,86737556,rs2135099,a,g,0.1772,0.0092,65083.7,0.099,0.920800,-++-++-??,0.000719,0.007259
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4880193,10,27835123,rs10764708,a,g,0.8425,0.0024,65039.6,-1.553,0.120500,-++-+-+??,-0.011820,0.007611
4880194,20,33400913,rs7262834,t,g,0.3920,0.0020,64778.9,-2.887,0.003888,--+-+--??,-0.016428,0.005690
4880195,1,84241498,rs4140461,t,c,0.8549,0.0074,65038.7,1.197,0.231400,+-+--+-??,0.009423,0.007872
4880196,12,44074208,rs117241566,t,c,0.9380,0.0019,64937.4,0.211,0.833000,+++---+??,0.002428,0.011506


N is the only missing column.

#### Adding an N column:

In [3]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR>2 { print $3, $1, $2, toupper($4), toupper($5), $9, $10, $14 = (69995), $6, $12, $13 }' ~/alzheimersproject/1_raw_data/epilepsy_ILAE > epilepsy_all_cols


#### Removing potential non-rsIDs in the SNP column:

In [4]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' epilepsy_all_cols > epilepsy_onlyrs

#### Removing possible duplicates i the file:

In [5]:
%%bash

awk '!seen[$1]++' epilepsy_onlyrs > epilepsy_onlyrs_uniq

#### Checking the file:

In [6]:
epilepsy = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/epilepsy_onlyrs_uniq", 
                delim_whitespace=True, header=0, na_values='NA'))

epilepsy

Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs6977693,7,145771806,T,C,-1.294,0.195600,69995,0.8541,-0.010247,0.007919
1,rs12364336,11,100009976,A,G,0.185,0.853200,69995,0.8720,0.001536,0.008305
2,rs12562373,1,166367755,A,G,1.379,0.168000,69995,0.7572,0.009065,0.006573
3,rs2135099,14,86737556,A,G,0.099,0.920800,69995,0.1772,0.000719,0.007259
4,rs57502521,2,201527977,A,G,0.007,0.994700,69995,0.9649,0.000107,0.015321
...,...,...,...,...,...,...,...,...,...,...,...
4880192,rs10764708,10,27835123,A,G,-1.553,0.120500,69995,0.8425,-0.011820,0.007611
4880193,rs7262834,20,33400913,T,G,-2.887,0.003888,69995,0.3920,-0.016428,0.005690
4880194,rs4140461,1,84241498,T,C,1.197,0.231400,69995,0.8549,0.009423,0.007872
4880195,rs117241566,12,44074208,T,C,0.211,0.833000,69995,0.9380,0.002428,0.011506


#### Checking for NaN, NA, inf:

In [7]:
# 1. Checking for NaN:
print(epilepsy.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(epilepsy.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((epilepsy.isin([np.inf, -np.inf])).sum())

SNP        0
CHR        0
BP         0
A1         0
A2         0
Z          0
P          0
N          0
FREQ       0
BETA    4724
SE      4724
dtype: int64

------------------------------

SNP        0
CHR        0
BP         0
A1         0
A2         0
Z          0
P          0
N          0
FREQ       0
BETA    4724
SE      4724
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


#### Checking for P=0 and beta =?:

In [31]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' epilepsy_onlyrs_uniq | head

echo '------------------------------'

awk -v OFS='\t' 'NR == 1 || $6 == "0.000" || $6 == "-0.000"' epilepsy_onlyrs_uniq | head

echo '------------------------------'

awk -v OFS='\t' 'NR == 1 || $10 == "0" || $10 == "Inf"' epilepsy_onlyrs_uniq | head

echo '------------------------------'

awk -v OFS='\t' 'NR == 1 || $11 == "0" || $11 == "Inf"' epilepsy_onlyrs_uniq | head

SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
------------------------------
SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
rs12637881	3	39172642	A	G	-0.000	0.9998	69995	0.1138	0	0.00872914
rs896614	15	87307845	A	G	-0.000	0.9999	69995	0.4388	0	0.00565436
rs2292344	17	45455623	A	C	0.000	0.9997	69995	0.9852	0	0.023554
rs12124675	1	210562772	T	C	0.000	0.9997	69995	0.4348	0	0.00560127
rs61970076	13	86180214	A	G	-0.000	0.9997	69995	0.1236	0	0.00842932
rs6496305	15	86589382	A	G	0.000	0.9998	69995	0.6921	0	0.00602767
rs7746828	6	88952926	A	G	-0.000	0.9996	69995	0.7923	0	0.0068437
rs1188793	10	86240526	T	G	-0.000	0.9999	69995	0.8556	0	0.0078862
rs260931	4	29374840	A	T	-0.000	0.9997	69995	0.4034	0	0.00566746
------------------------------
SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
rs12637881	3	39172642	A	G	-0.000	0.9998	69995	0.1138	0	0.00872914
rs896614	15	87307845	A	G	-0.000	0.9999	69995	0.4388	0	0.00565436
rs2292344	17	45455623	A	C	0.000	0.9997	69995	0.9852	0	0.023554
rs12124675	1	210562772	T	C	0.000	0.9997	69995	0.434

#### Deleting variants where beta=0:
This is done because these variants also have Z = 0, freq = 1 and p = 0.9

In [1]:
%%bash

awk -v OFS='\t' 'NR == 1 || $10 != "0"' epilepsy_onlyrs_uniq > epilepsy_formatted