In [1]:
import pandas as pd
import argparse as args
import numpy as np


#  Irritabel Bowel Syndrome (IBS) formatting:

#### Reading the data:

In [3]:
ibs = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/ibs_eijsbouts", 
                delim_whitespace=True, header=0, na_values='NA'))

ibs

Unnamed: 0,variant_id,p_value,chromosome,base_pair_location,effect_allele,other_allele,effect_allele_frequency,beta,standard_error,N_CASE,N_CONTROL
0,rs367896724,0.1537,1,10177,A,AC,0.6024,0.0171,0.0120,40548,293220
1,rs201106462,0.5196,1,10352,T,TA,0.6076,0.0079,0.0123,40548,293220
2,rs575272151,0.7053,1,11008,C,G,0.9137,0.0077,0.0205,40548,293220
3,rs544419019,0.7053,1,11012,C,G,0.9137,0.0077,0.0205,40548,293220
4,rs540538026,0.3802,1,13110,A,G,0.0590,0.0238,0.0271,40548,293220
...,...,...,...,...,...,...,...,...,...,...,...
9885493,rs3896457,0.3531,22,51237063,T,C,0.7017,-0.0082,0.0088,53400,325230
9885494,rs200607599,0.2823,22,51237364,A,G,0.9847,0.0480,0.0447,40548,293220
9885495,rs370652263,0.2777,22,51237712,A,G,0.0552,0.0205,0.0189,40548,293220
9885496,rs202228854,0.9268,22,51240820,T,C,0.0264,0.0031,0.0334,40548,293220


From the table above, N and Z columns are the only missing columns.

#### Adding an Z and N columns:

In [4]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR>2 { print $1, $3, $4, $5, $6, $13 = ($8/$9), $2, $12 = ($10+$11), $7, $8, $9 }' ~/alzheimersproject/1_raw_data/ibs_eijsbouts > ibs_all_cols


#### Removing potential non-rsIDs in the SNP column:

In [5]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' ibs_all_cols > ibs_onlyrs

#### Removing possible duplicates in the file:

In [1]:
%%bash

awk '!seen[$1]++' ibs_onlyrs > ibs_formatted

#### Checking the file:

In [7]:
ibs = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/ibs_formatted", 
                delim_whitespace=True, header=0, na_values='NA'))

ibs

Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs201106462,1,10352,T,TA,0.642276,0.5196,333768,0.6076,0.0079,0.0123
1,rs575272151,1,11008,C,G,0.375610,0.7053,333768,0.9137,0.0077,0.0205
2,rs544419019,1,11012,C,G,0.375610,0.7053,333768,0.9137,0.0077,0.0205
3,rs540538026,1,13110,A,G,0.878229,0.3802,333768,0.0590,0.0238,0.0271
4,rs62635286,1,13116,T,G,1.496890,0.1349,333768,0.8114,0.0241,0.0161
...,...,...,...,...,...,...,...,...,...,...,...
9885324,rs3896457,22,51237063,T,C,-0.931818,0.3531,378630,0.7017,-0.0082,0.0088
9885325,rs200607599,22,51237364,A,G,1.073830,0.2823,333768,0.9847,0.0480,0.0447
9885326,rs370652263,22,51237712,A,G,1.084660,0.2777,333768,0.0552,0.0205,0.0189
9885327,rs202228854,22,51240820,T,C,0.092814,0.9268,333768,0.0264,0.0031,0.0334


#### Checking for NaN, NA, inf:

In [8]:
# 1. Checking for NaN:
print(ibs.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(ibs.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((ibs.isin([np.inf, -np.inf])).sum())

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


#### Checking for P=0:

In [2]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' ibs_formatted | head


SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
