In [1]:
import pandas as pd
import argparse as args
import numpy as np


# Sleep formatting:

#### Reading data:

In [2]:
sleep = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/sleep_dashti", 
                delim_whitespace=True, header=0, na_values='NA'))

sleep

Unnamed: 0,SNP,CHR,BP,ALLELE1,ALLELE0,A1FREQ,INFO,BETA_SLEEPDURATION,SE_SLEEPDURATION,P_SLEEPDURATION
0,rs2462492,1,54676,C,T,0.599285,0.340158,0.005999,0.003979,0.130
1,rs3107975,1,55326,T,C,0.991605,0.324228,-0.007245,0.022313,0.710
2,1:70728_C_T,1,70728,C,T,0.997843,0.365713,-0.057584,0.040239,0.140
3,rs143777184,1,79137,A,T,0.999539,0.480382,-0.044401,0.077531,0.510
4,rs114608975,1,86028,T,C,0.896397,0.340885,0.005469,0.006363,0.380
...,...,...,...,...,...,...,...,...,...,...
14661596,rs376461333,22,51232488,A,G,0.979778,0.610305,0.018816,0.010256,0.069
14661597,rs8138356,22,51234159,T,A,0.999439,0.662548,-0.038640,0.068910,0.540
14661598,rs6010092,22,51234199,T,C,0.999852,0.902878,-0.271725,0.115345,0.021
14661599,rs3896457,22,51237063,T,C,0.701560,0.852165,-0.002062,0.002673,0.460


From the table above, Z and N are the only missing columns. 

#### Adding an N and Z column:

In [3]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE" } NR>2 { print $1, $2, $3, $4, $5, $11 = ($8/$9), $10, $12 = (446118), $6, $8, $9 }' ~/alzheimersproject/1_raw_data/sleep_dashti > sleep_all_cols


#### Removing potential non-rsIDs in the SNP column:

In [4]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' sleep_all_cols > sleep_onlyrs

#### Removing possible duplicates in the file:

In [1]:
%%bash

awk '!seen[$1]++' sleep_onlyrs > sleep_formatted

#### Checking the file:

In [6]:
sleep = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/sleep_formatted", 
                delim_whitespace=True, header=0, na_values='NA'))

sleep

Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs3107975,1,55326,T,C,-0.324682,0.710,446118,0.991605,-0.007245,0.022313
1,rs143777184,1,79137,A,T,-0.572680,0.510,446118,0.999539,-0.044401,0.077531
2,rs114608975,1,86028,T,C,0.859514,0.380,446118,0.896397,0.005469,0.006363
3,rs6702460,1,91536,G,T,-0.329992,0.720,446118,0.543108,-0.001293,0.003917
4,rs8179466,1,234313,C,T,0.215745,0.810,446118,0.925393,0.001664,0.007714
...,...,...,...,...,...,...,...,...,...,...,...
14611827,rs376461333,22,51232488,A,G,1.834630,0.069,446118,0.979778,0.018816,0.010256
14611828,rs8138356,22,51234159,T,A,-0.560738,0.540,446118,0.999439,-0.038640,0.068910
14611829,rs6010092,22,51234199,T,C,-2.355760,0.021,446118,0.999852,-0.271725,0.115345
14611830,rs3896457,22,51237063,T,C,-0.771364,0.460,446118,0.701560,-0.002062,0.002673


#### Checking for NaN, NA, inf:

In [7]:
# 1. Checking for NaN:
print(sleep.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(sleep.isna().sum())

print('\n------------------------------\n')

# 3. Checking for inf:
print((sleep.isin([np.inf, -np.inf])).sum())

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


#### Checking P=0:

In [2]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' sleep_formatted | head


SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
