In [1]:
import pandas as pd
import argparse as args
import numpy as np


# Migraine formatting:

#### Reading the data:

In [2]:
migraine = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/migraine_hautakangas", 
                delim_whitespace=True, header=0, na_values='NA'))

migraine

  migraine = pd.DataFrame(pd.read_csv("~/alzheimersproject/1_raw_data/migraine_hautakangas",


Unnamed: 0,rs_number,chromosome,position,reference_allele,other_allele,eaf,beta,se,beta_95L,beta_95U,...,_-log10_p-value,q_statistic,q_p-value,i2,n_studies,n_samples,effects,Neff,rsid_ukbb,marker
0,1:592368,1,592368.0,G,A,0.010199,0.136408,0.299302,-0.450224,0.723040,...,0.188031,0.0,1.0,1.0,5941,??+?,1106,rs561532399,1:592368,
1,1:636285,1,636285.0,C,T,0.096043,0.037062,0.027023,-0.015904,0.090028,...,0.769011,0.0,1.0,1.0,341050,+???,15773,rs545945172,1:636285,
2,1:637420,1,637420.0,T,C,0.019151,-0.028000,0.195107,-0.410410,0.354410,...,0.052632,0.0,1.0,1.0,5941,??-?,1398,1:637420,1:637420,
3,1:649192,1,649192.0,T,A,0.117561,0.032302,0.024450,-0.015621,0.080224,...,0.729445,0.0,1.0,1.0,341050,+???,16125,rs201942322,1:649192,
4,1:657788,1,657788.0,G,C,0.219788,-0.009857,0.063584,-0.134481,0.114767,...,0.057108,0.0,1.0,1.0,5941,??-?,1442,1:657788,1:657788,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11380306,rs139273633,X,155229483.0,T,C,0.346997,-0.052996,0.022289,-0.096684,-0.009309,...,1.758328,0.0,1.0,1.0,40224,???-,8883,rs139273633,X:155229483,
11380307,rs150057047,X,155229622.0,C,G,0.341794,-0.053095,0.022362,-0.096925,-0.009265,...,1.754406,0.0,1.0,1.0,40224,???-,8889,rs150057047,X:155229622,
11380308,rs145421232,X,155229796.0,A,G,0.341067,-0.053227,0.022415,-0.097161,-0.009293,...,1.754750,0.0,1.0,1.0,40224,???-,8856,rs145421232,X:155229796,
11380309,rs144749717,X,155230932.0,G,A,0.342003,-0.053848,0.022663,-0.098267,-0.009429,...,1.756437,0.0,1.0,1.0,40224,???-,8652,rs144749717,X:155230932,


In [3]:
%%bash

head -n 1 ~/alzheimersproject/1_raw_data/migraine_hautakangas

rs_number	chromosome	position	reference_allele	other_allele	eaf	beta	se	beta_95L	beta_95U	z	p.value	_-log10_p-value	q_statistic	q_p-value	i2	n_studies	n_samples	effects	Neff	rsid_ukbb	marker


From the table above, N is the only missing column. And then chromosome X is included, which needs to be deleted.

#### Adding an N column:

In [4]:
%%bash

awk -v OFS='\t' 'BEGIN { print "SNP", "CHR", "BP", "A1", "A2", "Z", "P", "N", "FREQ", "BETA", "SE"  } NR>2 { print $21, $2, $3, $4, $5, $11, $12, $13 = (873341), $6, $7, $8 }' ~/alzheimersproject/1_raw_data/migraine_hautakangas > migraine_all_cols


#### Removing potential non-rsIDs in the SNP column and removing chromosome X:

In [5]:
%%bash

awk -v OFS='\t' 'NR == 1 || $1 ~ "rs"' migraine_all_cols > migraine_onlyrs
awk -v OFS='\t' 'NR == 1 || $2 != "X"' migraine_onlyrs > migraine_onlyrs_noX

#### Removing possible duplicates in the file:

In [1]:
%%bash

awk '!seen[$1]++' migraine_onlyrs_noX > migraine_formatted

#### Checking the file:

In [7]:
migraine = pd.DataFrame(pd.read_csv("~/alzheimersproject/2_formatting/migraine_formatted", 
                delim_whitespace=True, header=0, na_values='NA'))

migraine

Unnamed: 0,SNP,CHR,BP,A1,A2,Z,P,N,FREQ,BETA,SE
0,rs61769339,1,662622.0,A,G,1.185333,0.235865,873341,0.111610,0.027319,0.023048
1,rs12238997,1,693731.0,G,A,0.458189,0.646838,873341,0.117222,0.009883,0.021570
2,rs61769351,1,693823.0,C,G,1.233042,0.217539,873341,0.113234,0.028201,0.022871
3,rs72631875,1,705882.0,A,G,-1.640695,0.100859,873341,0.066553,-0.055090,0.033577
4,rs12029736,1,706368.0,A,G,0.159826,0.873000,873341,0.482703,0.002583,0.016164
...,...,...,...,...,...,...,...,...,...,...,...
8749403,rs5771020,22,51232581.0,C,T,0.039936,0.968130,873341,0.298334,0.000643,0.016092
8749404,rs200189535,22,51235959.0,C,T,0.997278,0.318616,873341,0.194286,0.019893,0.019947
8749405,rs200507571,22,51236013.0,I,D,-1.428243,0.153207,873341,0.251078,-0.024537,0.017180
8749406,rs3896457,22,51237063.0,C,T,-0.238732,0.811307,873341,0.299954,-0.003727,0.015613


#### Checking for NaN, NA, inf:

In [8]:
# 1. Checking for NaN:
print(migraine.isnull().sum())

print('\n------------------------------\n')

# 2. Checking for NA:
print(migraine.isna().sum())
print('\n------------------------------\n')

# 3. Checking for inf:
print((migraine.isin([np.inf, -np.inf])).sum())

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64

------------------------------

SNP     0
CHR     0
BP      0
A1      0
A2      0
Z       0
P       0
N       0
FREQ    0
BETA    0
SE      0
dtype: int64


#### Checking P=0:

In [2]:
%%bash

# Checking for P=0:
awk -v OFS='\t' 'NR == 1 || $7 == "0.0" || $7 == "0"' migraine_formatted | head


SNP	CHR	BP	A1	A2	Z	P	N	FREQ	BETA	SE
