# Upload data and visualize
I need to grab the beta value and the standard error (beta) for each SNP accross each cohort.

**Don't forget** I need to grab the minor allele and major allele of each SNP across each cohort. I will determine the orientation. Then, I will use a "majority rules" approach to determine if I need to flip the sign of the beta. Specifically, if the major and minor alleles are listed differently for a few cohorts, then I will need to flip the sign of the beta (i.e. change from negative to positive or vice versa.)

**Also** run test of code against `rs16969968`

## Sample data
This sample data is for one variant - `rs1008078` - accross all the cohorts in meta-analysis 044. I will store the results in a dictionary. The key will be the rsID and the value will be a pandas dataframe.

In [2]:
os.listdir()

['.sh',
 '044_combine_data_mrBig.tsv',
 '044_meta_analysis_mrBig_example.xlsx',
 'AAND_COGEND2_AA.sh',
 'AAND_COGEND2_AA.txt',
 'Book1.xlsx',
 'COGEND2_AA.sh',
 'COGEND2_AA.txt',
 'COGEND2_EA.sh',
 'COGEND2_EA.txt',
 'COGEND_AA.sh',
 'COGEND_AA.txt',
 'COGEND_EA.sh',
 'COGEND_EA.txt',
 'COPDGene_AA.sh',
 'COPDGene_AA.txt',
 'COPDGene_EA.sh',
 'COPDGene_EA.txt',
 'deCODE',
 'deCODE_EA.txt',
 'deCODE_EA.txt.bak',
 'Dental_Caries_EA.sh',
 'Dental_Caries_EA.txt',
 'EAGLE_EA.sh',
 'EAGLE_EA.txt',
 'FINN_TWIN_EA.sh',
 'FINN_TWIN_EA.txt',
 'GAIN_AA.sh',
 'GAIN_AA.txt',
 'GAIN_EA.sh',
 'GAIN_EA.txt',
 'JHS_AA.sh',
 'JHS_AA.txt',
 'merge.R',
 'mrBig_sample',
 'mrBig_sample.bak',
 'nonGAIN_EA.sh',
 'nonGAIN_EA.txt',
 'NTR_EA.sh',
 'NTR_EA.txt',
 'SAGE_AA.sh',
 'SAGE_AA.txt',
 'SAGE_EA.sh',
 'SAGE_EA.txt',
 'SNPlist.txt',
 'studies.json',
 'UW_TTURC_AA.sh',
 'UW_TTURC_AA.txt',
 'UW_TTURC_EA.sh',
 'UW_TTURC_EA.txt',
 'YALE_PENN_AA.sh',
 'YALE_PENN_AA.txt',
 'YALE_PENN_EA.sh',
 'YALE_PENN_EA.txt',


In [1]:
import os
import pandas as pd
import numpy as np
import math
from decimal import Decimal
from scipy.stats import chi2

# I will create a new data frame for each variant. This data frame will have the column names:
# (1) cohort, (2) Ancestry group, (3) Beta, (4) Std. Error, (5) Seweighted, and (6) Pr(>|t|) which is the p-val
# addictionally  I will add the rows for calculating the meta in the Seweighted column and below all of the cohorts
# getting the first cohort, note that this will eventually be in a loop of the cohorts

#os.chdir("C:\\Users\\jmarks\\Desktop\\Projects\\Nicotine\\GSCAN_extended_results_nicotine\\results\\results_from_missing_snp_lookup\\")
os.chdir("C:\\Users\\jmarks\\Desktop\\20180131_mrBig")
mydata = pd.ExcelFile("Book1.xlsx")
mydata = mydata.parse("Sheet1")

cohorts_list44 = ["AAND_COGEND2_AA",
"COGEND_AA",
"COGEND_EA",
"COGEND2_AA",
"COGEND2_EA",
"COPDGene_AA",
"COPDGene_EA",
"deCODE_EA",
"Dental_Caries_EA",
"EAGLE_EA",
"FINN_TWIN_EA",
"GAIN_AA",
"GAIN_EA",
"JHS_AA",
"nonGAIN_EA",
"NTR_EA",
"SAGE_AA",
"SAGE_EA",
"UW_TTURC_AA",
"UW_TTURC_EA",
"YALE_PENN_AA",
"YALE_PENN_EA"]

totalRows = len(cohorts_list44)

# This dictionary will have an rsID for the key and the key value will be a dataframe
dataDict = {}

# initialize a dataframe
emptyArray = np.empty((totalRows,13,))
emptyArray[:] = np.nan
columns = ["SNP", "Cohort", "Ancestry group", "Beta", "Std. Error", "Seweighted", "Pr(>|t|)", 
           "AllMeta.SumSEweight", "AllMeta.weightedSE", "AllMeta.SEweighted_beta", 
           "AllMeta.SEweighted_Z", "AllMeta.SEweighted_Chi", "AllMeta.SEweighted_P"]
num_of_rsIDs = len(mydata)


# Above this write a script which removes the variants who were not present in any of the cohorts




# loop to fill in information for the meta-anlaysis calculation
for rsID in range(num_of_rsIDs):
    
    markerName = mydata.iloc[rsID,0]
    dataDict[markerName] = pd.DataFrame(columns=columns, data=emptyArray)
    dataDict[markerName].iloc[0,0] = markerName # add SNP
    metaSEweighted_beta = 0
    for cohort in range(len(cohorts_list44)):
        
        # get all of the cohort specific data
        cohortData = mydata.filter(like=cohorts_list44[cohort]).iloc[rsID,:]
        
        # add cohort to dataframe
        cohortName = cohorts_list44[cohort]
        dataDict[markerName].iloc[cohort,1] = cohorts_list44[cohort][0:-3]
            
        # add Ancestry group
        ancestry = cohorts_list44[cohort][-2:]
        dataDict[markerName].iloc[cohort, 2] = ancestry
        # add Beta
        betaVal = cohortData.filter(like=".beta")[0]
        
        # flip the sign for deCODE and NTR
        if cohorts_list44[cohort] == "deCODE_EA" or cohorts_list44[cohort] == "NTR_EA":
            betaVal = -betaVal
        dataDict[markerName].iloc[cohort, 3] = betaVal
        
        # add Std. Error
        standardErr = cohortData.filter(like="sebeta")[0]
        dataDict[markerName].iloc[cohort, 4] = standardErr
        
        # add Seweighted
        seWeighted = 1 / (standardErr ** 2)
        dataDict[markerName].iloc[cohort,5]  = seWeighted
        
        # add p-val
        pVal = cohortData.filter(regex=".p$")[0]
        dataDict[markerName].iloc[cohort, 6] = pVal
        
        #  metaSEweighted_beta calculation 
        if not np.isnan(betaVal):
            metaSEweighted_beta += (betaVal*seWeighted)
      
    # Meta calculations
    SumSEweight = dataDict[markerName]['Seweighted'].sum()
    dataDict[markerName].iloc[0, 7] = SumSEweight

    metaWeightedSE = math.sqrt(1/SumSEweight)
    dataDict[markerName].iloc[0, 8] = metaWeightedSE
    
    metaSEweighted_beta = metaSEweighted_beta / SumSEweight 
    dataDict[markerName].iloc[0, 9] = metaSEweighted_beta

    metaSEweighted_Z = (metaSEweighted_beta / metaWeightedSE)
    dataDict[markerName].iloc[0, 10] = metaSEweighted_Z

    metaSEweighted_chi = metaSEweighted_Z ** 2
    dataDict[markerName].iloc[0, 11] = metaSEweighted_chi

    metaSEweighted_P = '%.2E' % Decimal(chi2.sf(metaSEweighted_chi, 1))
    dataDict[markerName].iloc[0, 12] = metaSEweighted_P



dataDict[markerName]
#dataDict["rs1008078"]
#dataDict["rs1022528"]
#dataDict[markerName].to_csv("C:\\Users\\jmarks\\Desktop\\out.file", sep='\t', index=False)itei

FileNotFoundError: [Errno 2] No such file or directory: ''

In [9]:
for key in dataDict:
    print(key)
    print(dataDict[key])
    #dataDict[key]

rs1008078
          SNP            Cohort Ancestry group      Beta  Std. Error  \
0   rs1008078   AAND_COGEND2_AA             AA  0.022789    0.038459   
1         NaN         COGEND_AA             AA -0.041462    0.064375   
2         NaN         COGEND_EA             EA -0.022469    0.026324   
3         NaN        COGEND2_AA             AA  0.022789    0.038459   
4         NaN        COGEND2_EA             EA -0.052799    0.068863   
5         NaN       COPDGene_AA             AA -0.033628    0.035596   
6         NaN       COPDGene_EA             EA  0.005601    0.022238   
7         NaN         deCODE_EA             EA  0.001000    0.011700   
8         NaN  Dental_Caries_EA             EA  0.053374    0.062319   
9         NaN          EAGLE_EA             EA  0.000272    0.019446   
10        NaN      FINN_TWIN_EA             EA  0.003087    0.021316   
11        NaN           GAIN_AA             AA -0.160170    0.076582   
12        NaN           GAIN_EA             EA  0.0266

## Need to determine what to do with NaN values in my data.
### Also, if data is NaN for all cohorts, need to remove them from my data (compile a list of these SNPs)
ask Dana how these go into calculation

In [298]:
cohorts_list44[5]

'COPDGene_AA'

In [150]:
import numpy as np
a = np.empty((29,7,))
a[:] = np.nan
print(a)
tempDF = pd.DataFrame(np.nan, index=[0,1,2,3],columns=["a","b","c"])
#tempDF

[[ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan  nan]
 [ nan  nan 

In [112]:
dataDict['rs1008078']

Unnamed: 0,SNP,Cohort,Ancestry group,Beta,Std. Error,Seweighted,Pr(>|t|)


# 044 meta-analysis

In [51]:
import os
import pandas as pd



os.chdir("C:\\Users\\jmarks\\Desktop\\Projects\\Nicotine\\GSCAN_extended_results_nicotine\\results\\results_from_missing_snp_lookup\\")


#os.listdir()
#xl = pd.ExcelFile("missing_SNPs_results_prefiltered_meta_analyses_044_045_046_V02.xlsx")
xl.sheet_names
zero44 = xl.sheet_names[0]
zero44 = xl.parse(zero44)
zero45 = xl.sheet_names[1]
zero45 = xl.parse(zero45)
zero46 = xl.sheet_names[2]
zero46 = xl.parse(zero46)
zero44.iloc[0:1,:]

# beta is in each, but also sebeta is too.

IndexError: list index out of range

# 045 meta-analysis

# 046 meta-analysis