# Upload data and visualize
Grab the beta value and the standard error (beta) for each SNP across each cohort.

# 044, 045 and 046 meta-analysis calculations

In [33]:
import os
import pandas as pd
import numpy as np
import math
from decimal import Decimal
from scipy.stats import chi2
import csv

"""
This function takes as input (1) a list of cohorts specific to a meta-analysis
and (2) an Excel file containing the data on each variant that resulted
from a SNP look-up. You need to specify the name of the sheet as well. This file should contain a header. For each column, the heading should be
the cohort name & ancestry followed by a period followed by the data description. Specifically,
the data that are of interest for this script are: the beta value, standard error, and the variant
specific p-value. An example of how these entries should be in the excel sheet is

Example:
AAND_COGEND2_AA.beta_SNP_add,  AAND_COGEND2_AA.sebeta_SNP_add, AAND_COGEND2_AA.p

For (1) an example of the input list is: 
AAND_COGEND2_AA, DECODE_EA, NONGAIN_EA 

The output will be an excel file with the meta-analysis calculations for each variant of interest.
"""
# I will create a new data frame for each variant. This data frame will have the column names:
# (1) cohort, (2) Ancestry group, (3) Beta, (4) Std. Error, (5) Seweighted, and (6) Pr(>|t|) which is the p-val
# additionally I will add the rows for calculating the meta in the Seweighted column and below all of the cohorts
# getting the first cohort, note that this will eventually be in a loop of the cohorts

#os.chdir("C:\\Users\\jmarks\\Desktop\\Projects\\Nicotine\\GSCAN_extended_results_nicotine\\results\\results_from_missing_snp_lookup\\")
#mypath = r"C:\Users\jmarks\Desktop\Projects\Nicotine\GSCAN_extended_results_nicotine\develop\missing_snps_from_first_set_of_results\results\SNP-lookup-results"
#os.chdir(mypath)
#mydata = pd.ExcelFile("missing_SNPs_results_prefiltered_meta_analyses_044_045_046_V02.xlsx")
#mydata = mydata.parse("044_combined_data")

# Directory holding the SNP look-up workbook. The process working directory is
# switched to it so the workbook can be opened by its bare file name.
mypath = r"C:\Users\jmarks\Desktop\Projects\Nicotine\GSCAN_extended_results_nicotine\develop\missing_snps_from_first_set_of_results\results\SNP"
os.chdir(mypath)

# Workbook handle; the sheets are parsed one at a time further down.
allMydata = pd.ExcelFile("mrBig.xlsx")

# Every cohort known to the analysis, written as <cohort name>_<ancestry>.
# The two-letter suffix (AA / EA) is what the per-sheet cohort selection keys
# on, and the order here is the row order of the per-variant result tables.
allCohorts = [
    "AAND_COGEND2_AA",
    "COGEND_AA",
    "COGEND_EA",
    "COGEND2_AA",
    "COGEND2_EA",
    "COPDGENE_AA",
    "COPDGene_EA",
    "deCODE_EA",
    "Dental_Caries_EA",
    "EAGLE_EA",
    "FINN_TWIN_EA",
    "GAIN_AA",
    "GAIN_EA",
    "JHS_AA",
    "nonGAIN_EA",
    "NTR_EA",
    "SAGE_AA",
    "SAGE_EA",
    "UW_TTURC_AA",
    "UW_TTURC_EA",
    "YALE_PENN_AA",
    "YALE_PENN_EA",
]

# Column layout of every per-variant result table. Columns 0-6 hold one row per
# cohort; the "AllMeta.*" columns are filled in the first row only.
columns = ["SNP", "Cohort", "Ancestry group", "Beta", "Std. Error", "Seweighted", "Pr(>|t|)",
           "AllMeta.SumSEweight", "AllMeta.weightedSE", "AllMeta.SEweighted_beta",
           "AllMeta.SEweighted_Z", "AllMeta.SEweighted_Chi", "AllMeta.SEweighted_P"]

# Cohorts whose beta is negated before pooling.
# NOTE(review): the previous inline comment said "flip the sign for deCODE and
# NTR", but the code has only ever flipped FINN_TWIN_EA. That behaviour is kept
# unchanged here -- confirm which cohorts are really meant.
SIGN_FLIPPED_COHORTS = ("FINN_TWIN_EA",)


def cohorts_for_sheet(sheetIndex, cohorts):
    """Return the cohorts analysed for the sheet at position ``sheetIndex``.

    The sheet at index 1 uses only the cohorts whose name ends in "EA", the
    sheet at index 2 only those ending in "AA", and every other sheet uses
    all cohorts. The input order is preserved and a new list is returned.
    """
    if sheetIndex == 1:
        return [name for name in cohorts if name[-2:] == "EA"]
    if sheetIndex == 2:
        return [name for name in cohorts if name[-2:] == "AA"]
    return list(cohorts)


def cohort_values(variantRow, cohort):
    """Return ``(beta, standard error, p-value)`` of one cohort for one variant.

    ``variantRow`` is one sheet row as a Series indexed by the sheet's column
    headings. Headings are expected to look like ``<cohort>.<description>``.

    Raises:
        KeyError: if the cohort has no beta, standard-error or p-value column.
    """
    # Match on the exact "<cohort>." prefix. A plain substring match would also
    # pick up other cohorts' columns ("COGEND2_AA" is contained in
    # "AAND_COGEND2_AA", "GAIN_EA" in "nonGAIN_EA") and could silently read the
    # wrong cohort's numbers.
    prefix = cohort + "."
    labels = [label for label in variantRow.index if str(label).startswith(prefix)]
    if not labels:
        # Fall back to the historical substring match so sheets whose headings
        # carry something in front of the cohort name keep working.
        labels = [label for label in variantRow.index if cohort in str(label)]
    cohortData = variantRow.loc[labels]

    selectors = (
        ("beta", {"like": ".beta"}),
        ("standard error", {"like": "sebeta"}),
        ("p-value", {"regex": r"\.p$"}),  # literal ".p" suffix
    )
    picked = []
    for description, selector in selectors:
        matches = cohortData.filter(**selector)
        if matches.empty:
            raise KeyError("no %s column found for cohort %s" % (description, cohort))
        picked.append(matches.iloc[0])
    return tuple(picked)


def meta_analyze_variant(markerName, variantRow, cohorts_list):
    """Build the inverse-variance weighted meta-analysis table for one variant.

    Args:
        markerName: identifier written to the first row of the "SNP" column.
        variantRow: one sheet row as a Series indexed by column heading.
        cohorts_list: cohort names (``<name>_<ancestry>``), one table row each.

    Returns:
        A new DataFrame with the layout given by ``columns``. A cohort takes
        part in the pooled estimate only when both its beta and its standard
        error are finite and the standard error is positive; for every other
        cohort "Seweighted" is NaN, so the column always sums to
        "AllMeta.SumSEweight". When no cohort qualifies, the summed weight is
        0.0 and the remaining "AllMeta.*" cells are NaN.

    Raises:
        ValueError: if ``cohorts_list`` is empty.
        KeyError: if a cohort lacks one of its expected columns.
    """
    if not cohorts_list:
        raise ValueError("cohorts_list must contain at least one cohort")

    cohortNames, ancestries = [], []
    betas, stdErrs, weights, pVals = [], [], [], []
    SumSEweight = 0.0
    weightedBetaSum = 0.0

    for cohort in cohorts_list:
        betaVal, standardErr, pVal = cohort_values(variantRow, cohort)
        if cohort in SIGN_FLIPPED_COHORTS:
            betaVal = -betaVal

        if np.isfinite(betaVal) and np.isfinite(standardErr) and standardErr > 0:
            seWeighted = 1.0 / (standardErr ** 2)
            SumSEweight += seWeighted
            weightedBetaSum += betaVal * seWeighted
        else:
            seWeighted = np.nan

        cohortNames.append(cohort[0:-3])  # strip the "_AA" / "_EA" suffix
        ancestries.append(cohort[-2:])
        betas.append(betaVal)
        stdErrs.append(standardErr)
        weights.append(seWeighted)
        pVals.append(pVal)

    if SumSEweight > 0:
        metaWeightedSE = math.sqrt(1.0 / SumSEweight)
        metaSEweighted_beta = weightedBetaSum / SumSEweight
        metaSEweighted_Z = metaSEweighted_beta / metaWeightedSE
        metaSEweighted_chi = metaSEweighted_Z ** 2
        # Kept as a pre-formatted string so the output file format is unchanged.
        metaSEweighted_P = '%.2E' % float(chi2.sf(metaSEweighted_chi, 1))
    else:
        metaWeightedSE = metaSEweighted_beta = np.nan
        metaSEweighted_Z = metaSEweighted_chi = metaSEweighted_P = np.nan

    padding = [np.nan] * (len(cohorts_list) - 1)
    # Building the table in one go gives every variant its own storage and
    # avoids writing strings into float columns cell by cell.
    return pd.DataFrame(
        {
            "SNP": [markerName] + padding,
            "Cohort": cohortNames,
            "Ancestry group": ancestries,
            "Beta": betas,
            "Std. Error": stdErrs,
            "Seweighted": weights,
            "Pr(>|t|)": pVals,
            "AllMeta.SumSEweight": [SumSEweight] + padding,
            "AllMeta.weightedSE": [metaWeightedSE] + padding,
            "AllMeta.SEweighted_beta": [metaSEweighted_beta] + padding,
            "AllMeta.SEweighted_Z": [metaSEweighted_Z] + padding,
            "AllMeta.SEweighted_Chi": [metaSEweighted_chi] + padding,
            "AllMeta.SEweighted_P": [metaSEweighted_P] + padding,
        },
        columns=columns,
    )


# Driver: one pass per workbook sheet. The guard is true both in a notebook
# kernel and when the file is run as a script; it only keeps the helpers above
# importable on their own.
if __name__ == "__main__":
    count = 0  # position of the current sheet; selects the cohort subset

    excelSheets = allMydata.sheet_names

    resultsDir = os.path.join(mypath, "meta_analysis_calculation_results")
    if not os.path.isdir(resultsDir):
        os.makedirs(resultsDir)

    for sheet in excelSheets:
        mydata = allMydata.parse(sheet)
        cohorts_list = cohorts_for_sheet(count, allCohorts)

        # rsID -> per-variant result table
        dataDict = {}
        # SNPs with no data in any column from the fourth one onwards
        noDataSNPs = []

        for _, variantRow in mydata.iterrows():
            markerName = variantRow.iloc[0]
            if pd.isnull(variantRow.iloc[3:]).all():
                noDataSNPs.append(markerName)
            else:
                dataDict[markerName] = meta_analyze_variant(markerName, variantRow, cohorts_list)

        count += 1

        # Write the meta-calculations for all variants of this sheet. Opened
        # with 'w' (like the missing-SNP file below) so that re-running the
        # analysis replaces the previous results instead of appending to them.
        myfile1 = os.path.join(resultsDir, sheet + "_meta_results_for_missing_snps")
        with open(myfile1, 'w') as outfile:
            for item in dataDict:
                dataDict[item].to_csv(outfile, sep='\t', index=False)
                outfile.write('\n\n\n')

        # Write the SNPs that were missing in every cohort, one per line.
        myfile2 = os.path.join(resultsDir, sheet + "_completely_missing_SNPs")
        with open(myfile2, 'w') as outfile:
            for item in noDataSNPs:
                outfile.write("%s\n" % item)