# GWAS-GRPM Merger

In [None]:
#Only for Google Colab
# Picks the notebook's working directory. When the google.colab module is
# loaded, the cell changes into a `grpm_system` folder (creating it first when
# it does not exist), located either on the mounted Google Drive or directly
# under /content. Otherwise the current directory is left untouched.
import os
import sys

# @markdown Run in Colab virtual machine by default

# @markdown to run in google drive set:
import_mydrive = False #@param {type:"boolean"}

# Everything in this branch is skipped unless google.colab has been imported.
if 'google.colab' in sys.modules:
    if import_mydrive:
        # Persistent storage: mount Google Drive and work inside MyDrive/grpm_system.
        from google.colab import drive
        drive.mount('/content/drive')
        if os.path.exists('/content/drive/MyDrive/grpm_system/'):
            %cd /content/drive/MyDrive/grpm_system/
        else:
            %mkdir /content/drive/MyDrive/grpm_system/
            %cd /content/drive/MyDrive/grpm_system/
    else:
        # Default: work inside /content/grpm_system.
        if os.path.exists('/content/grpm_system/'):
            %cd /content/grpm_system/
        else:
            %mkdir /content/grpm_system/
            %cd /content/grpm_system/

# Report where the rest of the notebook will read and write its relative paths.
current_directory = os.getcwd()
print("Current working directory:", current_directory)

# Import Packages

In [None]:
#Import Modules
import os
import requests
import pandas as pd
import importlib
from datetime import datetime


def simple_bool(message):
    """Ask a yes/no question on standard input and return the answer as a bool.

    Parameters
    ----------
    message : str
        Question shown to the user; " (y/n): " is appended to it.

    Returns
    -------
    bool
        True when the reply is one of "y", "yes", "yea" or "sure"
        (case-insensitive, surrounding whitespace ignored), False otherwise.
    """
    # Strip before comparing so that replies such as " y" or "yes " are not
    # silently treated as a refusal.
    answer = input(message + " (y/n): ").strip().lower()
    return answer in ("y", "yes", "yea", "sure")

def get_file(url, file_name, dir=None):
    """Download ``url`` with an HTTP GET and save the body as ``dir/file_name``.

    Parameters
    ----------
    url : str
        Address of the resource to download.
    file_name : str
        Name of the file to write.
    dir : str, optional
        Destination folder. Defaults to the working directory at call time
        (the previous default was evaluated once, when the function was
        defined, and ignored any later change of directory).

    Returns
    -------
    str or None
        Path of the written file, or None when the server did not answer
        with status 200 (a message is printed and nothing is written).
    """
    if dir is None:
        dir = os.getcwd()
    response = requests.get(url)
    if response.status_code != 200:
        # Report the failure instead of returning silently; callers that read
        # the file afterwards would otherwise fail with a confusing error.
        print(f"Download failed (HTTP {response.status_code}): {url}")
        return None
    file_path = os.path.join(dir, file_name)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    return file_path

def check_and_install_module(module_name):
    """Make sure ``module_name`` can be imported, offering to pip-install it.

    If the import succeeds a confirmation is printed. Otherwise the user is
    asked (via ``simple_bool``) whether to install the module; on a positive
    answer pip is run and a confirmation is printed, on a negative answer
    nothing happens.

    Parameters
    ----------
    module_name : str
        Name used both for the import check and for ``pip install``.

    Raises
    ------
    subprocess.CalledProcessError
        If the pip command exits with a non-zero status.
    """
    try:
        # Check if the module is already installed
        importlib.import_module(module_name)
        print(f"The module '{module_name}' is already installed.")
        return
    except ImportError:
        pass

    # If the module is not installed, try installing it
    wants_install = simple_bool(
        "\n" + module_name + "  module is not installed.\nwould you like to install it?")
    if wants_install:
        import subprocess
        import sys
        # Run pip through the interpreter that executes this notebook so the
        # package lands in the kernel's environment, not whichever `pip`
        # happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", module_name])
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        print(f"The module '{module_name}' was installed correctly.")

## Get NLTK

In [None]:
# get Natural Language Toolkit https://www.nltk.org/
# Offer to install nltk when it cannot be imported, then import it.
check_and_install_module('nltk')
import nltk
# Download the two NLTK resources requested by this notebook
# (presumably needed by word_tokenize and pos_tag used further down).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

## Get pychatgpt

In [None]:
# get & import pychatgpt (openai based module)
# Optional, interactive step: the download and import only happen when the
# user answers yes to the prompt.
if simple_bool('Do you have an openai API-key?'):
    # Get pychatgpt at: https://github.com/johndef64/pychatgpt.git
    # NOTE(review): this saves a script fetched from the repository's `main`
    # branch into the working directory and then imports (executes) it; the
    # code is not pinned to a reviewed revision.
    get_file(url="https://raw.githubusercontent.com/johndef64/pychatgpt/main/pychatgpt.py", file_name='pychatgpt.py')

    import pychatgpt as op
    # Example usage
    message = "Tell me about GWAS-Catalog"
    response = op.send_message_gpt(message)

else:
    # No key: point the user to where one can be obtained.
    print('get your api-key at https://platform.openai.com/account/api-keys\n'
          'or simply use web playground at https://platform.openai.com/playground?model=gpt-3.5-turbo-16k')

# Get requirements

## Get MESH.csv from 'bioportal.bioontology.org'

In [None]:
# Get GWAS dataset at https://www.ebi.ac.uk/gwas/docs/file-downloads
gwas_dir = 'gwas_catalog_data'
gwas_file = 'gwas_catalog_v1.0.2-associations.tsv'

os.makedirs(gwas_dir, exist_ok=True)

# Download only when the file is missing. The check uses the same name the
# file is saved under (the previous check looked for a release-stamped name
# that was never written, so the catalog was downloaded again on every run).
if not os.path.exists(os.path.join(gwas_dir, gwas_file)):
    get_file(url='https://www.ebi.ac.uk/gwas/api/search/downloads/alternative', file_name=gwas_file, dir=gwas_dir)

Workflow:

1. Clean GWAS dataset
   (in STRONGEST SNP-RISK ALLELE, drop "?")
2. retrieve GRPM Survey data
3. apply GI cut-off (0.0125) on GRPM Survey
4. merge GWAS and GRPM on rsIDs
5. align GRPM-MESH vs GWAS-mapped-trait
6.  create a correspondence dictionary
    ['PUBMED_MESH','DISEASE/TRAIT']
    through Tokenization -> Natural Language Toolkit https://www.nltk.org/
7. get the STRONGEST SNP-RISK ALLELE


# 1. Import GWAS dataset

In [None]:
#import dataset
# Download gwas_catalog_v1.0.2-associations_e109_r2023-03-27.tsv from: https://www.ebi.ac.uk/gwas/docs/file-downloads

gwas_table_path = 'gwas_catalog_data/gwas_catalog_v1.0.2-associations.tsv'
df_gwas = pd.read_table(gwas_table_path, low_memory=False)

# Identifier and gene columns are handled as text from here on.
text_columns = ['PUBMEDID', 'SNP_ID_CURRENT', 'MAPPED_GENE']
df_gwas[text_columns] = df_gwas[text_columns].astype(str)

# drop readthough transcripts: MAPPED_GENE values containing "- " or ","
multi_gene_rows = df_gwas['MAPPED_GENE'].str.contains('- |,')
clean_df_gwas = df_gwas[~multi_gene_rows]

# Quick overview; note that genes are counted on the cleaned frame while
# studies and rsids are counted on the full one.
print('genes: ', clean_df_gwas['MAPPED_GENE'].nunique())
print('studies: ', len(df_gwas['STUDY'].drop_duplicates()))
print('rsid: ', len(df_gwas['SNP_ID_CURRENT'].drop_duplicates()))
df_gwas.columns

In [None]:
#display selected columns
#df_gwas['STRONGEST SNP-RISK ALLELE'].drop_duplicates()
# Distinct combinations of gene, traits, current SNP id, risk allele and risk-allele frequency.
df_gwas[['MAPPED_GENE','DISEASE/TRAIT','MAPPED_TRAIT','SNP_ID_CURRENT','STRONGEST SNP-RISK ALLELE','RISK ALLELE FREQUENCY']].drop_duplicates()

In [None]:
# Number of distinct values in the key GWAS columns.
for stat_label, stat_column in (('SNPs:', 'SNPS'),
                                ('DISEASE/TRAIT:', 'DISEASE/TRAIT'),
                                ('MAPPED_TRAIT:', 'MAPPED_TRAIT')):
    print(stat_label, df_gwas[stat_column].nunique())

# Rows per mapped trait, most frequent first.
df_gwas['MAPPED_TRAIT'].value_counts()#.to_csv('MAPPED_TRAIT_value_count.csv')

# 2. Filter GWAS dataset (required)

In [None]:
choose_df = clean_df_gwas # or full df_gwas

# Drop non risk/effect allele: rows whose risk allele contains a literal "?".
# The mask is built from the same frame it filters (it used to come from
# df_gwas, which only worked through implicit index re-alignment), "?" is
# matched literally instead of through the invalid escape "\?", and missing
# alleles count as "no question mark" so the mask stays boolean.
mask = choose_df['STRONGEST SNP-RISK ALLELE'].str.contains('?', regex=False, na=False)
df_gwas_drop = choose_df[~mask].reset_index(drop=True)

# Drop complementary base allele (risk allele freq missing)
df_gwas_drop_nonan = df_gwas_drop.dropna(subset=['RISK ALLELE FREQUENCY'],axis=0).reset_index(drop=True)

print('Drop no risk allele:')
print('SNPs:',len(df_gwas_drop_nonan.SNPS.drop_duplicates()))
print('DISEASE/TRAIT:',len(df_gwas_drop_nonan['DISEASE/TRAIT'].drop_duplicates()))
print('MAPPED_TRAIT:',len(df_gwas_drop_nonan['MAPPED_TRAIT'].drop_duplicates()))

Full dataset:
SNPs: 267372
DISEASE/TRAIT: 21399
MAPPED_TRAIT: 7690

# - Lookup for rsid

In [None]:
# LOOKUP FOR SINGLE RSID
# `str.contains` is a substring match: every row whose SNPS value contains
# this text is selected.
rsid_mask = df_gwas_drop_nonan['SNPS'].str.contains('rs1421085')
df_gwas_drop_nonan_rsid = df_gwas_drop_nonan[rsid_mask]
# Only the last expression of a cell is rendered, so the first two
# expressions below are evaluated without being shown.
df_gwas_drop_nonan_rsid[['MAPPED_GENE','DISEASE/TRAIT','MAPPED_TRAIT','SNP_ID_CURRENT','STRONGEST SNP-RISK ALLELE','RISK ALLELE FREQUENCY']].drop_duplicates()
df_gwas_drop_nonan_rsid.value_counts('DISEASE/TRAIT')
df_gwas_drop_nonan_rsid.value_counts('STRONGEST SNP-RISK ALLELE')

In [None]:
# Display and to clipboard
# Distinct gene / trait / SNP / risk-allele / frequency rows of the filtered GWAS table
# (the clipboard export is commented out).
df_gwas_drop_nonan[['MAPPED_GENE', 'DISEASE/TRAIT', 'SNPS','STRONGEST SNP-RISK ALLELE', 'RISK ALLELE FREQUENCY']].drop_duplicates()#to_clipboard()

In [None]:
# Same column selection, restricted to the rows found by the single-rsid lookup.
df_gwas_drop_nonan_rsid[['MAPPED_GENE', 'DISEASE/TRAIT', 'SNPS','STRONGEST SNP-RISK ALLELE', 'RISK ALLELE FREQUENCY']].drop_duplicates()#.to_clipboard()

# 3. Merge GWAS and GRPMX data

## 1. choose and load GRPM survey to merge

In [None]:
# choose db
db_tag = 'pcg'
# pcg    = protein coding genes = grpm_db_pcg
# rna    = rna genes            = grpm_db_rna
# pseudo = pseudogenes          = grpm_db_pseudo
#-------------------------------------------------

survey_path = 'grpm_surveys/'

# Collect the name of every folder below the survey directory whose name
# contains the string 'survey'.
current_dir = os.path.join(os.getcwd(), survey_path)
folder_names = [
    dir_name
    for _root, dirs, _files in os.walk(current_dir)
    for dir_name in dirs
    if 'survey' in dir_name
]

# Create a pandas Series from the list of folder names. The explicit object
# dtype keeps the `.str` accessor usable when no survey folder is found
# (an untyped empty Series is not string-like in older pandas).
folder_series = pd.Series(folder_names, dtype=object)
print('Available survey repositories:\n')
# Strip the common prefix literally so that only the survey tag remains.
folder_series = folder_series.str.replace('grpm_survey_'+db_tag+'_','', regex=False)
# Print the resulting Series
print(folder_series)

In [None]:
# load my GRPMx Data from survey folder
tag = 'nutri'
# Build the folder name from the database chosen in the previous cell instead
# of a hard-coded 'pcg', so changing db_tag actually selects another survey.
directory = survey_path+'grpm_survey_'+db_tag+'_'+tag
df_grpmx = pd.read_csv(directory+'/grpmx_filtered_output.csv', index_col=0)
df_grpmx_repo = pd.read_csv(directory+'/GRPMX_report_int.csv')

# add mesh synonyms (inner join: MeSH terms missing from the reference table are dropped)
mesh_df = pd.read_csv('ref-mesh-archive/MESH_STY_LITVAR1.csv')[['Preferred Label', 'Synonyms']]
df_grpmx = pd.merge(df_grpmx,
                            mesh_df, left_on='mesh',right_on='Preferred Label')
mesh_df = None  # release the reference table

df_grpmx = df_grpmx.drop('Preferred Label', axis = 1)
# all_mesh = preferred label followed by its synonyms. When a term has no
# synonyms keep the label alone; plain concatenation would yield NaN and the
# term would later be lost from the lexical matching.
synonyms = df_grpmx['Synonyms']
df_grpmx['all_mesh'] = df_grpmx['mesh'].where(synonyms.isna(), df_grpmx['mesh'] + ', ' + synonyms)

#def function: filter for int threshold:
def filter_int(df_repo, threshold, df=None):
    """Keep the rows of a GRPMx table whose gene reaches an interest threshold.

    Parameters
    ----------
    df_repo : pandas.DataFrame
        Report table with at least the columns ``gene`` and ``interest_index``.
    threshold : float
        Minimum ``interest_index`` (inclusive) a gene needs to be kept.
    df : pandas.DataFrame, optional
        Table to filter; must have a ``gene`` column. Defaults to the
        notebook-level ``df_grpmx``, which is what the function always used
        before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose gene is among the selected genes.
    """
    if df is None:
        df = df_grpmx
    selected_genes = df_repo.loc[df_repo.interest_index >= threshold, 'gene']
    return df[df.gene.isin(selected_genes)]

#filter for 0.95 quantile
quantile_95 = df_grpmx_repo.interest_index.quantile(0.95)
df_grpmx_95 = filter_int(df_grpmx_repo, threshold=quantile_95)

#filter for int threshold:
df_grpmx_int = filter_int(df_grpmx_repo, threshold=0.0125)

# Number of genes surviving each filter.
for frame_label, frame in (('df_grpmx_95 genes:', df_grpmx_95),
                           ('df_grpmx_int genes:', df_grpmx_int)):
    print(frame_label, frame.gene.nunique())

df_grpmx_int

## 2. set Gene Interest threshold

In [None]:
#threshold
#df_grpmx_th = df_grpmx_95
# Recompute the threshold-filtered table here instead of reading it from
# df_grpmx_int: that name is overwritten below, so on a second run of this
# cell the old code fed the already-merged frame back in and produced
# duplicated interest_index columns.
df_grpmx_th = filter_int(df_grpmx_repo, threshold=0.0125)

## ADD gene-interest index as common sorting handle
small_dummy = df_grpmx_repo[['gene','interest_index']]
# From here on df_grpmx_int is the COMPLETE survey plus interest_index, and
# df_grpmx_th_int is the threshold-filtered survey plus interest_index.
df_grpmx_int =    pd.merge(df_grpmx,    small_dummy, on='gene')
df_grpmx_th_int = pd.merge(df_grpmx_th, small_dummy, on='gene')

print('GRPMX threshold Statistics:')
print('genes:', df_grpmx_th.gene.nunique())
print('rsid:',  df_grpmx_th .rsid.nunique())
print('mesh:',  df_grpmx_th .mesh.nunique())

df_grpmx_int
df_grpmx_th_int

## 3. merge gwas_df with grpmx dataset

### - complete grpmx merge

In [None]:
# GWAS table used for the complete (non-dropped) merge in this cell.
merge_with = clean_df_gwas # or full df_gwas

#------------------------------------------

#merge_with = df_gwas # or full df_gwas
def typestr(df):
    """Convert the ``pmids`` and ``PUBMEDID`` columns of ``df`` to strings, in place."""
    id_columns = ['pmids', 'PUBMEDID']
    df[id_columns] = df[id_columns].astype(str)

# sort grpmx genes by interest index:

#common handle sort
# Descending sort on interest index, then rsid, then mesh.
df_grpmx_int = df_grpmx_int.sort_values(by=['interest_index','rsid','mesh'], ascending =False).reset_index(drop=True)

timea = datetime.now()
print('merging data, please wait... ')
# Merge two df on rsid:
# Inner join of the survey with the chosen GWAS table (rsid == SNPS).
df_merged = pd.merge(df_grpmx_int,
                     merge_with, left_on='rsid', right_on='SNPS')
typestr(df_merged)

# Same join against the GWAS table filtered in section 2
# (unknown risk alleles and missing risk-allele frequencies removed).
df_merged_drop = pd.merge(df_grpmx_int,
                          df_gwas_drop_nonan, left_on='rsid', right_on='SNPS')
typestr(df_merged_drop)

#rename columns:
def rename_col(df):
    """Rename the GRPM columns of ``df`` in place to their LITVAR/PUBMED names.

    The frame is modified directly and the function returns None.
    """
    new_names = {
        'gene': 'LITVAR_GENE',
        'rsid': 'LIVAR_RSID',
        'pmids': 'LITVAR_PMID',
        'mesh': 'PUBMED_MESH',
    }
    df.rename(columns=new_names, inplace=True)
    return None
# Both merged frames are renamed in place (rename_col returns nothing).
rename_col(df_merged)
rename_col(df_merged_drop)

# Elapsed time since `timea` was taken, just before the merges.
print('runtime:', datetime.now()-timea)
df_merged_drop[['LITVAR_GENE','MAPPED_GENE','PUBMED_MESH','all_mesh','DISEASE/TRAIT','MAPPED_TRAIT']].drop_duplicates()

In [None]:
#stats
print('Complete grpmx merge Stats\n')

# Distinct genes / MeSH terms / rsids in the survey before merging.
print('grpmx_gene',df_grpmx_int.gene.nunique())
print('grpmx_mesh',df_grpmx_int.mesh.nunique())
print('grpmx_rsid',df_grpmx_int.rsid.nunique())

#print('\nnonan',len(df_merged_drop),', full:', len(df_merged))
# Not rendered: only the last expression of the cell is shown.
df_merged_drop[['LITVAR_GENE','LIVAR_RSID','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE']].drop_duplicates()
print('df merged:')
# Distinct values per column after merging with the filtered GWAS table.
print(df_merged_drop[['LITVAR_GENE','LITVAR_PMID','PUBMED_MESH','LIVAR_RSID','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE']].nunique())

df_merged_drop[['LITVAR_GENE','LIVAR_RSID','PUBMED_MESH','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE']].drop_duplicates()

### - threshold grpmx merge
(it's better to apply the threshold downstream; skip this)

In [None]:
# threshold merge on SNPs
# When True, also merge against the GWAS table filtered in section 2.
merge_also_nonan = True

def typestr(df):
    """Convert the ``pmids`` and ``PUBMEDID`` columns of ``df`` to strings, in place."""
    df[['pmids','PUBMEDID']] = df[['pmids','PUBMEDID']].astype(str)

#rename columns:
def rename_col(df):
    """Rename the GRPM columns of ``df`` in place to their LITVAR/PUBMED names."""
    df.rename(columns={'gene':'LITVAR_GENE', 'rsid':'LIVAR_RSID', 'pmids':'LITVAR_PMID','mesh':'PUBMED_MESH'}, inplace=True)

df_grpmx_th_int = df_grpmx_th_int.sort_values(by=['interest_index','rsid','mesh'], ascending =False).reset_index(drop=True)

# Merge two df on rsid:
# The merge with the full GWAS table is needed in both modes, so it is done
# once instead of being repeated in each branch.
df_merged_th = pd.merge(df_grpmx_th_int,
                        df_gwas, left_on='rsid', right_on='SNPS')
typestr(df_merged_th)
rename_col(df_merged_th)

if merge_also_nonan:
    df_merged_th_drop = pd.merge(df_grpmx_th_int,
                                 df_gwas_drop_nonan, left_on='rsid', right_on='SNPS')
    typestr(df_merged_th_drop)
    rename_col(df_merged_th_drop)
    print('genes merged:', df_merged_th_drop.LITVAR_GENE.nunique())
else:
    # The filtered merge was not requested. The old code still renamed and
    # printed it, raising NameError; set it to None so that no stale frame
    # from an earlier run is picked up by later cells.
    df_merged_th_drop = None

df_merged_th_drop

In [None]:
#stats
print('Threshold grpmx merge Statistics:\n')
# Distinct genes / MeSH terms / rsids in the thresholded survey before merging.
print('grpmx_gene',df_grpmx_th_int.gene.nunique())
print('grpmx_mesh',df_grpmx_th_int.mesh.nunique())
print('grpmx_rsid',df_grpmx_th_int.rsid.nunique())

# Distinct PMIDs after merging with the full vs. the filtered GWAS table.
print('\nfull threshold merge pmids:',df_merged_th.LITVAR_PMID.nunique(),
      'threshold pmids:',df_merged_th_drop.LITVAR_PMID.nunique())
#df_merged_95_drop[['gene','rsid','mesh','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE']].drop_duplicates()
print('\ngrpm merged with gwas stats:')
print(df_merged_th_drop[['LITVAR_GENE','LITVAR_PMID','PUBMED_MESH','LIVAR_RSID','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE']].nunique())

In [None]:
import matplotlib.pyplot as plt

def visualize_unique_values(df):
    """Show a bar chart with the number of distinct values in each column of ``df``."""
    plt.figure(figsize=(8,3))
    # One bar per column, height = number of unique values in that column.
    df.nunique().plot(kind='bar')
    plt.title('Number of Unique Values per Column')
    plt.xlabel('Columns')
    plt.ylabel('Unique Values')
    plt.show()
visualize_unique_values(df_merged_th_drop)

In [None]:
# complete grpmx merge stats:

# value counts:
# Rows per GWAS DISEASE/TRAIT in the merge with the non-filtered GWAS table.
print('GWAS TRAIT count in entire GRPM survey')
df_merged.value_counts('DISEASE/TRAIT')

In [None]:
# Rows per GWAS DISEASE/TRAIT in the merge with the filtered GWAS table.
print('GWAS TRAIT count (nonan) in entire grpmx')
df_merged_drop.value_counts('DISEASE/TRAIT')

In [None]:
#df_merged_drop_nonan.to_csv('gwas_catalog_data/df_merged_drop_nonan.csv') # heavy file!
def df_usage(df):
    """Return the total of ``df.memory_usage()`` expressed in MiB (bytes / 1048576)."""
    bytes_per_mib = 1048576
    usage_mib = df.memory_usage() / bytes_per_mib
    return usage_mib.sum()

# Memory footprint (MiB) of the complete merge and of the thresholded, filtered merge.
print(df_usage(df_merged))
print(df_usage(df_merged_th_drop))

## 4. Filter for lexical match

### - create correspondence GWAS-GRPM df
['PUBMED_MESH','DISEASE/TRAIT']

In [None]:
# Distinct gene names on the GRPM side vs. the GWAS side of the merge.
print(df_merged_drop[['LITVAR_GENE','MAPPED_GENE']].nunique())
print('')
# choosing 'DISEASE/TRAIT' or 'MAPPED_TRAIT'
print(df_merged_drop[['DISEASE/TRAIT','MAPPED_TRAIT','PUBMED_MESH' ]].nunique())
# Distinct (DISEASE/TRAIT, MAPPED_TRAIT) pairs.
df_merged_drop[['DISEASE/TRAIT','MAPPED_TRAIT' ]].drop_duplicates()

In [None]:
# create correspondence GWAS-GRPM df
# Pairs built on the extended MeSH text (label + synonyms); rows with a
# missing value are removed after the index has been reset.
mesh_col = 'all_mesh'
all_mesh_pairs = df_merged_drop[[mesh_col, 'MAPPED_TRAIT']].drop_duplicates()
corr_df_all = all_mesh_pairs.reset_index(drop=True).dropna()
print('PUBMED_MESH', corr_df_all[mesh_col].nunique())
print('MAPPED_TRAIT', corr_df_all['MAPPED_TRAIT'].nunique())
print('rows', len(corr_df_all))
#print(corr_df.sort_values(by= mesh_col))

# Pairs built on the preferred MeSH label only (missing values are kept).
mesh_col = 'PUBMED_MESH'
label_pairs = df_merged_drop[[mesh_col, 'MAPPED_TRAIT']].drop_duplicates()
corr_df = label_pairs.reset_index(drop=True)
print('\nPUBMED_MESH', corr_df[mesh_col].nunique())
print('MAPPED_TRAIT', corr_df['MAPPED_TRAIT'].nunique())
print('rows', len(corr_df))

In [None]:
# MeSH / mapped-trait pairs ordered by MeSH label.
corr_df.sort_values(by= 'PUBMED_MESH')

In [None]:
#corr_df[mesh_col].drop_duplicates()
# Display the merged table (the figure in the trailing comment is a note left by the author).
df_merged_drop
#mesh  363

### - build a common Dictionary

In [None]:
mesh_col = 'PUBMED_MESH'
meshes = corr_df.sort_values(by= mesh_col)[mesh_col].drop_duplicates()

# Sample MeSH term used for the trial: the one at position 100 of the sorted
# list when it exists, otherwise the last one. The position used to be
# hard-coded and raised IndexError for surveys with fewer than 101 terms.
if len(meshes) > 0:
    sample_position = min(100, len(meshes) - 1)
    mesh_corr_df = corr_df[corr_df[mesh_col] == meshes.iloc[sample_position]]
else:
    mesh_corr_df = corr_df.iloc[0:0]

# Correspondence through AI  (trial)
if simple_bool('Try Correspondence through AI?'):
    import pychatgpt as op
    mess = "analyze and filter the csv below, creating another csv keeping only the rows where the 'PUBMED_MESH' and 'MAPPED_TRAIT' are the exact same biological entity even though it could be written differently:\n\n"+mesh_corr_df.to_csv(index=None)
    op.ask_gpt(mess)#, maxtoken=1500)

# Show the sample as the cell's last expression; inside the `if` block it was
# never rendered.
mesh_corr_df

### - through Tokenization (nltk)

In [None]:
# Correspondence dictionary through Tokenization:

from nltk.tokenize import word_tokenize

# choose df:
# Input of the lexical filter: the pairs built on the extended MeSH text.
df = corr_df_all
mesh_col = 'all_mesh'

# Disabled sketch kept for reference. The block is never executed, so
# `tokenize_string` is NOT defined by this cell.
if False: #(pseudo-code)
    # Function to tokenize a string into individual words
    def tokenize_string(text):
        return set(word_tokenize(text.lower()))

    # Filter the DataFrame based on the condition that the intersection of tokenized "PUBMED MESH" and "MAPPED_TRAIT" is not empty
    filtered_df = df[df.apply(lambda row: bool(tokenize_string(row[mesh_col]) & tokenize_string(row['MAPPED_TRAIT'])), axis=1)]

    tokenize_string(corr_df[mesh_col][2])
    n =55
    bool(tokenize_string(corr_df[mesh_col][n]) & tokenize_string(corr_df['MAPPED_TRAIT'][n]))
    print(tokenize_string_trial(corr_df[mesh_col][n]))
    print(tokenize_string(corr_df[mesh_col][n]))
#-------------

print('tokenization in progress, please wait...')
# choosen token TAGs:
def tokenize_string_trial(text):
    """Return the set of lower-cased tokens of ``text`` tagged NN, NNS or JJ.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    set of str
        Distinct tokens whose part-of-speech tag is one of the desired tags.
    """
    # Tag the tokens in their original order. The previous version tagged a
    # set, whose iteration order is arbitrary, so the tagger received the
    # words in a different order from one kernel session to the next.
    tokens = word_tokenize(text.lower())
    tagged_tokens = nltk.pos_tag(tokens)
    desired_tags = ['NN', 'NNS', 'JJ']
    return {token for token, pos in tagged_tokens if pos in desired_tags}

def _shares_token(row):
    """True when the MeSH text and the mapped trait of ``row`` have a selected token in common."""
    mesh_tokens = tokenize_string_trial(row[mesh_col])
    trait_tokens = tokenize_string_trial(row['MAPPED_TRAIT'])
    return bool(mesh_tokens & trait_tokens)

timea = datetime.now()
# Keep only the pairs with at least one shared token.
rows_with_match = df.apply(_shares_token, axis=1)
filtered_df = df[rows_with_match]
timeb= datetime.now()
print('runtime:', timeb-timea)

filtered_df

## 5. merging Dictionary to GWAS_GRPM_df

In [None]:
# Join key on both sides of the filtering merge; only the second expression is rendered.
df_merged_drop.all_mesh
filtered_df.all_mesh

In [None]:
# filtering merge:
# Keep only the merged rows whose (all_mesh, MAPPED_TRAIT) pair passed the
# lexical filter.
mesh_col = 'all_mesh'
merge_grpm_gwas_fliter = df_merged_drop.merge(filtered_df, on=[mesh_col,'MAPPED_TRAIT'])

# Columns to report. 'PUBMED_MESH' is listed once: it used to be selected
# twice and the duplicate column then had to be removed in a separate step.
show_columns = ['LITVAR_GENE','LIVAR_RSID', 'LITVAR_PMID', 'PUBMED_MESH', 'Synonyms','interest_index','MAPPED_GENE','DISEASE/TRAIT','MAPPED_TRAIT', 'STRONGEST SNP-RISK ALLELE','P-VALUE', 'OR or BETA']
df_show = merge_grpm_gwas_fliter[show_columns].drop_duplicates().reset_index(drop= True)
df_show

In [None]:
# Save the filtered merge, then read the file back (the result of the read is
# neither stored nor rendered).
df_show.to_csv('gwas_catalog_data/merge_grpm_gwas_fliter_nutri_0725.csv')
pd.read_csv('gwas_catalog_data/merge_grpm_gwas_fliter_nutri_0725.csv')

#statistics
# Distinct values per reported column.
print(df_show[['LITVAR_GENE','LIVAR_RSID','LITVAR_PMID','PUBMED_MESH','MAPPED_TRAIT','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE']].nunique())

### apply GI threshold

In [None]:
# filtering for threshold GI:
# Keep the rows whose gene interest index reaches the threshold.
threshold = 0.0125
df_show_th = df_show[df_show.interest_index >= threshold]

# NOTE(review): the file name says 'th0136' while the threshold applied above is 0.0125 - confirm which is intended.
df_show_th.to_csv('gwas_catalog_data/merge_grpm_gwas_fliter_nutri_th0136.csv')
print('merged GWAS-GRPMX threshold:', threshold,'\n')
print(df_show_th[['LITVAR_GENE','MAPPED_GENE','PUBMED_MESH','MAPPED_TRAIT','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE']].nunique())

# Compact view of the thresholded table; only the last expression of the cell is rendered.
df_thr_short= df_show_th[['LITVAR_GENE','LIVAR_RSID','LITVAR_PMID','PUBMED_MESH','MAPPED_TRAIT','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE']]
df_thr_short#.to_csv(r'file_name.csv')
df_show_th.columns

In [None]:
# One row per gene (first occurrence), returned in random order; no seed is set, so the order changes between runs.
df_thr_short.drop_duplicates(subset='LITVAR_GENE').sample(frac=1)#.to_csv('file_name.csv')

In [None]:
# Only the second expression is rendered.
df_grpmx_int#.gene.nunuque()
df_merged_drop

# 4. Scoping GWAS Dataset (general)

## Define Mother Dataframe:

In [None]:
# Display the merge of the survey with the filtered GWAS table, presented by
# the heading above as the "mother" dataframe.
# NOTE(review): the following cells refer to `df_merged_drop_nonan`, a name not defined in the visible notebook.
df_merged_drop

In [None]:
# LOOKUP FOR SINGLE RSID
# NOTE(review): this cell used `df_merged_drop_nonan` and the column
# 'LIVAR RSID', neither of which exists in this notebook. It now uses
# `df_merged_drop` (the merge with the filtered GWAS table) and the renamed
# column 'LIVAR_RSID' - confirm this is the intended mother dataframe.
rsid_mask = df_merged_drop['LIVAR_RSID'].str.contains('rs1421085')
df_merged_drop_nonan_rsid = df_merged_drop[rsid_mask]

In [None]:
# Display and to clipboard
# The rsid column is called 'LIVAR_RSID' after rename_col; the previous
# spelling 'LIVAR RSID' raised KeyError. Only the last expression is rendered.
lookup_columns = ['LITVAR_GENE','LIVAR_RSID','MAPPED_GENE','PUBMED_MESH', 'DISEASE/TRAIT', 'STRONGEST SNP-RISK ALLELE', 'RISK ALLELE FREQUENCY']
df_merged_drop[lookup_columns].drop_duplicates()
df_merged_drop_nonan_rsid[lookup_columns].drop_duplicates()#.to_clipboard()

In [None]:
# Distinct MeSH terms among the rows found by the single-rsid lookup.
df_merged_drop_nonan_rsid['PUBMED_MESH'].drop_duplicates()

## Get risk Allele list and use it to filter mother table

In [None]:
# Rows per SNP in the mother dataframe.
# NOTE(review): `df_merged_drop_nonan` is not defined in this notebook;
# `df_merged_drop` is used instead - confirm.
df_merged_drop.value_counts('SNPS')

In [None]:
#df_merged_drop_rsid_nonan[['STRONGEST SNP-RISK ALLELE','PUBMED_MESH']].groupby('STRONGEST SNP-RISK ALLELE').describe().reset_index()
# NOTE(review): uses `df_merged_drop` and 'LIVAR_RSID' in place of the
# undefined `df_merged_drop_nonan` / 'LIVAR RSID' - confirm.
rsid_mask = df_merged_drop['LIVAR_RSID'].str.contains('rs1421085')
df_merged_drop_nonan_rsid = df_merged_drop[rsid_mask]
df_merged_drop_nonan_rsid[['LITVAR_GENE','LIVAR_RSID','MAPPED_GENE','PUBMED_MESH', 'DISEASE/TRAIT', 'STRONGEST SNP-RISK ALLELE', 'RISK ALLELE FREQUENCY']].drop_duplicates()
# Rows per reported risk allele for this rsid, most frequent first.
risk_allele = df_merged_drop_nonan_rsid['STRONGEST SNP-RISK ALLELE'].value_counts()#.index[0]
risk_allele

In [None]:
#---> programmatically build the list of all 'risk alleles' by count and use it to filter the mother dataframe with isin

# get all rsid list
# NOTE(review): uses `df_merged_drop` and 'LIVAR_RSID' in place of the
# undefined `df_merged_drop_nonan` / 'LIVAR RSID' - confirm.
rsid_list = df_merged_drop['LIVAR_RSID'].drop_duplicates().to_list()
len(rsid_list)

In [None]:
# risk allele pickup (part1)
time_start = datetime.now()


def _most_frequent_allele(alleles):
    """Most frequent value of ``alleles`` (None when every value is missing)."""
    counts = alleles.value_counts()
    return counts.index[0] if len(counts) else None


# Most frequent risk allele of every rsid, computed in one exact-match pass.
# The previous per-rsid `str.contains` scan was a substring match (one rsid
# also selects any longer rsid that contains it) and rescanned the whole
# table for each rsid.
# NOTE(review): uses `df_merged_drop` and 'LIVAR_RSID' in place of the
# undefined `df_merged_drop_nonan` / 'LIVAR RSID' - confirm.
top_allele_by_rsid = (
    df_merged_drop
    .groupby('LIVAR_RSID')['STRONGEST SNP-RISK ALLELE']
    .agg(_most_frequent_allele)
)
risk_allele_list = top_allele_by_rsid.reindex(rsid_list[:2000]).to_list()
finish_start = datetime.now()
pd.Series(risk_allele_list).to_csv('gwas_catalog_data/risk_allele_list_0-2000.csv')

In [None]:
# risk allele pickup (part2)
# Continues the list started in part 1 with the rsids from position 2000 on.


def _most_frequent_allele(alleles):
    """Most frequent value of ``alleles`` (None when every value is missing)."""
    counts = alleles.value_counts()
    return counts.index[0] if len(counts) else None


# Exact-match groupby instead of a per-rsid substring scan.
# NOTE(review): uses `df_merged_drop` and 'LIVAR_RSID' in place of the
# undefined `df_merged_drop_nonan` / 'LIVAR RSID' - confirm.
top_allele_by_rsid = (
    df_merged_drop
    .groupby('LIVAR_RSID')['STRONGEST SNP-RISK ALLELE']
    .agg(_most_frequent_allele)
)
risk_allele_list.extend(top_allele_by_rsid.reindex(rsid_list[2000:]).to_list())
finish_start = datetime.now()
print(finish_start - time_start)
# NOTE(review): this overwrites the part-1 file with the complete list, while
# the import cell reads 'risk_allele_list_4376.csv' - confirm the file name.
pd.Series(risk_allele_list).to_csv('gwas_catalog_data/risk_allele_list_0-2000.csv')

In [None]:
# import back risk allele list

# NOTE(review): no cell in this notebook writes 'risk_allele_list_4376.csv';
# presumably it is a renamed copy of the list saved by the pickup cells - confirm.
risk_allele_df = pd.read_csv('gwas_catalog_data/risk_allele_list_4376.csv', index_col=0)
risk_allele_list = risk_allele_df['0'].to_list()
risk_allele_list
# --> next: filter the mother dataframe by these risk alleles
risk_allele_df

In [None]:
# build the GWAS-only view of the merged table (GRPMX columns dropped)
#df_merged_drop_nonan = pd.read_csv('df_merged_drop_nonan.csv', index_col=0)

# NOTE(review): `df_merged_drop_nonan` is not defined in this notebook;
# `df_merged_drop` is used instead - confirm.
df_merged_drop_less_gwa = df_merged_drop[['MAPPED_GENE','SNPS', 'DISEASE/TRAIT', 'STRONGEST SNP-RISK ALLELE', 'RISK ALLELE FREQUENCY']].drop_duplicates()
#df_merged_drop_less_gwa.to_csv('df_merged_drop_less_gwa.csv')#.SNPS.drop_duplicates()
# Random peek at up to 10 distinct SNPs (sampling 10 from fewer values raised ValueError).
unique_snps = df_merged_drop_less_gwa.SNPS.drop_duplicates()
unique_snps.sample(min(10, len(unique_snps)))

In [None]:
# The type() result is not rendered; only the last expression of the cell is shown.
type(df_merged_drop_nonan_rsid['STRONGEST SNP-RISK ALLELE'].value_counts())#.head(1))
# Rows per risk allele for the rsid selected by the lookup cell.
risk_allele = df_merged_drop_nonan_rsid['STRONGEST SNP-RISK ALLELE'].value_counts()#.index[0]
risk_allele

In [None]:
# which SNPs are mono-risk (a single distinct risk allele)?
rsid_list = df_merged_drop_less_gwa.SNPS.drop_duplicates().to_list()
time_start = datetime.now()
monorisk_list = []

# One pass over exact SNP groups, visited in order of first appearance (the
# same order as rsid_list). The previous per-SNP `str.contains` was a regex
# substring match, so one SNP id could also select rows of longer ids that
# contain it.
for snp, snp_rows in df_merged_drop_less_gwa.groupby('SNPS', sort=False):
    risk_allele = snp_rows.drop_duplicates()['STRONGEST SNP-RISK ALLELE'].value_counts()#.index[0]
    if len(risk_allele) == 1:
        monorisk_list.append(risk_allele.index[0])
        print(risk_allele)

In [None]:
#print(risk_allele)
# Risk alleles of the SNPs that have a single distinct risk allele.
pd.Series(monorisk_list)#.to_csv('gwas_catalog_data/monorisk_list.csv')
#len(rsid_list)

In [None]:
# risk allele pickup: search for ambiguous (tied) counts ------------------

rsid_list = df_merged_drop_less_gwa.SNPS.drop_duplicates().to_list()
time_start = datetime.now()
ambiguity_list = []

# A SNP is ambiguous when its two most frequent risk alleles are tied.
# Groups are exact SNP matches (not regex substring matches) and the counts
# are read by position with .iloc: `risk_allele[0]` on an allele-labelled
# Series relied on the deprecated positional fallback of label indexing.
for snp, snp_rows in df_merged_drop_less_gwa.groupby('SNPS', sort=False):
    risk_allele = snp_rows.drop_duplicates()['STRONGEST SNP-RISK ALLELE'].value_counts()#.index[0]
    if len(risk_allele) > 1 and risk_allele.iloc[0] == risk_allele.iloc[1]:
        ambiguity_list.append(risk_allele.index[0])
        print(risk_allele)

# Counts of the last SNP visited.
print(risk_allele)
ambiguity_list

In [None]:
# Same tie search as the previous cell, this time recording
# (top risk allele, tied count) pairs and collecting them in a DataFrame
# with columns 0 (allele) and 1 (count).
ambiguity_list = []
# Exact SNP groups instead of per-SNP regex substring matching; counts are
# read by position with .iloc instead of the deprecated `risk_allele[0]`.
for snp, snp_rows in df_merged_drop_less_gwa.groupby('SNPS', sort=False):
    risk_allele = snp_rows.drop_duplicates()['STRONGEST SNP-RISK ALLELE'].value_counts()#.index[0]
    if len(risk_allele) > 1 and risk_allele.iloc[0] == risk_allele.iloc[1]:
        ambiguity_list.append((risk_allele.index[0], risk_allele.iloc[0]))
        #print(risk_allele)
ambiguity_df = pd.DataFrame(ambiguity_list)

In [None]:
#ambiguity_df.to_csv('gwas_catalog_data/ambiguity_magg1_df.csv')
# NOTE(review): 'ambiguity_1_list.csv' is not written anywhere in this notebook - confirm where it comes from.
pd.read_csv('gwas_catalog_data/ambiguity_1_list.csv')

In [None]:
# (allele, tied count) pairs collected above.
ambiguity_df

In [None]:
# Summary of the ambiguous alleles grouped by tied count (column 1), copied to the system clipboard.
ambiguity_df.groupby(by=1).describe().to_clipboard()

## remove ambiguity

In [None]:
# Inspection cell: only the last expression (the type of the first column) is rendered.
risk_allele_df#[0]
df_merged_drop_less_gwa
type(risk_allele_df.iloc[:,0])

In [None]:
# select just ambiguous count > 1
# NOTE(review): the cell that would write this file is commented out, and the
# frame built above has columns 0 and 1 rather than `rsid` - presumably the
# file was prepared outside the notebook; confirm.
ambiguous_magg1 = pd.read_csv('gwas_catalog_data/ambiguity_magg1_df.csv', index_col=0)
# Only GWAS
# Rows of the GWAS-only view whose SNP is listed in the file's `rsid` column.
mask = df_merged_drop_less_gwa['SNPS'].isin(ambiguous_magg1.rsid)
df_merged_drop_less_gwa_ambmagg1 = df_merged_drop_less_gwa[mask]
df_merged_drop_less_gwa_ambmagg1#.to_csv('gwas_catalog_data/df_merged_drop_less_gwa_ambmagg1.csv')

In [None]:
#GRPMX-GWAS
# Same selection on the full merged table.
# NOTE(review): `df_merged_drop_nonan` is not defined in this notebook;
# `df_merged_drop` is used instead - confirm.
mask = df_merged_drop['SNPS'].isin(ambiguous_magg1.rsid)
df_merged_drop_nonan_ambmagg1 = df_merged_drop[mask]
df_merged_drop_nonan_ambmagg1[['LITVAR_GENE','PUBMED_MESH','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE','RISK ALLELE FREQUENCY','MAPPED_GENE']]#.to_csv('gwas_catalog_data/df_merged_drop_less_gwa_ambmagg1.csv')

In [None]:
# NOTE(review): `df_merged_drop_nonan` is not defined in this notebook;
# `df_merged_drop` is used instead - confirm.
df_merged_drop[['LITVAR_GENE','PUBMED_MESH','DISEASE/TRAIT','STRONGEST SNP-RISK ALLELE','RISK ALLELE FREQUENCY','MAPPED_GENE']]
# how many are mono-risk alleles?

In [None]:
# first filter the mother df with the risk allele list

# Only GWAS
# Keep the rows whose risk allele appears in the first column of the imported list.
mask = df_merged_drop_less_gwa['STRONGEST SNP-RISK ALLELE'].isin(risk_allele_df.iloc[:,0])
df_merged_drop_less_gwa_riskall = df_merged_drop_less_gwa[mask]
#df_merged_drop_less_gwa_riskall.to_csv('gwas_catalog_data/df_merged_drop_less_gwa_riskall.csv')
df_merged_drop_less_gwa_riskall

In [None]:
# GRPMX-GWAS merged
# Keep the rows whose risk allele appears in the imported list, then save them.
# NOTE(review): `df_merged_drop_nonan` is not defined in this notebook;
# `df_merged_drop` is used instead - confirm.
mask2 = df_merged_drop['STRONGEST SNP-RISK ALLELE'].isin(risk_allele_df.iloc[:,0])
df_merged_drop_nonan_riskall = df_merged_drop[mask2]
df_merged_drop_nonan_riskall.to_csv('gwas_catalog_data/df_merged_drop_nonan_riskall.csv')

In [None]:
# then remove ambiguous rsids

# GRPMX-GWAS merged
# NOTE(review): 'ambiguous_rsids.csv' is not written anywhere in this notebook - confirm where it comes from.
ambiguous_rsids = pd.read_csv('gwas_catalog_data/ambiguous_rsids.csv')
mask_amb = df_merged_drop_nonan_riskall.SNPS.isin(ambiguous_rsids['ambiguous_rsids'])
# `~` is the boolean negation for masks; unary minus only works on booleans
# through a pandas special case.
df_merged_drop_nonan_riskall_unamb = df_merged_drop_nonan_riskall[~mask_amb]

In [None]:
# Column names of the unambiguous merged table.
df_merged_drop_nonan_riskall_unamb.columns

In [None]:
# Copy rows 300-324 of the unambiguous merged table to the clipboard.
# The rsid / PMID columns are called 'LIVAR_RSID' and 'LITVAR_PMID' after
# rename_col; the previous spellings with a space raised KeyError.
df_merged_drop_nonan_riskall_unamb[['LITVAR_GENE', 'LIVAR_RSID', 'LITVAR_PMID', 'PUBMED_MESH', 'PUBMEDID', 'DISEASE/TRAIT','MAPPED_GENE','STRONGEST SNP-RISK ALLELE', 'SNPS','RISK ALLELE FREQUENCY', 'P-VALUE','OR or BETA']][300:325].to_clipboard(sep=',')

In [None]:
# Only GWAS
# Drop the SNPs listed as ambiguous and save the result.
mask_amb = df_merged_drop_less_gwa_riskall.SNPS.isin(ambiguous_rsids['ambiguous_rsids'])
# `~` is the boolean negation for masks (replaces unary minus).
df_merged_drop_less_gwa_riskall_unamb = df_merged_drop_less_gwa_riskall[~mask_amb]
df_merged_drop_less_gwa_riskall_unamb.to_csv('gwas_catalog_data/df_merged_drop_less_gwa_unambiguous.csv')

In [None]:
# Copy rows 300-324 of the unambiguous GWAS-only table to the clipboard.
df_merged_drop_less_gwa_riskall_unamb[300:325].to_clipboard(sep=',')

In [None]:
# Distinct SNPs kept after removing ambiguity vs. distinct SNPs in the mother
# dataframe (not rendered: the cell ends with an assignment).
# NOTE(review): `df_merged_drop_nonan` is not defined in this notebook;
# `df_merged_drop` is used instead - confirm.
len(df_merged_drop_less_gwa_riskall_unamb.SNPS.drop_duplicates()), len(df_merged_drop.SNPS.drop_duplicates())
unambiguous_rsids = df_merged_drop_less_gwa_riskall_unamb.SNPS.drop_duplicates().reset_index(drop=True)
#unambiguous_rsids.to_csv('gwas_catalog_data/unambiguous_rsids.csv')
#df_merged_drop_nonan_unamb.to_csv('gwas_catalog_data/df_merged_drop_nonan_unambiguous.csv')

In [None]:
# NOTE(review): `df_merged_drop_less_gwa_unamb` is never defined; the
# unambiguous GWAS-only table built above is called
# `df_merged_drop_less_gwa_riskall_unamb` - confirm this is the one meant.
df_merged_drop_less_gwa_riskall_unamb

In [None]:
# LOOKUP FOR SINGLE RSID
# Substring match on SNPS; the selected rows are copied to the system clipboard.
rsid_mask = df_merged_drop_less_gwa['SNPS'].str.contains('rs1558902')
df_merged_drop_less_gwa_rsid = df_merged_drop_less_gwa[rsid_mask]
df_merged_drop_less_gwa_rsid.to_clipboard()

In [None]:
# LOOKUP FOR SINGLE ALLELE
# Among the rows of the single-rsid lookup, keep those whose risk allele contains '-C'.
allele_mask = df_merged_drop_nonan_rsid['STRONGEST SNP-RISK ALLELE'].str.contains('-C')
df_merged_drop_rsid_allele = df_merged_drop_nonan_rsid[allele_mask]
# The rsid column is called 'LIVAR_RSID' after rename_col; 'LIVAR RSID' raised KeyError.
df_merged_drop_rsid_allele[['LITVAR_GENE','LIVAR_RSID','MAPPED_GENE','PUBMED_MESH', 'DISEASE/TRAIT', 'STRONGEST SNP-RISK ALLELE', 'RISK ALLELE FREQUENCY']].drop_duplicates()

# Trials

## Tokenizing (Trial)

In [None]:
# Tokenizing (Trial)
import nltk
from nltk.tokenize import word_tokenize


def tokenize_string(text):
    """Return the set of lower-cased tokens of ``text``.

    Defined here because the only other definition in the notebook sits in a
    disabled `if False:` block, so the call at the end of this cell raised
    NameError.
    """
    return set(word_tokenize(text.lower()))


# NOTE(review): this redefinition adds 'VB' to the desired tags and replaces
# the function used by the lexical filter for the rest of the session.
def tokenize_string_trial(text):
    """Return the set of lower-cased tokens of ``text`` tagged NN, NNS, JJ or VB."""
    # Tag the tokens in sentence order; tagging a set would hand the words to
    # the tagger in an arbitrary order.
    tokens = word_tokenize(text.lower())
    tagged_tokens = nltk.pos_tag(tokens)
    desired_tags = ['NN', 'NNS', 'JJ','VB']
    return {token for token, pos in tagged_tokens if pos in desired_tags}


sentence = """The adipokines, or adipocytokines (Greek adipo-, fat; cytos-, cell; and -kinos, movement) are cytokines (cell signaling proteins) secreted by adipose tissue. Some contribute to an obesity-related low-grade state of inflammation or to the development of metabolic syndrome, a constellation of diseases including, but not limited to, type 2 diabetes, cardiovascular disease and atherosclerosis.[1] The first adipokine to be discovered was leptin in 1994.[2] Since that time, hundreds of adipokines have been discovered.[3]"""
# Tokenize the sample sentence (an earlier assignment from corr_df was
# immediately overwritten by this one and has been removed).
tokens = nltk.word_tokenize(sentence)

#nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag(tokens)
# Column 0 = token, column 1 = tag.
tagg= pd.DataFrame(tagged[0:])

#print(tagg.groupby(by=1 ).describe())
tagg[tagg[1]== 'VBD']
print(type(tagged))

print('\n',tokenize_string_trial(sentence))
print('\n',tokenize_string(sentence))
#tagg[1].drop_duplicates().to_list()

In [None]:
#!pip install svgling
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
# Named-entity chunking of the tagged sample sentence from the previous cell.
# NOTE(review): presumably requires the resources in the commented-out
# download lines above - confirm before running on a fresh environment.
entities = nltk.chunk.ne_chunk(tagged)
type(entities)
entities