# Query NBIB full dataset of LitVar-publications
Source dataset:  medrxiv DOI: 10.1101/2023.08.04.23293659/nbib_data.csv 

impact factor
https://www.annualreviews.org/action/showPublications

# Import Packages

In [2]:
import os
import io
import sys
import zipfile
import requests
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
if not 'google.colab' in sys.modules:
    import pyperclip

def simple_bool(message):
    choose = input(message+" (y/n): ").lower()
    your_bool = choose in ["y", "yes","yea","sure"]
    return your_bool

def get_file(url, file_name, dir = os.getcwd()):
    url = url
    file_name = file_name
    response = requests.get(url)
    if response.status_code == 200:
        content = response.content
        file_path = os.path.join(dir, file_name)
        with open(file_path, 'wb') as file:
            file.write(content)

def get_and_extract(file, dir = os.getcwd(), ext = '.zip'):
    url='https://zenodo.org/record/8205724/files/'+file+'.zip?download=1'
    zip_file_name = file+ext
    extracted_folder_name = dir
    # Download the ZIP file
    response = requests.get(url)
    if response.status_code == 200:
        # Extract the ZIP contents
        with io.BytesIO(response.content) as zip_buffer:
            with zipfile.ZipFile(zip_buffer, 'r') as zip_ref:
                zip_ref.extractall(extracted_folder_name)
        print(f"ZIP file '{zip_file_name}' extracted to '{extracted_folder_name}' successfully.")
    else:
        print("Failed to download the ZIP file.")

import pychatgpt as op

# Query the LitVar-NBIB Dataset

## Get nbib-data from Zenodo

In [3]:
if simple_bool('Download nbib-data from Zenodo?\n (size: 5GB unpacked; average import time: 7'')'):
    timea = datetime.now()
    get_and_extract('nbib_data')
    print('Download and extraction time ',datetime.now()-timea)

ZIP file 'nbib_data.zip' extracted to 'G:\Altri computer\Razor\Nutrigenetica\Bioinformatics\GRPMX - Razor\GRPMX db - Razor' successfully.
Download and extraction time  0:07:25.321202


## Import full nbib Dataset

In [4]:
# set source dataset:-----------------------
db_tag = 'pcg'
db_name = 'grpm_db_' + db_tag
db_path = 'grpm_dataset/'+db_name

time1 = datetime.now()

# import gene-fullnbib
dummy_nbib = pd.read_csv(db_path+'/complete_nbibtable.csv', index_col=0)
dummy_nbib['pubmed_id'] = dummy_nbib['pubmed_id'].astype(str)
time2 = datetime.now()
print('time import nbib: ', time2-time1)
print(dummy_nbib.memory_usage().sum() / 1024 / 1024, 'MB')

time import nbib:  0:01:30.121373
205.5575942993164 MB


## Define query targets 

### Keywords:

#### Load a keyword set

In [72]:
# use case
url = "https://raw.githubusercontent.com/johndef64/GRPM_system/main/ref-mesh-archive/ref_keywords_genetic-disease-therapy.csv"
get_file(url, 'ref_keywords_genetic-disease-therapy.csv', dir = "ref-mesh-archive\\")

ref_kw = pd.read_csv(r'ref-mesh-archive\ref_keywords_genetic-disease-therapy.csv')
ref_kw

Unnamed: 0.1,Unnamed: 0,keywords,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11,Col12,Col13
0,39360,"ERT, Enzyme Replacement Therapy",2,2,GZMM,1,2,1,32382504,2,2,1,"[{'author': 'Piraud, Monique', 'author_abbrevi...",2.0
1,41118,Enzyme Replacement Therapy,2,2,GAA,1,2,2,27183828,1,2,2,"[{'author': 'Peng, Steven Shinn-Forng', 'autho...",1.0
2,23210,CRISPR-Cas9,188,158,C2,4,188,99,33109263,12,188,100,"[{'author': 'Eguizabal, C', 'author_abbreviate...",12.0
3,23211,CRISPR-Cas9 deletion,2,2,SKAP1,1,2,1,33815481,2,2,1,"[{'author': 'Wang, Wei', 'author_abbreviated':...",2.0
4,23214,CRISPR-Cas9 gene editing,3,3,SYN3,1,3,2,34929159,2,3,2,"[{'author': 'Engel, Abbi L', 'author_abbreviat...",2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,51199,Genomic sequencing,10,10,BRCA2,1,10,2,27077130,9,10,2,"[{'author': 'Jamuar, Saumya Shekhar', 'author_...",9.0
61,78345,Molecular diagnosis,219,209,GLA,3,219,46,28912962,49,219,46,"[{'author': 'Costa, Kárita Antunes', 'author_a...",49.0
62,50658,Genetic diagnosis,123,113,GJB2,2,123,41,31203817,19,123,41,"[{'author': 'Arts, Peer', 'author_abbreviated'...",19.0
63,50659,Genetic diagnosis panel,2,2,PCCA,1,2,1,30186825,2,2,1,"[{'author': 'Wang, Yanyun', 'author_abbreviate...",2.0


In [7]:
# ref_kw cleaner

# List of values to remove
values_to_remove = ['CRISP',
                    'Enzyme',
                    'NGS',
                    'Carrier',
                    'Next generation sequencing',
                    'sequencing',
                    'Sequencing',
                    'Array']

# Remove values from the 'col1' column that contain values_to_remove
ref_kw_less = ref_kw[~ref_kw['keywords'].str.contains('|'.join(values_to_remove))]
ref_kw_less.reset_index(drop=True, inplace=True)
ref_kw_less.keywords.to_csv(r'ref-mesh-archive\ref_keywords_genetic-disease-therapy_clean.csv')

#### explode keywords

In [8]:
import ast #abstract syntax tree
#ast.literal_eval(dummy_nbib.keywords[0])[0]

cols = ['gene', 'pubmed_id','keywords','authors']
print(len(dummy_nbib[cols]), len(dummy_nbib[cols].drop_duplicates()))

small_dummy = dummy_nbib[cols].drop_duplicates()
small_dummy.dropna(subset='keywords', inplace=True)
small_dummy['keywords'] = small_dummy['keywords'].apply(ast.literal_eval)

728185 700659


#### generate GPKA

In [9]:
type(small_dummy.keywords[0])
small_explosion = small_dummy.explode('keywords')
small_explosion

Unnamed: 0,gene,pubmed_id,keywords,authors
0,MT-ND1,29980632,Leber Hereditary Optic Neuropathy (LHON),"[{'author': 'Hirano, Michio', 'author_abbrevia..."
0,MT-ND1,29980632,MELAS syndrome,"[{'author': 'Hirano, Michio', 'author_abbrevia..."
0,MT-ND1,29980632,mitochondrial diseases,"[{'author': 'Hirano, Michio', 'author_abbrevia..."
0,MT-ND1,29980632,mitophagy,"[{'author': 'Hirano, Michio', 'author_abbrevia..."
2,MT-ND1,31996241,Leigh syndrome,"[{'author': 'Schubert Baldo, Manuela', 'author..."
...,...,...,...,...
0,CRISP1,31230945,Interaction promiscuity,"[{'author': 'Liu, Weifeng', 'author_abbreviate..."
0,CRISP1,31230945,T cell costimulation and coinhibition,"[{'author': 'Liu, Weifeng', 'author_abbreviate..."
0,CRISP1,31230945,X-ray structures,"[{'author': 'Liu, Weifeng', 'author_abbreviate..."
0,CRISP1,31230945,immune regulation,"[{'author': 'Liu, Weifeng', 'author_abbreviate..."


In [31]:
if simple_bool('save extracted dataset?'):
    small_explosion.to_csv(r'grpm_dataset\grpm_db_pcg\GPKA_dataset.csv')

In [10]:
timea = datetime.now()
gpka_bykw = small_explosion.groupby('keywords').describe()
print(datetime.now() - timea) # it takes too much time
gpka_bykw

KeyboardInterrupt: 

#### Filtering kw

In [12]:
ref_kw_use = ref_kw_less

small_explosion_filter  = small_explosion[small_explosion.keywords.isin(ref_kw_use.keywords)]
# restet index
small_explosion_filter.reset_index(drop= True, inplace=True)
# convert authors to list
small_explosion_filter['authors'] = small_explosion_filter['authors'].apply(ast.literal_eval)

In [14]:
small_explosion_filter.describe()

Unnamed: 0,gene,pubmed_id,keywords,authors
count,2584,2584,2584,2584
unique,1636,699,36,705
top,CFTR,31872004,Precision medicine,"[{'author': 'Totomoch-Serra, Armando', 'author..."
freq,26,546,720,546


#### explode authors

In [0]:
small_explosion_filter_exploded= small_explosion_filter.explode('authors')
small_explosion_filter_exploded.reset_index(inplace=True, drop=True)
small_explosion_filter_exploded#.describe()

In [20]:
# check author data
small_explosion_filter_exploded.authors[0]

{'author': 'Sundaramurthy, Srilekha',
 'author_abbreviated': 'Sundaramurthy S',
 'affiliations': ['1SN Oil and Natural Gas Corporation (ONGC) Department of Genetics & Molecular Biology, Vision Research Foundation, Chennai, India. srilekhasundar@gmail.com.'],
 'first_name': 'Srilekha',
 'last_name': 'Sundaramurthy'}

In [21]:
# prune authors name
small_explosion_filter_exploded['authors'] = small_explosion_filter_exploded['authors'].apply(lambda x: x['author'])
small_explosion_filter_exploded

Unnamed: 0,gene,pubmed_id,keywords,authors
0,MT-ND1,33185731,Gene therapy,"Sundaramurthy, Srilekha"
1,MT-ND1,33185731,Gene therapy,"SelvaKumar, Ambika"
2,MT-ND1,33185731,Gene therapy,"Ching, Jared"
3,MT-ND1,33185731,Gene therapy,"Dharani, Vidhya"
4,MT-ND1,33185731,Gene therapy,"Sarangapani, Sripriya"
...,...,...,...,...
24064,USH1C,31586696,Gene therapy,"Avraham, Karen B"
24065,CXCL8,31085105,Precision medicine,"Yang, Gee Su"
24066,CXCL8,31085105,Precision medicine,"Barnes, Natalie M"
24067,CXCL8,31085105,Precision medicine,"Lyon, Debra E"


In [62]:
if simple_bool('save it?'):
    directory = r'grpm_surveys\GPKA_surveys'
    if not os.path.exists(directory):
        os.makedirs(directory)
    small_explosion_filter_exploded.to_csv(r"grpm_surveys\GPKA_surveys\genetic-disease-therapy_single-author.csv")

In [52]:
# groupby author
grp2 = small_explosion_filter_exploded.groupby(by='authors').describe()

grp2.columns = grp2.columns.to_flat_index()
grp2.columns

Index([      ('gene', 'count'),      ('gene', 'unique'),
               ('gene', 'top'),        ('gene', 'freq'),
        ('pubmed_id', 'count'), ('pubmed_id', 'unique'),
          ('pubmed_id', 'top'),   ('pubmed_id', 'freq'),
         ('keywords', 'count'),  ('keywords', 'unique'),
           ('keywords', 'top'),    ('keywords', 'freq')],
      dtype='object')

In [63]:
# sort authors by (choose index)
num = 4
grp2_sorted = grp2.sort_values(by=grp2.columns[num], ascending=False)
if simple_bool('save csv?'):
    grp2.to_csv('grpm_surveys\GPKA_surveys\genetic-disease-therapy_single-author_chart.csv')

display(grp2_sorted)

Unnamed: 0_level_0,"(gene, count)","(gene, unique)","(gene, top)","(gene, freq)","(pubmed_id, count)","(pubmed_id, unique)","(pubmed_id, top)","(pubmed_id, freq)","(keywords, count)","(keywords, unique)","(keywords, top)","(keywords, freq)"
authors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"Muñoz, María de Lourdes",546,546,SNTG2,1,546,1,31872004,546,546,1,Genetic markers,546
"Totomoch-Serra, Armando",546,546,SNTG2,1,546,1,31872004,546,546,1,Genetic markers,546
"Escalante, Doris Pinto",546,546,SNTG2,1,546,1,31872004,546,546,1,Genetic markers,546
"Díaz-Badillo, Álvaro",546,546,SNTG2,1,546,1,31872004,546,546,1,Genetic markers,546
"Domínguez-Cruz, Miriam Givisay",546,546,SNTG2,1,546,1,31872004,546,546,1,Genetic markers,546
...,...,...,...,...,...,...,...,...,...,...,...,...
"Winther, Michael D",1,1,VKORC1,1,1,1,29986700,1,1,1,Precision medicine,1
"Russell, Jacquelyn Olivia",1,1,CTNNB1,1,1,1,25457204,1,1,1,Molecular therapy,1
"Sumner, Susan J",1,1,FADS1,1,1,1,27642271,1,1,1,Precision medicine,1
"Sun, Huizhuo",1,1,GAPDH,1,1,1,30670068,1,1,1,Gene therapy,1


#### Add journal

In [181]:
dummy_nbib.columns

Index(['gene', 'pubmed_id', 'citation_owner', 'nlm_status',
       'last_revision_date', 'electronic_issn', 'linking_issn',
       'journal_volume', 'journal_issue', 'publication_date', 'title',
       'abstract', 'authors', 'language', 'grants', 'publication_types',
       'electronic_publication_date', 'place_of_publication',
       'journal_abbreviated', 'journal', 'nlm_journal_id', 'descriptors',
       'pmcid', 'keywords', 'conflict_of_interest', 'received_time',
       'revised_time', 'accepted_time', 'pubmed_time', 'medline_time',
       'entrez_time', 'pii', 'doi', 'publication_status', 'print_issn',
       'pages'],
      dtype='object')

In [64]:
# choose columns
cols = [
    'pubmed_id',
    'journal',
    'journal_abbreviated',
    'nlm_journal_id',
    'publication_date'
]
dummy_pivot = dummy_nbib[cols].drop_duplicates()
dummy_pivot

Unnamed: 0,pubmed_id,journal,journal_abbreviated,nlm_journal_id,publication_date
0,29980632,Essays in biochemistry,Essays Biochem,0043306,2018 Jul 20
1,29133631,Indian journal of ophthalmology,Indian J Ophthalmol,0405376,2017 Nov
2,31996241,Orphanet journal of rare diseases,Orphanet J Rare Dis,101266602,2020 Jan 29
3,34122299,Frontiers in neurology,Front Neurol,101546899,2021
4,33159657,Drugs,Drugs,7600076,2021 Jan
...,...,...,...,...,...
6,27612015,Transfusion,Transfusion,417360,2016 Dec
8,12393480,Blood,Blood,7603509,2003 Jan 15
10,16371048,Transfusion,Transfusion,417360,2005 Dec
11,15660834,Transfusion,Transfusion,417360,2005 Feb


In [65]:
# merge dummy_pivot with
gpkaj = pd.merge(small_explosion_filter_exploded, dummy_pivot, on='pubmed_id')
gpkaj

Unnamed: 0,gene,pubmed_id,keywords,authors,journal,journal_abbreviated,nlm_journal_id,publication_date
0,MT-ND1,33185731,Gene therapy,"Sundaramurthy, Srilekha",Graefe's archive for clinical and experimental...,Graefes Arch Clin Exp Ophthalmol,8205248,2021 Sep
1,MT-ND1,33185731,Gene therapy,"SelvaKumar, Ambika",Graefe's archive for clinical and experimental...,Graefes Arch Clin Exp Ophthalmol,8205248,2021 Sep
2,MT-ND1,33185731,Gene therapy,"Ching, Jared",Graefe's archive for clinical and experimental...,Graefes Arch Clin Exp Ophthalmol,8205248,2021 Sep
3,MT-ND1,33185731,Gene therapy,"Dharani, Vidhya",Graefe's archive for clinical and experimental...,Graefes Arch Clin Exp Ophthalmol,8205248,2021 Sep
4,MT-ND1,33185731,Gene therapy,"Sarangapani, Sripriya",Graefe's archive for clinical and experimental...,Graefes Arch Clin Exp Ophthalmol,8205248,2021 Sep
...,...,...,...,...,...,...,...,...
24392,COL2A1,32894162,Precision medicine,"Agerholm, Jørgen Steen",Acta veterinaria Scandinavica,Acta Vet Scand,0370400,2020 Sep 7
24393,USH1C,31075354,Gene therapy,"Ma, Yutian",Pharmacology & therapeutics,Pharmacol Ther,7905840,2019 Aug
24394,USH1C,31075354,Gene therapy,"Wise, Andrew K",Pharmacology & therapeutics,Pharmacol Ther,7905840,2019 Aug
24395,USH1C,31075354,Gene therapy,"Shepherd, Robert K",Pharmacology & therapeutics,Pharmacol Ther,7905840,2019 Aug


In [66]:

if simple_bool('save it?'):
    #name = input('filename tag?')
    gpkaj.to_csv(r"grpm_surveys\GPKA_surveys\GPKAJ_genetic-disease-therapy.csv")

#### add impact

In [206]:
impact_df = pd.read_csv(r"grpm_dataset\grpm_db_pcg\impact factor_2022.txt")
impact_df.columns

Index(['Sl. No.', 'Journal name', '2022 JIF'], dtype='object')

In [207]:
impact_df

Unnamed: 0,Sl. No.,Journal name,2022 JIF
0,1,CA-A CANCER JOURNAL FOR CLINICIANS,254.7
1,2,LANCET,168.9
2,3,NEW ENGLAND JOURNAL OF MEDICINE,158.5
3,4,JAMA-JOURNAL OF THE AMERICAN MEDICAL ASSOCIATION,120.7
4,5,NATURE REVIEWS DRUG DISCOVERY,120.1
...,...,...,...
9480,9481,Slovenian Veterinary Research,0.1
9481,9482,WEST INDIAN MEDICAL JOURNAL,0.1
9482,9483,ZKG INTERNATIONAL,0.1
9483,9484,By exporting the selected data; you agree to t...,


In [193]:
gpkaj.describe()

Unnamed: 0,gene,pubmed_id,keywords,authors,journal,journal_abbreviated,nlm_journal_id,publication_date
count,24397,24397,24397,24397,24397,24397,24397,24397
unique,1636,699,36,5591,358,358,364,342
top,TP53,31872004,Precision medicine,"Muñoz, María de Lourdes",Data in brief,Data Brief,101654995,2020 Feb
freq,234,4368,6334,546,4841,4841,4841,4848


In [213]:
gpkaj2 = gpkaj.copy()
impact_df['Journal name'] = impact_df['Journal name'].str.lower()
gpkaj2['journal'] = gpkaj2['journal'].str.lower()
gpkajif = pd.merge(gpkaj2, impact_df, left_on='journal', right_on='Journal name')
gpkajif

Unnamed: 0,gene,pubmed_id,keywords,authors,journal,journal_abbreviated,nlm_journal_id,publication_date,Sl. No.,Journal name,2022 JIF
0,MT-ND1,33335957,Gene therapy,"Artika, I Made",genes & diseases,Genes Dis,101635967,2020 Dec,1026,genes & diseases,6.8
1,MT-ND2,33335957,Gene therapy,"Artika, I Made",genes & diseases,Genes Dis,101635967,2020 Dec,1026,genes & diseases,6.8
2,MT-ND4,33335957,Gene therapy,"Artika, I Made",genes & diseases,Genes Dis,101635967,2020 Dec,1026,genes & diseases,6.8
3,MT-ND5,33335957,Gene therapy,"Artika, I Made",genes & diseases,Genes Dis,101635967,2020 Dec,1026,genes & diseases,6.8
4,MT-ND6,33335957,Gene therapy,"Artika, I Made",genes & diseases,Genes Dis,101635967,2020 Dec,1026,genes & diseases,6.8
...,...,...,...,...,...,...,...,...,...,...,...
14549,PCSK9,33867421,Genetic diagnosis,"Hori, Mika",journal of atherosclerosis and thrombosis,J Atheroscler Thromb,9506298,2021 Jul 1,2271,journal of atherosclerosis and thrombosis,4.4
14550,PCSK9,33867421,Genetic diagnosis,"Matsuki, Kota",journal of atherosclerosis and thrombosis,J Atheroscler Thromb,9506298,2021 Jul 1,2271,journal of atherosclerosis and thrombosis,4.4
14551,PCSK9,33867421,Genetic diagnosis,"Minamino, Tetsuo",journal of atherosclerosis and thrombosis,J Atheroscler Thromb,9506298,2021 Jul 1,2271,journal of atherosclerosis and thrombosis,4.4
14552,PCSK9,33867421,Genetic diagnosis,"Yokoyama, Shinji",journal of atherosclerosis and thrombosis,J Atheroscler Thromb,9506298,2021 Jul 1,2271,journal of atherosclerosis and thrombosis,4.4


In [220]:
gpkajif.columns
cols= ['Sl. No.','Journal name']
gpkajif.drop(cols, axis=1, inplace=True)

In [221]:
gpkajif.sort_values(by='2022 JIF',ascending=False).reset_index(drop=True).to_csv(r"G:\Altri computer\Razor\Nutrigenetica\Bioinformatics\GRPMX - Razor\GRPMX db - Razor\grpm_dataset\grpm_db_pcg\GPKAJIF_survey_special_issue.csv")

In [209]:
gpkajif[['gene', 'pubmed_id', 'keywords', 'authors', 'journal',
'journal_abbreviated', 'nlm_journal_id', 'publication_date']].describe()

Unnamed: 0,gene,pubmed_id,keywords,authors,journal,journal_abbreviated,nlm_journal_id,publication_date
count,24397,24397,24397,24397,24397,24397,24397,24397
unique,1636,699,36,5591,358,358,364,342
top,TP53,31872004,Precision medicine,"Muñoz, María de Lourdes",data in brief,Data Brief,101654995,2020 Feb
freq,234,4368,6334,546,4841,4841,4841,4848


In [217]:
gpkaj[['gene', 'pubmed_id', 'keywords', 'authors', 'journal',
         'journal_abbreviated', 'nlm_journal_id', 'publication_date']].describe()

Unnamed: 0,gene,pubmed_id,keywords,authors,journal,journal_abbreviated,nlm_journal_id,publication_date
count,24397,24397,24397,24397,24397,24397,24397,24397
unique,1636,699,36,5591,358,358,364,342
top,TP53,31872004,Precision medicine,"Muñoz, María de Lourdes",Data in brief,Data Brief,101654995,2020 Feb
freq,234,4368,6334,546,4841,4841,4841,4848


In [190]:
type(small_explosion_filter.pubmed_id[0]) #= small_explosion_filter.pubmed_id.astype(str)

str

In [71]:
grp = small_explosion_filter.groupby(by='keywords').describe()
grp.columns = grp.columns.to_flat_index()
grp.sort_values(by=grp.columns[5],ascending=False)#.to_csv('grpm_surveys\GPKA_surveys\genetic-disease-therapy_report.csv')

Unnamed: 0_level_0,"(gene, count)","(gene, unique)","(gene, top)","(gene, freq)","(pubmed_id, count)","(pubmed_id, unique)","(pubmed_id, top)","(pubmed_id, freq)","(authors, count)","(authors, unique)","(authors, top)","(authors, freq)"
keywords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Precision medicine,720,499,TP53,12,720,176,31845553,51,720,181,"[{'author': 'Spiller, Wes', 'author_abbreviate...",51
Gene therapy,212,154,F9,10,212,133,34909657,15,212,133,"[{'author': 'Khatri, Dharmendra Kumar', 'autho...",15
Newborn screening,224,121,SLC22A5,11,224,102,27578510,33,224,103,"[{'author': 'Park, Kyoung Jin', 'author_abbrev...",33
Genetic counseling,123,97,BRCA1,5,123,60,29929473,9,123,60,"[{'author': 'Jia, Shuqin', 'author_abbreviated...",9
Molecular diagnosis,219,209,GLA,3,219,46,28912962,49,219,46,"[{'author': 'Costa, Kárita Antunes', 'author_a...",49
Genetic diagnosis,123,113,GJB2,2,123,41,31203817,19,123,41,"[{'author': 'Arts, Peer', 'author_abbreviated'...",19
Gene editing,51,42,PSEN2,4,51,27,33109263,12,51,27,"[{'author': 'Eguizabal, C', 'author_abbreviate...",12
Genetic markers,647,631,NPC1,3,647,20,31872004,546,647,20,"[{'author': 'Totomoch-Serra, Armando', 'author...",546
Genetic screening,36,33,GJB2,3,36,20,23940833,13,36,20,"[{'author': 'Wagner, Jennifer K', 'author_abbr...",13
Gene Therapy,11,10,IL17A,2,11,10,28670513,2,11,10,"[{'author': 'Zakikhan, Kobra', 'author_abbrevi...",2


In [63]:
grp.columns[5]

('pubmed_id', 'unique')

### Author access point

In [41]:
informations = [
    {
        'author': 'Manickam, Agaath Hedina',
        'author_abbreviated': 'Manickam AH',
        'affiliations': ['Molecular Genetics and Cancer Biology Laboratory, Department of Human Genetics and Molecular Biology, Bharathiar University, Coimbatore, Tami Nadu, India.'],
        'first_name': 'Agaath Hedina',
        'last_name': 'Manickam'
    },
    {
        'author': 'Michael, Minu Jenifer',
        'author_abbreviated': 'Michael MJ',
        'affiliations': ['Molecular Genetics and Cancer Biology Laboratory, Department of Human Genetics and Molecular Biology, Bharathiar University, Coimbatore, Tami Nadu, India.'],
        'first_name': 'Minu Jenifer',
        'last_name': 'Michael'
    },
    {
        'author': 'Ramasamy, Sivasamy',
        'author_abbreviated': 'Ramasamy S',
        'affiliations': ['Department of Human Genetics and Molecular Biology, Bharathiar University, Coimbatore, Tami Nadu, India.'],
        'first_name': 'Sivasamy',
        'last_name': 'Ramasamy'
    }
] # una lista di tre dizionari
type(informations[1])
first_author = informations[0]
print("First Author:")
print("Full Name:", first_author['author'])
print("Abbreviated Name:", first_author['author_abbreviated'])
print("Affiliations:", first_author['affiliations'])
print("First Name:", first_author['first_name'])
print("Last Name:", first_author['last_name'])

First Author:
Full Name: Manickam, Agaath Hedina
Abbreviated Name: Manickam AH
Affiliations: ['Molecular Genetics and Cancer Biology Laboratory, Department of Human Genetics and Molecular Biology, Bharathiar University, Coimbatore, Tami Nadu, India.']
First Name: Agaath Hedina
Last Name: Manickam


In [20]:
x = dummy_nbib.columns[1]

In [11]:
#gene nbib lookup
gene = 'APOA1'

gene_nbib = dummy_nbib.loc[dummy_nbib['gene'] == gene]
gene_nbib['descriptors']#.iloc[0]
gene_nbib

Unnamed: 0,gene,pubmed_id,citation_owner,nlm_status,last_revision_date,electronic_issn,linking_issn,journal_volume,journal_issue,publication_date,...,revised_time,accepted_time,pubmed_time,medline_time,entrez_time,pii,doi,publication_status,print_issn,pages
0,APOA1,33926105,NLM,PubMed-not-MEDLINE,2021-05-18,2077-0383,2077-0383,10,9,2021 Apr 26,...,2021-04-16,2021-04-22,2021-05-01 06:00:00,2021-05-01 06:01:00,2021-04-30 01:34:00,jcm-10-01880,10.3390/jcm10091880,epublish,2077-0383,
1,APOA1,28342064,NLM,MEDLINE,2022-04-08,1861-0692,1861-0684,106,9,2017 Sep,...,,2017-03-14,2017-03-28 06:00:00,2018-07-14 06:00:00,2017-03-26 06:00:00,1106,10.1007/s00392-017-1106-1,ppublish,1861-0684,663-675
2,APOA1,34696795,NLM,MEDLINE,2022-02-18,1476-511X,1476-511X,20,1,2021 Oct 25,...,,2021-09-20,2021-10-27 06:00:00,2022-02-19 06:00:00,2021-10-26 05:34:00,1562,10.1186/s12944-021-01562-1,epublish,,143
3,APOA1,32604774,NLM,MEDLINE,2021-03-31,2073-4409,2073-4409,9,6,2020 Jun 26,...,2020-06-20,2020-06-22,2020-07-02 06:00:00,2021-04-01 06:00:00,2020-07-02 06:00:00,cells-09-01553,10.3390/cells9061553,epublish,,
4,APOA1,30891095,NLM,PubMed-not-MEDLINE,2020-09-28,1754-1611,1754-1611,13,,2019,...,,2018-12-09,2019-03-21 06:00:00,2019-03-21 06:01:00,2019-03-21 06:00:00,130,10.1186/s13036-018-0130-7,epublish,1754-1611,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,APOA1,12950070,NLM,MEDLINE,2006-11-15,,1099-498X,5,9,2003 Sep,...,,,2003-09-02 05:00:00,2004-05-08 05:00:00,2003-09-02 05:00:00,,10.1002/jgm.403,ppublish,1099-498X,795-802
384,APOA1,10858436,NLM,MEDLINE,2021-02-12,,0021-9258,275,35,2000 Sep 1,...,,,2000-06-20 09:00:00,2000-10-07 11:01:00,2000-06-20 09:00:00,S0021-9258(19)61449-3,10.1074/jbc.M002841200,ppublish,0021-9258,26821-7
385,APOA1,15900219,NLM,MEDLINE,2019-11-09,,1744-6872,15,6,2005 Jun,...,,,2005-05-19 09:00:00,2005-12-16 09:00:00,2005-05-19 09:00:00,01213011-200506000-00011,10.1097/01213011-200506000-00011,ppublish,1744-6872,441-6
386,APOA1,3141894,NLM,MEDLINE,2006-11-15,,0031-3998,24,2,1988 Aug,...,,,1988-08-01 00:00:00,1988-08-01 00:01:00,1988-08-01 00:00:00,,10.1203/00006450-198808000-00017,ppublish,0031-3998,222-8


In [None]:
#mask full nbib with filtered pmids for gene query:
mask = gene_nbib.pubmed_id.isin(gene_filtered_grpm.pmids)
gene_nbib_filtered = gene_nbib[mask]
gene_nbib_filtered.abstract#.to_clipboard()

#filter for rsid
gene_filtered_grpm_rsid = gene_filtered_grpm[gene_filtered_grpm.rsid == 'rs2266788']

mask = gene_nbib.pubmed_id.isin(gene_filtered_grpm_rsid.pmids)
gene_nbib_filtered_rsid = gene_nbib[mask]

gene_nbib_filtered.abstract#.to_clipboard()

In [None]:
gene_pmids = list(gene_nbib['pubmed_id'].drop_duplicates())#.to_clipboard(index=False) # --> to VEP http://www.ensembl.org/Tools/VEP
query = " OR ".join(gene_pmids)
pyperclip.copy(query)

## Import GRPM data from Dataset


In [None]:
# IMPORT FULL GRPM DATASET FROM DATABESE----------------------------------
time2 = datetime.now()

#import gene-rsidpmidmesh
pcg_grpm = pd.read_csv('grpm_dataset/grpm_db_pcg/grpm_table_output.csv', index_col=0, dtype={'pmids': str})
rna_grpm = pd.read_csv('grpm_dataset/grpm_db_rna/grpm_table_output.csv', index_col=0, dtype={'pmids': str})
pseudo_grpm = pd.read_csv('grpm_dataset/grpm_db_pseudo/grpm_table_output.csv', index_col=0, dtype={'pmids': str})

print('time import grpm: ', datetime.now()-time2)
print('pcg',pcg_grpm.memory_usage().sum() / 1024 / 1024, 'MB')
print('rna',rna_grpm.memory_usage().sum() / 1024 / 1024, 'MB')
print('pseudo',pseudo_grpm.memory_usage().sum() / 1024 / 1024, 'MB')

In [None]:
#GRPM DB statistics
print('GRPM DB shape:')
print('pcg',pcg_grpm.    shape)
print('rna',rna_grpm.      shape)
print('pseudo',pseudo_grpm.shape)

print('GRPM PCG DB statistics:')
grpmdb_genes = pcg_grpm.gene.nunique()
grpmdb_rsids = pcg_grpm.rsid.nunique()
grpmdb_pmids = pcg_grpm.pmids.nunique()
grpmdb_meshs = pcg_grpm.mesh.nunique()
print(grpmdb_genes,'genes on', len(protein_coding_genes_list),)
print(grpmdb_rsids,'rsid')
print(grpmdb_pmids,'pmid')
print(grpmdb_meshs,'mesh')

print('\nGRPM RNA DB statistics:')
grpmdb_genes = rna_grpm.gene.nunique()
grpmdb_rsids = rna_grpm.rsid.nunique()
grpmdb_pmids = rna_grpm.pmids.nunique()
grpmdb_meshs = rna_grpm.mesh.nunique()
print(grpmdb_genes,'genes on', len(RNA_genes_list),)
print(grpmdb_rsids,'rsid')
print(grpmdb_pmids,'pmid')
print(grpmdb_meshs,'mesh')

print('\nGRPM PSEUDO DB statistics:')
grpmdb_genes = pseudo_grpm.gene.nunique()
grpmdb_rsids = pseudo_grpm.rsid.nunique()
grpmdb_pmids = pseudo_grpm.pmids.nunique()
grpmdb_meshs = pseudo_grpm.mesh.nunique()
print(grpmdb_genes,'genes on', len(pseudo_genes_list),)
print(grpmdb_rsids,'rsid')
print(grpmdb_pmids,'pmid')
print(grpmdb_meshs,'mesh')

In [None]:
#gene grpm lookup
gene_grpm = dummy_grpm.loc[dummy_grpm['gene'] == gene]
gene_grpm = gene_grpm#[['gene', 'rsid', 'pmids', 'mesh', 'qualifier', 'major']]#.drop_duplicates().reset_index(drop=True)

gene_grpm#.head(len(gene_pmidrsidmesh))

#containing word Mesh-LOOKUP
gene_grpm[gene_grpm.mesh.str.contains('poly', case=False)]

## Extra

###  Add study type with Eutils-API

In [None]:
from Bio import Entrez

def get_study_type(pmids):

    Entrez.email = 'your_email@your_domain.com'

    # Retrieve the metadata for the articles
    handle = Entrez.esummary(db='pubmed', id=','.join(pmids), retmode='xml')
    records = Entrez.parse(handle)

    # Extract the article types from the metadata
    study_types = []
    for record in records:
        article_types = record['PubTypeList']
        #print(record['PubTypeList'])
        # Determine the study type based on the article types
        if 'Randomized Controlled Trial' in article_types:
            study_types.append('Randomized Controlled Trial')
        elif 'Controlled Clinical Trial' in article_types:
            study_types.append('Controlled Clinical Trial')
        elif 'Cohort Studies' in article_types:
            study_types.append('Cohort Study')
        elif 'Case-Control Studies' in article_types:
            study_types.append('Case-Control Study')
        elif 'Review' in article_types:
            study_types.append('Review')
        elif 'Clinical Trial' in article_types:
            study_types.append('Clinical Trial')
        elif 'Meta-Analysis' in article_types:
            study_types.append('Meta-Analysis')
        elif 'Multicenter Study' in article_types:
            study_types.append('Multicenter Study')
        # Add additional conditions to handle other study types as needed
        else:
            study_types.append('Unknown')

    return study_types

In [None]:
Entrez.email = 'your_email@your_domain.com'

# Retrieve the metadata for the articles
handle = Entrez.esummary(db='pubmed', id=','.join(['34556834', '26620191', '33006084']), retmode='xml')
records = Entrez.parse(handle)
dfa =pd.DataFrame()
for i in records:
    df = pd.json_normalize(i)
    dfa = pd.concat([dfa,df])

dfa.T

In [None]:
# Extract the article types from the metadata
study_types = []
for record in records:
        article_types = record['PubTypeList']
        print(record['PubTypeList'])
        # Determine the study type based on the article types
        if 'Randomized Controlled Trial' in article_types:
            study_types.append('Randomized Controlled Trial')
        elif 'Controlled Clinical Trial' in article_types:
            study_types.append('Controlled Clinical Trial')
        elif 'Cohort Studies' in article_types:
            study_types.append('Cohort Study')
        elif 'Case-Control Studies' in article_types:
            study_types.append('Case-Control Study')
        elif 'Review' in article_types:
            study_types.append('Review')
        # Add additional conditions to handle other study types as needed
        else:
            study_types.append('Unknown')

In [None]:
get_study_type(['34556834', '31533339', '33006084'])

In [None]:
#it takes time
ftopmids_str = list(map(str, ftopmids))
study_type = get_study_type(ftopmids_str)

In [None]:
pmids_studytype = pd.DataFrame(list(zip(ftopmids_str,study_type)),columns=[gene+'_PMID','study type'])
pmids_studytype_count = pmids_studytype.groupby('study type').describe().reset_index()
pmids_studytype_count.columns = pmids_studytype_count.columns.to_flat_index()
new_column_names = ['study_type', 'pmid-count', 'pmid-unique','pmid-top','pmid-freq']
pmids_studytype_count.columns = new_column_names
pmids_studytype_count


In [None]:
rand = pmids_studytype.loc[pmids_studytype['study type']=='Randomized Controlled Trial']
#rand = pmids_studytype.loc[pmids_studytype['study type']!='Unknown']
#rand = pmids_studytype.loc[pmids_studytype['study type']=='Unknown']
rand_gene_PMID = list(rand[gene+'_PMID'])
#conseqf = conseq.loc[conseq['SYMBOL'] == gene]
len(rand_gene_PMID)

#quert on pubmed:
pyperclip.copy(" OR ".join(list(map(str,rand_gene_PMID))))

In [None]:
#check
rand_FTO_PMID_str = list(map(str, rand.FTO_PMID))
study_type_control = get_study_type(rand_FTO_PMID_str)

In [None]:
#study_type_control

#### Abstract analysis - trial

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
import nltk
from nltk.stem import PorterStemmer

# create a Porter stemmer object
p_stemmer = PorterStemmer()

# list of abstracts
abstracts = ['It has been suggested that Neel s "thrifty genotype" model may account for high body weights in some Oceanic populations, which presumably arose in modern times. In European populations, common variants (rs1421085-C, rs17817449-G, and rs9939609-A) in the fat mass and obesity (FTO associated) were recently found to be associated with body mass index (BMI) or obesity. In this study, we investigated the population frequencies of these variants in six Oceanic populations (Melanesians, Micronesians, and Polynesians) and tested for an association with BMI. Unlike European populations, the Oceanic populations displayed no significant association between the FTO polymorphisms and BMI. These variants were in strong linkage disequilibrium. The population frequencies ranged between 4.2 and 30.3% in the six Oceanic populations, and were similar to those in southeast and east Asian populations. Our study of the FTO polymorphisms has generated no evidence to support the thrifty genotype hypothesis for Oceanic populations.',
             'Variations in the fat-mass and obesity-associated gene (FTO) are associated with the obesity phenotype in many Caucasian populations. This association with the obesity phenotype is not clear in the Japanese. To investigate the relationship between the FTO gene and obesity in the Japanese, we genotyped single nucleotide polymorphisms (SNPs) in the FTO genes from severely obese subjects [n = 927, body mass index (BMI) > or = 30 kg/m2] and normal-weight control subjects (n = 1,527, BMI < 25 kg/m2). A case-control association analysis revealed that 15 SNPs, including rs9939609 and rs1121980, in a linkage disequilibrium (LD) block of approximately 50 kb demonstrated significant associations with obesity',
             'BACKGROUND: Recently, the role of the FTO (fat mass and obesity associated) gene in obesity development was described in Western European, but not in Oceanic, cohorts. The objective of this study was to test the hypothesis that the FTO single nucleotide polymorphism (SNP) is associated with body mass index (BMI) in the Slavic population and to analyze if there could be sex-specific effects of the SNP on BMI, waist-to-hip ratio (WHR), and lipid parameters. METHODS: We analyzed three large population-based samples comprising the post-MONICA study (1191 males, 1368 females) and the 3PMFs study (908 females). RESULTS: FTO rs17817449 SNP was related to BMI in males (p=0.014). In the females from both the post-MONICA and the 3PMFs study, FTO had no effect on BMI. Sub-analysis of females from the 3PMFs study demonstrated that FTO had an effect on BMI in postmenopausal females (p=0.035) but not in premenopausal females (follicle-stimulating hormone <40 U/L was used as marker of premenopausal status). WHR and lipid parameters were not associated with FTO in any of the analyzed groups. CONCLUSIONS: These results suggest that the effect of FTO SNP rs17817449 may be, in some populations at least, restricted to males and postmenopausal females.',
             'Participants analyzed actual and simulated longitudinal data from the Framingham Heart Study for various metabolic and cardiovascular traits. The genetic information incorporated into these investigations ranged from selected single-nucleotide polymorphisms to genome-wide association arrays. Genotypes were incorporated using a broad range of methodological approaches including conditional logistic regression, linear mixed models, generalized estimating equations, linear growth curve estimation, growth modeling, growth mixture modeling, population attributable risk fraction based on survival functions under the proportional hazards models, and multivariate adaptive splines for the analysis of longitudinal data. The specific scientific questions addressed by these different approaches also varied, ranging from a more precise definition of the phenotype, bias reduction in control selection, estimation of effect sizes and genotype associated risk, to direct incorporation of genetic data into longitudinal modeling approaches and the exploration of population heterogeneity with regard to longitudinal trajectories. The group reached several overall conclusions: (1) The additional information provided by longitudinal data may be useful in genetic analyses. (2) The precision of the phenotype definition as well as control selection in nested designs may be improved, especially if traits demonstrate a trend over time or have strong age-of-onset effects. (3) Analyzing genetic data stratified for high-risk subgroups defined by a unique development over time could be useful for the detection of rare mutations in common multifactorial diseases. (4) Estimation of the population impact of genomic risk variants could be more precise. The challenges and computational complexity demanded by genome-wide single-nucleotide polymorphism data were also discussed.',
             'This is the fifth abstract.']

# function to preprocess the abstracts
def preprocess_abstracts(abstracts):
    for abstract in abstracts:
        # tokenize the abstract into individual words
        tokens = simple_preprocess(abstract, deacc=True)
        # remove stop words
        stopped_tokens = [token for token in tokens if not token in STOPWORDS]
        # stem the tokens
        stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
        yield stemmed_tokens

# preprocess the abstracts
processed_abstracts = list(preprocess_abstracts(abstracts))

# create a dictionary from the processed abstracts
dictionary = corpora.Dictionary(processed_abstracts)

# create a corpus from the dictionary and the processed abstracts
corpus = [dictionary.doc2bow(abstract) for abstract in processed_abstracts]

# train an LSI (Latent Semantic Indexing) model on the corpus
lsi_model = gensim.models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=2)

# print the top 5 topics
print(lsi_model.print_topics(num_topics=8, num_words=20))
