In [None]:
import json
import pandas as pd
import numpy as np
import spacy
from unidecode import unidecode
import re
import matplotlib.pyplot as plt
from collections import OrderedDict

## **1. Tareas de Adquisición de Data**
---
Con ayuda de la API PubTator 3 (https://www.ncbi.nlm.nih.gov/research/pubtator3/) se puede extraer una colección de textos en formato JSON. Estos textos corresponden a los resúmenes (abstracts) de los artículos de interés. En esta etapa también se extraen ciertas características de cada texto: los segmentos de palabras anotados con las categorías Disease y Species y, gracias a una lista (bolsa de palabras) elaborada previamente, los términos relacionados con péptidos que aparecen en cada resumen (abstract).

De los archivos .json se puede extraer la siguiente información relevante:

*   id
*   abstract
*   disease
*   specie
*   peptide

In [None]:
# Name of the spaCy pipeline package used for tokenisation.
SPACY_MODEL = "en_core_web_sm"

# The cells visible in this notebook only use the tokenizer output
# (token.text, len(token), token.is_stop), so the trained pipeline components
# are excluded to keep loading and tokenisation fast.
SPACY_EXCLUDED_COMPONENTS = [
    "tok2vec",
    "morphologizer",
    "parser",
    "senter",
    "attribute_ruler",
    "lemmatizer",
    "ner",
]

try:
    nlp = spacy.load(SPACY_MODEL, exclude=SPACY_EXCLUDED_COMPONENTS)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Download it only in that case instead of hitting the network on every
    # run of the notebook.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL, exclude=SPACY_EXCLUDED_COMPONENTS)

In [None]:
import glob

# Collect every JSON file in the working directory. glob returns paths in an
# arbitrary, OS-dependent order, so they are sorted to make the row order of
# the dataframe built from them reproducible across runs and machines.
files = sorted(glob.glob("*.json"))
print(files)

In [None]:
def json_to_list(numb_articles=None, paths=None):
    """Load every record of a set of JSON Lines files into a single list.

    Each non-blank line of each file is parsed as one JSON document.

    Parameters
    ----------
    numb_articles : optional
        Unused. Kept only so that existing calls such as
        ``json_to_list(17900)`` keep working.
    paths : iterable of str, optional
        Files to read. When omitted, the module-level ``files`` list is used.

    Returns
    -------
    list
        The parsed JSON objects, in file order and then line order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if paths is None:
        paths = files

    output_data = []
    for path in paths:
        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # would garble non-ASCII characters. Assumes the exports are UTF-8.
        with open(path, encoding="utf-8") as f:
            for line in f:
                # Blank lines (typically a trailing newline at the end of the
                # file) are not JSON documents and would make json.loads fail.
                if not line.strip():
                    continue
                output_data.append(json.loads(line))
    return output_data

In [None]:
def extract_jsons_to_df(input_data):
    """Build a dataframe with one row per article from parsed JSON records.

    For every record the abstract is read from the second passage
    (``passages[1]``) together with the text of its annotations whose
    ``infons["type"]`` is ``"Disease"`` or ``"Species"``.

    Parameters
    ----------
    input_data : list of dict
        Parsed records. Each one must have an ``"id"`` key.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``abstract`` (str), ``diseases`` (list of str) and
        ``species`` (list of str). A record without a second passage yields an
        empty abstract and empty lists instead of raising ``IndexError``.

    Raises
    ------
    KeyError
        If a record has no ``"id"`` key.
    """
    records = []
    for article in input_data:
        passages = article.get("passages") or []
        # Look the abstract passage up once instead of re-indexing the nested
        # structure for every annotation.
        abstract = passages[1] if len(passages) > 1 else {}

        diseases = []
        species = []
        for annotation in abstract.get("annotations", []):
            annotation_type = annotation.get("infons", {}).get("type")
            if annotation_type == "Disease":
                diseases.append(annotation["text"])
            elif annotation_type == "Species":
                species.append(annotation["text"])

        records.append(
            {
                "id": article["id"],
                "abstract": abstract.get("text", ""),
                "diseases": diseases,
                "species": species,
            }
        )

    # Passing the column list keeps the schema stable even for empty input.
    return pd.DataFrame(records, columns=["id", "abstract", "diseases", "species"])

In [None]:
def preprocess(text, nlp):
    """Normalise a text and keep only its informative words.

    The text is transliterated to ASCII and lowercased, every character that
    is not ``a``-``z`` is replaced by a space, runs of whitespace are
    collapsed, and the result is tokenised with ``nlp``. Tokens of three
    characters or fewer and stop words are discarded.

    Parameters
    ----------
    text : str
        Raw text to clean.
    nlp : callable
        Tokeniser returning tokens that expose ``text``, ``is_stop`` and
        ``len()`` (a spaCy pipeline in this notebook).

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Normalise: ASCII transliteration + lowercase
    lowered = unidecode(text).lower()
    # Replace everything that is not a lowercase letter with a space
    letters_only = re.compile(r"[^a-z]").sub(" ", lowered)
    # Collapse repeated whitespace
    collapsed = re.compile(r"\s{2,}").sub(" ", letters_only)
    # Tokenise, then drop short tokens and stop words
    kept_words = [
        tok.text
        for tok in nlp(collapsed)
        if len(tok) > 3 and not tok.is_stop
    ]
    return " ".join(kept_words).strip()

In [None]:
def filtering_peptides(df, list_p, tokenizer=None):
    """Find, for every abstract, the tokens that are known peptide names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``"abstract"`` column of strings.
    list_p : iterable of str
        Peptide names to look for. Matching is exact and case-sensitive
        against each token's text.
    tokenizer : callable, optional
        Maps a string to an iterable of tokens exposing ``.text``. Defaults to
        the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        One entry per row of ``df``, in row order: the matching tokens joined
        by single spaces (repetitions kept), or ``""`` when none matched.
    """
    tokenize = nlp if tokenizer is None else tokenizer
    # A set gives constant-time membership tests; the original scanned the
    # whole name list for every token.
    wanted = set(list_p)

    filtered_peptides = []
    # Materialise the column once instead of rebuilding the list on every
    # iteration of the loop.
    for text in df["abstract"].to_list():
        matches = (token.text for token in tokenize(text) if token.text in wanted)
        filtered_peptides.append(" ".join(matches))
    return filtered_peptides

In [None]:
# Read the JSON files and turn their records into a dataframe
data = json_to_list(numb_articles=17900)
df_all_articles = extract_jsons_to_df(input_data=data)

In [None]:
# Clean every abstract: lowercase letters only, no stop words, no short tokens
df_all_articles["abstract"] = df_all_articles["abstract"].apply(preprocess, args=(nlp,))
# Vocabulary of peptide names, all lowercase, passed to filtering_peptides
# below to select matching tokens from the preprocessed abstracts.
# Note: 'lactococcin' is listed twice.
list_peptides = ['abaecin', 'acaloleptin', 'acanthaporin', 'acanthoscurrin', 'acidocin', 'acipensin', 'actagardine', 'adepantin', 'adrenomedullin', 'afusinc', 'agelaia', 'alamethicin', 'alarin', 'alloferon', 'allomyrinasin', 'alvinellacin', 'alyteserin', 'amoebapore', 'amolopin', 'amurin', 'amylin', 'amylocyclicin', 'andersonin', 'andricin', 'androctonin', 'androctonus', 'andropin', 'anoplin', 'antapin', 'apidaecin', 'arasin', 'arenicin', 'ascaphin', 'astacidin', 'aurein', 'aureocin', 'baceridin', 'bactenecin', 'bactericidin', 'bacteriocin', 'bactofencin', 'bactridine', 'bactrocerin', 'balteatide', 'batroxicidin','bldesin', 'bombinin', 'bombolitin', 'bovine', 'bradykinin', 'brazzein', 'brevifactin', 'brevinin', 'buforin', 'buthinin', 'butyrivibriocin', 'buwchitin', 'caenacin', 'caerin', 'caerulein', 'callinectin', 'capidermicin', 'capistruin', 'carnobacteriocin', 'carnocin', 'carnocyclin', 'casecidin', 'cathelicidin', 'cecropin', 'centrocin', 'ceratotoxin', 'cerecidin', 'chemerin', 'chensinin', 'chromacin', 'chrombacin', 'chrysophsin', 'cinnamycin', 'circularin', 'circulin', 'citrocin', 'citropin', 'clavanin', 'clavaspirin', 'cliotide', 'coleoptericin', 'colistin', 'coprisin', 'copsin', 'corticostatin', 'crabrolin', 'cremycin', 'crinicepsin', 'crotalicidin', 'crotamine', 'cryptdin', 'cryptonin', 'ctenidin', 'ctriporin', 'cupiennin', 'curvacin', 'curvaticin', 'cyanophlyctin', 'cyanovirin', 'cyclopsychotride', 'cyclosaplin', 'cycloviolacin', 'cycloviolin', 'cypemycin', 'cytolysin', 'dahlein', 'datucin', 'defensin', 'delftibactin', 'dendropsophin', 'dermaseptin', 'dermatoxin', 'dermcidin', 'deserticolin', 'desotamide', 'diapausin', 'diptericin', 'distinctin', 'divercin', 'divergicin', 'dolabellanin', 'dominulin', 'dosotamide', 'drosocin', 'drosomycin', 'duramycin', 'durancin', 'dybowskin', 'enbocin', 'enterocin', 'esculentin', 'fabatin', 'feleucin', 'fengycin', 'formaecin', 'formicin', 'frenatin', 'fusaricidin', 'gaegurin', 'gageostatin', 'gageotetrin', 'galensin', 
'gallerimycin', 'gallidermin', 'gallin', 'gallinacin', 'gambicin', 'garvicin', 'geobacillin', 'ginkbilobin', 'gramicidin', 'griffithsin', 'griselimycin', 'guentherin', 'hadrurin', 'hainanenin', 'halictine', 'haliotisin', 'halocidin', 'halocin', 'hdmolluscidin', 'heliocin', 'heliomicin', 'hepcidin', 'heterin', 'heteroscorpine', 'hinnavin', 'hipposin', 'hiracin', 'hispidalin', 'holosin', 'holothuroidin', 'holotricin', 'hominicin', 'hymenochirin', 'hymo', 'hyphancin', 'hyposin', 'imcroporin', 'indolicidin', 'isracidin', 'ixodidin', 'ixosin', 'jaburetox', 'japonicin', 'jelleine', 'jindongenin', 'jingdongin', 'kalata', 'kaliocin', 'kassinatuerin', 'kassorin', 'kenojeinin', 'labaditin', 'labyrinthopeptin', 'lacrain', 'lactacin', 'lacticin', 'lactocin', 'lactococcin', 'lactococcin', 'lactocyclicin', 'lactoferricin', 'lactolisterin', 'lariatin', 'lasiocepsin', 'lasioglossin', 'lassomycin', 'laterosporulin', 'laticeptin', 'lebocin', 'leucocin', 'leucrocin', 'lichenicidin', 'lichenin', 'lividin', 'locustin', 'longicin', 'longicornsin', 'longipin', 'lucifensin', 'lucilin', 'lugensin', 'lumbricin', 'lunasin', 'lunatusin', 'lycocitin', 'lycotoxin', 'lynronne', 'lysozyme', 'macropin', 'maculatin', 'maeucath', 'magainin', 'marmelittin', 'mastoparan', 'maximin', 'medusin', 'megin', 'melectin', 'melimine', 'melittin', 'mersacidin', 'mesentericin', 'metalnikowin', 'metchnikowin', 'meucin', 'micasin', 'microbisporicin', 'microcin', 'micrococcin', 'microplusin', 'moronecidin', 'mucroporin', 'mundticin', 'muscin', 'mutacin', 'myticalin', 'mytichitin', 'myticin', 'mytilin', 'mytilus', 'mytimacin', 'myxinidin', 'nabaecin', 'naegleriapore', 'neuromacin', 'neurotensin', 'nicomicin', 'nigroain', 'nigrocin', 'nisin', 'nukacin', 'ocellatin', 'odoranain', 'odorranain', 'omwaprin', 'oncorhyncin', 'opiscorpine', 'opistoporin', 'oreoch', 'ovispirin', 'oxyopinin', 'oxysterlin', 'paenibacillin', 'paenibacterin', 'paenicidin', 'palicourein', 'palustrin', 'pandinin', 'panitide', 'pantocin', 
'panurgine', 'panusin', 'papiliocin', 'papilosin', 'parabutoporin', 'paracentrin', 'paralithocin', 'parasin', 'pardaxin', 'parigidin', 'parkerin', 'patellamide', 'pediocin', 'pelophylaxin', 'pelovaterin', 'penaeidin', 'penisin', 'penocin', 'perfrin', 'perinerin', 'persulcatusin', 'phoratoxin', 'phormicin', 'phylloseptin', 'phylloxin', 'piceain', 'pilosulin', 'piscicolin', 'piscidin', 'planosporicin', 'plantaricin', 'plantaricyclin', 'plantazolicin', 'plasticin', 'plectasin', 'pleskein', 'pleurain', 'pleurocidin', 'plicatamide', 'polymyxin', 'ponericin', 'potamin', 'procambarin', 'prolixicin', 'prophenin', 'protegrin', 'protonectin', 'psacotheasin', 'psalmopeotoxin', 'psdefensin', 'pseudhymenochirin', 'pseudin', 'psyle', 'pyrrhocoricin', 'ranacyclin', 'ranalexin', 'ranatuerin', 'raniseptin', 'rattusin', 'regiiialpha', 'retrocyclin', 'rondonin', 'roseocin', 'rugosin', 'ruminococcin', 'sakacin', 'salivaricin', 'salmocidin', 'sapecin', 'sarconesin', 'sarcotoxin', 'scapularisin', 'scarabaecin', 'schmackerin', 'sclerosin', 'scolopendin', 'scolopendrasin', 'scolopin', 'scygonadin', 'secretolytin', 'seminalplasmin', 'senegalin', 'serrulin', 'sesquin', 'shepherin', 'shuchin', 'siamycin', 'signiferin', 'spiderine', 'spiniferin', 'spinigerin', 'staphylococcin', 'stigmurin', 'stomoxyn', 'streptococcin', 'strongylocin', 'styelin', 'sublancin', 'subtilin', 'subtilomycin', 'subtilosin', 'tachycitin', 'tachyplesin', 'tachystatin', 'teixobactin', 'temporin', 'tenecin', 'termicin', 'thermophilin', 'theromacin', 'thrombocidin', 'thuricin', 'tigerinin', 'tolworthcin', 'trichamide', 'tricholongin', 'tridecaptin', 'triintsin', 'tritrpticin', 'turgencin', 'uberolysin', 'ubiquicidin', 'ubonodin', 'uperin', 'urechistachykinin', 'urumin', 'variacin', 'vasostatin', 'vejovine', 'virescein', 'viscotoxin', 'wollamide']
# Peptide names found in each abstract, stored as a new "peptide" column
# (one space-joined string per article)
filtered_peptides = filtering_peptides(df=df_all_articles, list_p=list_peptides)
df_all_articles["peptide"] = filtered_peptides

In [None]:
# Keep only the articles in which at least one peptide was found. The explicit
# copy makes the result an independent dataframe, so the column assignments
# below do not trigger SettingWithCopyWarning.
df_all_articles = df_all_articles[df_all_articles["peptide"] != ""].copy()

# Collapse each list of annotations into one space-separated string. Values
# that are already strings (this cell was executed before) are left untouched:
# joining a string would insert a space between every character.
for column in ("diseases", "species"):
    df_all_articles[column] = df_all_articles[column].apply(
        lambda names: names if isinstance(names, str) else " ".join(names)
    )

In [None]:
def _dedupe_words(text):
    """Return ``text`` with repeated words removed, keeping first-occurrence order."""
    # dict.fromkeys keeps insertion order, giving the same result as
    # sorted(set(words), key=words.index) in linear instead of quadratic time.
    return " ".join(dict.fromkeys(text.split()))


# Apply the same text normalisation used for the abstracts to the annotation
# columns.
for column in ("diseases", "species"):
    df_all_articles[column] = df_all_articles[column].apply(
        lambda value: preprocess(value, nlp)
    )

# Remove names that are repeated within the same article.
for column in ("peptide", "diseases", "species"):
    df_all_articles[column] = df_all_articles[column].apply(_dedupe_words)

In [None]:
# Preview the first five processed articles
display(df_all_articles.head(n=5))