In [1]:
!ls ../data

'Associated words.xlsx'     'Text mining word list test 200823.xlsx'
 cordis-h2020projects.xlsx   topics_300_SYinput_LW.csv


In [2]:
import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

## Imports

In [71]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from random import sample

import pickle

from gensim.models.wrappers import ldamallet
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

from scipy.special import rel_entr
from scipy.spatial.distance import jensenshannon

In [4]:
from src.gensim_helper import create_dictionary, get_coherence
from src.process_data import process_data
from src.artefacts_helper import load_mallet_model
from src.predict import get_term_topics, format_term_search_results, get_all_topics, predict_and_format_topics, get_topics_distribution
from src.train import train_lda_mallet

[nltk_data] Downloading package wordnet to /home/kohkb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kohkb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import and Process Associated Words

In [5]:
associated_words_df = pd.read_excel('../data/Associated words.xlsx', header=None)
associated_words_df.head()

Unnamed: 0,0,1,2,3,4,5
0,editing,crispr,cas9,base editing,,
1,cancer,carcinoma,p53,tumorigenesis,,
2,cancer,immunotherapy,leukemia,solid,,
3,microbiome,bacteria,antibiotic,probiotic,,
4,RNA,oligonucleotide,non-coding,,,


In [6]:
associated_words = associated_words_df.values
# drop na
associated_words = [i[pd.notna(i)].tolist() for i in associated_words]

In [7]:
associated_words = process_data([' '.join(i) for i in associated_words])

In [8]:
associated_words[:5]

[['editing', 'crispr', 'cas9', 'base', 'editing'],
 ['cancer', 'carcinoma', 'p53', 'tumorigenesis'],
 ['cancer', 'immunotherapy', 'leukemia', 'solid'],
 ['microbiome', 'bacteria', 'antibiotic', 'probiotic'],
 ['rna', 'oligonucleotide', 'non', 'coding']]

## Validate

* Treat `associated_words` as labels i.e. each row is a topic
* Aim is to see among the topics identified by the topic modelling process:
    * which predicted topic best identifies the labelled topic
    * how closely that predicted topic matches the labelled topic
* two metrics are used to measure difference between topics by treating each topic as a probability distribution:
    1. Jensen-Shannon distance, used in LDAvis __(observed that Jensen-Shannon distance is a more stable metric)__
    1. Kullback-Leibler Divergence

### Unfiltered Model

In [9]:
unfiltered_dictionary = Dictionary.load('../artefacts/dictionary')

In [10]:
unfiltered_model = load_mallet_model(artefacts_path='../artefacts', suffix='300_topics_mallet_alpha_50')

In [11]:
unfiltered_model.mallet_path = '../mallet-2.0.8/bin/mallet'
unfiltered_model.prefix = '../artefacts/mallet_tmp/'

In [12]:
unfiltered_model.show_topic(0)

[('political', 0.07788479348027412),
 ('study', 0.0229672161511391),
 ('social', 0.021578070012965363),
 ('politics', 0.020698277458788664),
 ('discourse', 0.017410631598444155),
 ('analysis', 0.015651046490090758),
 ('movement', 0.015002778292276347),
 ('democracy', 0.013382107797740322),
 ('democratic', 0.012548620114836082),
 ('party', 0.011992961659566587)]

In [13]:
# can get probability distribution of a topic in matrix form
unfiltered_model.get_topics()[0][unfiltered_dictionary.doc2idx([i[0] for i in unfiltered_model.show_topic(0)])]

array([0.07788479, 0.02296722, 0.02157807, 0.02069828, 0.01741063,
       0.01565105, 0.01500278, 0.01338211, 0.01254862, 0.01199296])

In [14]:
# Score every labelled word set against every topic of the unfiltered model.
# The last axis of the result holds (Jensen-Shannon distance, KL divergence).
n_labels = len(associated_words)  # previously hard-coded to 30
unfiltered_topic_array = unfiltered_model.get_topics()
unfiltered_result_array = np.zeros((n_labels, unfiltered_model.num_topics, 2))
unfiltered_label_array = np.zeros((n_labels, len(unfiltered_dictionary.keys())))

# Build one uniform distribution per label over the label's in-vocabulary words.
# Dictionary.doc2idx reports unknown words as -1; used as a column index that
# would silently put probability mass on the *last* vocabulary term, so those
# ids are dropped before the mass is shared out.
# NOTE: the reproducibility totals recorded in this notebook were produced
# without this filtering; they change only if a label word is out of vocabulary.
for label_idx, words in enumerate(associated_words):
    known_ids = [tid for tid in unfiltered_dictionary.doc2idx(set(words)) if tid >= 0]
    if known_ids:
        unfiltered_label_array[label_idx, known_ids] = 1.0 / len(known_ids)

for label_idx in range(n_labels):
    label_dist = unfiltered_label_array[label_idx]
    for topic_idx in range(unfiltered_model.num_topics):
        topic_dist = unfiltered_topic_array[topic_idx]
        unfiltered_result_array[label_idx, topic_idx] = (
            jensenshannon(label_dist, topic_dist),
            rel_entr(label_dist, topic_dist).sum(),
        )

In [15]:
# For each label, keep the topic whose Jensen-Shannon distance (slot 0 of the
# last axis) is smallest, together with that distance.
unfiltered_topics_df = get_all_topics(unfiltered_model).set_index('Topic_Id')
unfiltered_df = (
    unfiltered_topics_df
    .iloc[np.argmin(unfiltered_result_array[..., 0], axis=1)]
    .reset_index()
)
unfiltered_df['divergence_score'] = np.min(unfiltered_result_array[..., 0], axis=1)

In [16]:
unfiltered_df.head()

Unnamed: 0,Topic_Id,Topic_Keywords,divergence_score
0,54,"gene, genetic, genome, expression, mutation, molecular, genomic, crispr, identify, sequencing",0.781123
1,72,"cancer, tumor, tumour, cell, treatment, therapy, patient, breast, therapeutic, metastasis",0.73272
2,72,"cancer, tumor, tumour, cell, treatment, therapy, patient, breast, therapeutic, metastasis",0.720687
3,126,"resistance, antibiotic, bacteria, bacterial, pathogen, infection, resistant, antimicrobial, host, phage",0.736221
4,189,"protein, rna, mrna, function, gene, expression, modification, cell, translation, cellular",0.762226


In [17]:
# for reproducibility. should equal 22.323378320003215
unfiltered_df['divergence_score'].sum()

22.323378320003215

In [42]:
# unfiltered_df.to_csv('../output/df_unfiltered_associated_words.csv', index=False)
# get_all_topics(unfiltered_model).to_csv('../output/unfiltered_300_topics.csv', index=False)

#### Top 10 topics

In [18]:
unfiltered_df.sort_values('divergence_score').head(10)

Unnamed: 0,Topic_Id,Topic_Keywords,divergence_score
12,42,"energy, efficiency, consumption, efficient, saving, renewable, household, sector, demand, consumer",0.646525
24,126,"resistance, antibiotic, bacteria, bacterial, pathogen, infection, resistant, antimicrobial, host, phage",0.651705
17,297,"virus, vaccine, disease, infection, viral, vector, human, hiv, infectious, pathogen",0.671244
8,268,"data, big, analytics, analysis, collection, driven, collected, information, collect, management",0.692078
18,229,"drug, development, discovery, pharmaceutical, compound, screening, target, candidate, molecule, therapeutic",0.70625
28,221,"heart, cardiac, failure, cardiovascular, death, valve, cvd, risk, hf, myocardial",0.714016
29,20,"analysis, method, study, analytical, approach, technique, provide, comprehensive, analyse, methodology",0.717595
2,72,"cancer, tumor, tumour, cell, treatment, therapy, patient, breast, therapeutic, metastasis",0.720687
27,54,"gene, genetic, genome, expression, mutation, molecular, genomic, crispr, identify, sequencing",0.720898
1,72,"cancer, tumor, tumour, cell, treatment, therapy, patient, breast, therapeutic, metastasis",0.73272


#### Worst 10 topics

In [19]:
unfiltered_df.sort_values('divergence_score', ascending=False).head(10)

Unnamed: 0,Topic_Id,Topic_Keywords,divergence_score
25,227,"selection, population, genetic, evolution, variation, trait, fitness, natural, adaptation, parasite",0.81321
16,122,"vision, visual, image, object, eye, computer, world, display, scene, camera",0.795495
13,63,"health, intervention, care, mental, evidence, prevention, risk, outcome, impact, population",0.794194
19,209,"cell, membrane, protein, cellular, lipid, molecular, microscopy, function, biology, vesicle",0.786025
11,266,"life, ageing, people, age, population, aging, related, elderly, older, living",0.784862
21,11,"tissue, cell, organ, vitro, vivo, culture, model, disease, human, kidney",0.783175
0,54,"gene, genetic, genome, expression, mutation, molecular, genomic, crispr, identify, sequencing",0.781123
5,197,"disease, ad, neurodegenerative, al, alzheimer, disorder, therapeutic, model, pathology, dementia",0.776501
15,266,"life, ageing, people, age, population, aging, related, elderly, older, living",0.767273
26,206,"neuron, neuronal, circuit, brain, synaptic, activity, neural, plasticity, function, mouse",0.766088


### Filtered Model

#### Top 3

In [16]:
filtered_dictionary = Dictionary.load('../artefacts/filtered_dictionary')

In [17]:
with open('../artefacts/filtered_docs.pkl', 'rb') as f:
    filtered_docs = pickle.load(f)

print(len(filtered_docs))

19702


In [18]:
filtered_model = load_mallet_model(artefacts_path='../artefacts', suffix='filtered_top_3')

In [19]:
filtered_model.mallet_path = '../mallet-2.0.8/bin/mallet'
filtered_model.prefix = '../artefacts/mallet_tmp/'

In [None]:
# check for reproducibility
get_coherence(filtered_model, filtered_docs, filtered_dictionary)

In [20]:
filtered_model.show_topic(0)

[('wave', 0.0855628796691224),
 ('frequency', 0.08297789841023652),
 ('high', 0.03786997544267804),
 ('based', 0.027788548533023136),
 ('application', 0.0264960579035802),
 ('generation', 0.025332816337081555),
 ('comb', 0.024298823833527206),
 ('dual', 0.022360087889362803),
 ('source', 0.020679850071086983),
 ('propagation', 0.020421351945198396)]

In [21]:
# Score every labelled word set against every topic of the filtered model.
# The last axis of the result holds (Jensen-Shannon distance, KL divergence).
n_labels = len(associated_words)  # previously hard-coded to 30
filtered_topic_array = filtered_model.get_topics()
filtered_result_array = np.zeros((n_labels, filtered_model.num_topics, 2))
filtered_label_array = np.zeros((n_labels, len(filtered_dictionary.keys())))

# Build one uniform distribution per label over the label's in-vocabulary words.
# Dictionary.doc2idx reports unknown words as -1; used as a column index that
# would silently put probability mass on the *last* vocabulary term, so those
# ids are dropped before the mass is shared out.
# NOTE: the reproducibility totals recorded in this notebook were produced
# without this filtering; they change only if a label word is out of vocabulary.
for label_idx, words in enumerate(associated_words):
    known_ids = [tid for tid in filtered_dictionary.doc2idx(set(words)) if tid >= 0]
    if known_ids:
        filtered_label_array[label_idx, known_ids] = 1.0 / len(known_ids)

for label_idx in range(n_labels):
    label_dist = filtered_label_array[label_idx]
    for topic_idx in range(filtered_model.num_topics):
        topic_dist = filtered_topic_array[topic_idx]
        filtered_result_array[label_idx, topic_idx] = (
            jensenshannon(label_dist, topic_dist),
            rel_entr(label_dist, topic_dist).sum(),
        )

In [22]:
# For each label, keep the topic whose Jensen-Shannon distance (slot 0 of the
# last axis) is smallest, together with that distance.
filtered_topics_df = get_all_topics(filtered_model).set_index('Topic_Id')
filtered_df = (
    filtered_topics_df
    .iloc[np.argmin(filtered_result_array[..., 0], axis=1)]
    .reset_index()
)
filtered_df['divergence_score'] = np.min(filtered_result_array[..., 0], axis=1)

In [23]:
filtered_df.head()

Unnamed: 0,Topic_Id,Topic_Keywords,divergence_score
0,40,"gene, genome, expression, genetic, mutation, crispr, editing, functional, identify, cas9",0.757357
1,228,"cancer, tumor, tumour, breast, therapy, cell, treatment, metastasis, metastatic, patient",0.696356
2,228,"cancer, tumor, tumour, breast, therapy, cell, treatment, metastasis, metastatic, patient",0.704889
3,109,"resistance, antibiotic, bacteria, bacterial, infection, resistant, antimicrobial, pathogen, pa, toxin",0.720164
4,111,"rna, mrna, translation, coding, function, regulation, expression, small, mirnas, protein",0.722837


In [24]:
# for reproducibility. should equal 21.853238639749286
filtered_df['divergence_score'].sum()

21.853238639749286

In [18]:
# filtered_df.to_csv('../output/df_filtered_associated_words.csv', index=False)
# get_all_topics(filtered_model).to_csv('../output/df_filtered_topics.csv', index=False)

##### Top 10 topics

In [30]:
filtered_df.sort_values('divergence_score').head(10)

Unnamed: 0,Topic_Id,Topic_Keywords,divergence_score
24,109,"resistance, antibiotic, bacteria, bacterial, infection, resistant, antimicrobial, pathogen, pa, toxin",0.620899
17,13,"vaccine, virus, viral, infection, hiv, response, antiviral, vaccination, human, influenza",0.634735
12,22,"energy, solar, efficiency, power, grid, efficient, consumption, electricity, harvesting, renewable",0.684919
27,40,"gene, genome, expression, genetic, mutation, crispr, editing, functional, identify, cas9",0.692319
18,192,"drug, discovery, development, pharmaceutical, compound, screening, molecule, target, candidate, pharma",0.692936
1,228,"cancer, tumor, tumour, breast, therapy, cell, treatment, metastasis, metastatic, patient",0.696356
8,127,"data, big, analytics, processing, analysis, information, driven, amount, source, business",0.698552
28,248,"heart, cardiac, failure, disease, cardiovascular, death, hf, af, clinical, myocardial",0.704877
2,228,"cancer, tumor, tumour, breast, therapy, cell, treatment, metastasis, metastatic, patient",0.704889
22,18,"metabolic, diabetes, metabolism, glucose, insulin, disease, type, beta, metabolite, diabetic",0.716179


##### Worst 10 topics

In [31]:
filtered_df.sort_values('divergence_score', ascending=False).head(10)

Unnamed: 0,Topic_Id,Topic_Keywords,divergence_score
25,7,"host, pathogen, disease, infection, parasite, vector, malaria, transmission, effector, infectious",0.783972
26,260,"neuron, neuronal, circuit, brain, activity, neural, plasticity, mouse, cortical, functional",0.773999
19,223,"resolution, imaging, microscopy, high, technique, super, microscope, fluorescence, live, electron",0.770918
9,143,"sex, female, reproductive, male, sexual, woman, reproduction, gender, fertility, specific",0.76937
7,40,"gene, genome, expression, genetic, mutation, crispr, editing, functional, identify, cas9",0.763956
5,14,"disease, ad, neurodegenerative, alzheimer, al, model, pathology, brain, parkinson, disorder",0.762077
11,256,"population, age, ageing, aging, related, life, healthy, older, decline, adult",0.761539
15,256,"population, age, ageing, aging, related, life, healthy, older, decline, adult",0.758583
21,138,"skin, organ, kidney, transplantation, fibrosis, injury, renal, human, tissue, lung",0.75779
0,40,"gene, genome, expression, genetic, mutation, crispr, editing, functional, identify, cas9",0.757357


#### Top 1

In [9]:
filtered_dictionary = Dictionary.load('../artefacts/filtered_top_1_dictionary')

In [10]:
with open('../artefacts/filtered_top_1_docs.pkl', 'rb') as f:
    filtered_docs = pickle.load(f)

print(len(filtered_docs))

13086


In [11]:
filtered_model = load_mallet_model(artefacts_path='../artefacts', suffix='filtered_top_1')

In [12]:
filtered_model.mallet_path = '../mallet-2.0.8/bin/mallet'
filtered_model.prefix = '../artefacts/mallet_tmp/'

In [29]:
# check for reproducibility, should be 0.505769846580301
get_coherence(filtered_model, filtered_docs, filtered_dictionary)

0.505769846580301

In [13]:
filtered_model.show_topic(0)

[('circuit', 0.055449184345561765),
 ('brain', 0.040131730106456306),
 ('neuron', 0.0350769702075515),
 ('cortical', 0.030328559393428812),
 ('cortex', 0.02864363942712721),
 ('activity', 0.027877766715171937),
 ('neuronal', 0.022286895917898446),
 ('mouse', 0.020831737765183427),
 ('neural', 0.020678563222792372),
 ('input', 0.015164279696714406)]

In [14]:
# Score every labelled word set against every topic of the filtered model.
# The last axis of the result holds (Jensen-Shannon distance, KL divergence).
n_labels = len(associated_words)  # previously hard-coded to 30
filtered_topic_array = filtered_model.get_topics()
filtered_result_array = np.zeros((n_labels, filtered_model.num_topics, 2))
filtered_label_array = np.zeros((n_labels, len(filtered_dictionary.keys())))

# Build one uniform distribution per label over the label's in-vocabulary words.
# Dictionary.doc2idx reports unknown words as -1; used as a column index that
# would silently put probability mass on the *last* vocabulary term, so those
# ids are dropped before the mass is shared out.
# NOTE: the reproducibility totals recorded in this notebook were produced
# without this filtering; they change only if a label word is out of vocabulary.
for label_idx, words in enumerate(associated_words):
    known_ids = [tid for tid in filtered_dictionary.doc2idx(set(words)) if tid >= 0]
    if known_ids:
        filtered_label_array[label_idx, known_ids] = 1.0 / len(known_ids)

for label_idx in range(n_labels):
    label_dist = filtered_label_array[label_idx]
    for topic_idx in range(filtered_model.num_topics):
        topic_dist = filtered_topic_array[topic_idx]
        filtered_result_array[label_idx, topic_idx] = (
            jensenshannon(label_dist, topic_dist),
            rel_entr(label_dist, topic_dist).sum(),
        )

In [15]:
# For each label, keep the topic whose Jensen-Shannon distance (slot 0 of the
# last axis) is smallest, together with that distance.
filtered_topics_df = get_all_topics(filtered_model).set_index('Topic_Id')
filtered_df = (
    filtered_topics_df
    .iloc[np.argmin(filtered_result_array[..., 0], axis=1)]
    .reset_index()
)
filtered_df['divergence_score'] = np.min(filtered_result_array[..., 0], axis=1)

In [16]:
filtered_df.head()

Unnamed: 0,Topic_Id,Topic_Keywords,divergence_score
0,26,"crispr, editing, cell, cas9, gene, genome, vesicle, ev, extracellular, function",0.686946
1,145,"cancer, breast, screening, woman, pancreatic, death, clinical, colorectal, case, pca",0.703033
2,145,"cancer, breast, screening, woman, pancreatic, death, clinical, colorectal, case, pca",0.703033
3,294,"resistance, antibiotic, resistant, antimicrobial, infection, pathogen, bacteria, treatment, health, amr",0.728827
4,43,"rna, mrna, protein, expression, gene, coding, regulation, small, seq, splicing",0.710122


In [17]:
# for reproducibility. should equal 21.60336477097165
filtered_df['divergence_score'].sum()

21.60336477097165

## Comparison

In [32]:
# Pair the best-matching unfiltered and filtered topics row by row (rows are labels).
# NOTE(review): in file order `filtered_df` was last assigned in the "Top 1" section,
# yet the table displayed below lists the "Top 3" matches (e.g. row 4 -> topic 111,
# score 0.722837). A top-to-bottom re-run would therefore compare against a different
# model than the one shown - confirm which is intended.
compare_df = unfiltered_df.merge(filtered_df, left_index=True, right_index=True, suffixes=('_unfiltered', '_filtered'))

In [33]:
compare_df.insert(0, 'associated_words', [set(i) for i in associated_words])

In [34]:
compare_df['score_diff'] = compare_df['divergence_score_filtered'] - compare_df['divergence_score_unfiltered']

In [35]:
compare_df.sort_values('score_diff')

Unnamed: 0,associated_words,Topic_Id_unfiltered,Topic_Keywords_unfiltered,divergence_score_unfiltered,Topic_Id_filtered,Topic_Keywords_filtered,divergence_score_filtered,score_diff
16,"{retinal, eye, ophthalmic}",122,"vision, visual, image, object, eye, computer, world, display, scene, camera",0.795495,268,"vision, eye, visual, retinal, lens, tracking, head, blind, retina, world",0.721102,-0.074393
13,"{mental, psychological, psychiatric}",63,"health, intervention, care, mental, evidence, prevention, risk, outcome, impact, population",0.794194,89,"disorder, mental, depression, autism, symptom, anxiety, deficit, asd, psychiatric, syndrome",0.74602,-0.048174
4,"{oligonucleotide, coding, non, rna}",189,"protein, rna, mrna, function, gene, expression, modification, cell, translation, cellular",0.762226,111,"rna, mrna, translation, coding, function, regulation, expression, small, mirnas, protein",0.722837,-0.039389
17,"{immunity, vaccine, virus, infection}",297,"virus, vaccine, disease, infection, viral, vector, human, hiv, infectious, pathogen",0.671244,13,"vaccine, virus, viral, infection, hiv, response, antiviral, vaccination, human, influenza",0.634735,-0.036509
1,"{p53, carcinoma, tumorigenesis, cancer}",72,"cancer, tumor, tumour, cell, treatment, therapy, patient, breast, therapeutic, metastasis",0.73272,228,"cancer, tumor, tumour, breast, therapy, cell, treatment, metastasis, metastatic, patient",0.696356,-0.036364
24,"{antibiotic, bacteria, resistance}",126,"resistance, antibiotic, bacteria, bacterial, pathogen, infection, resistant, antimicrobial, host, phage",0.651705,109,"resistance, antibiotic, bacteria, bacterial, infection, resistant, antimicrobial, pathogen, pa, toxin",0.620899,-0.030807
25,"{mosquito, malaria, dengue}",227,"selection, population, genetic, evolution, variation, trait, fitness, natural, adaptation, parasite",0.81321,7,"host, pathogen, disease, infection, parasite, vector, malaria, transmission, effector, infectious",0.783972,-0.029239
27,"{regulation, therapy, gene, expression}",54,"gene, genetic, genome, expression, mutation, molecular, genomic, crispr, identify, sequencing",0.720898,40,"gene, genome, expression, genetic, mutation, crispr, editing, functional, identify, cas9",0.692319,-0.028579
22,"{diabetes, insulin}",289,"diabetes, disease, obesity, glucose, type, insulin, factor, risk, metabolic, beta",0.743058,18,"metabolic, diabetes, metabolism, glucose, insulin, disease, type, beta, metabolite, diabetic",0.716179,-0.026879
21,"{fibrosis, interleukin, organ}",11,"tissue, cell, organ, vitro, vivo, culture, model, disease, human, kidney",0.783175,138,"skin, organ, kidney, transplantation, fibrosis, injury, renal, human, tissue, lung",0.75779,-0.025386


In [41]:
# compare_df.to_csv('../output/associated_words_comparison.csv', index=False)
# compare_df.to_excel('../output/comparison.xlsx', index=False)

## Hyperparameter Tuning

Parameters we can use to tune:
[LDAMALLET Documentation](https://radimrehurek.com/gensim/models/wrappers/ldamallet.html)
* num_topics? (With more topics, the probability distribution of terms within each topic will always become more specific/granular)
* alpha
* Improving associated words/labels list

In [9]:
def get_divergence_scores(dictionary: Dictionary, model: LdaMallet, associated_words: list):
    """Jensen-Shannon distance between every labelled word set and every topic.

    Each entry of ``associated_words`` (an iterable of tokens) is turned into a
    uniform probability distribution over those of its distinct tokens that
    exist in ``dictionary``; that distribution is compared with each row of
    ``model.get_topics()``.

    Args:
        dictionary: vocabulary used to map tokens to column ids (``doc2idx``)
            and to size the label vectors (``keys``).
        model: topic model exposing ``num_topics`` and ``get_topics()``.
        associated_words: one token collection per labelled topic.

    Returns:
        Array of shape ``(len(associated_words), model.num_topics)`` holding the
        Jensen-Shannon distances. A row is all NaN when none of that label's
        tokens is present in the dictionary, because no distribution can be
        built for it.
    """
    n_labels = len(associated_words)
    topic_array = model.get_topics()
    result_array = np.zeros((n_labels, model.num_topics))
    label_array = np.zeros((n_labels, len(dictionary.keys())))

    for label_idx, words in enumerate(associated_words):
        # doc2idx reports unknown tokens as -1. Indexing with -1 would silently
        # assign mass to the last vocabulary term, so those ids are discarded
        # and the mass is shared among the known tokens only.
        known_ids = [tid for tid in dictionary.doc2idx(set(words)) if tid >= 0]
        if not known_ids:
            result_array[label_idx, :] = np.nan
            continue

        label_array[label_idx, known_ids] = 1.0 / len(known_ids)
        for topic_idx in range(model.num_topics):
            result_array[label_idx, topic_idx] = jensenshannon(
                label_array[label_idx], topic_array[topic_idx]
            )

    return result_array

def get_min_divergence_scores(divergence_scores: np.array):
    """Return the sum, over rows, of each row's smallest score."""
    best_per_row = np.min(divergence_scores, axis=1)
    return np.sum(best_per_row)

def format_divergence_scores(model: LdaMallet, divergence_scores: np.array):
    """Tabulate, for each row of scores, the lowest-scoring topic and its score.

    The column position of each row's minimum selects a row (by position) from
    the ``get_all_topics(model)`` table indexed by ``Topic_Id``; the minimum
    itself is stored in a ``divergence_score`` column.
    """
    best_positions = divergence_scores.argmin(1)
    best_scores = divergence_scores.min(1)

    topics_by_id = get_all_topics(model).set_index('Topic_Id')
    df = topics_by_id.iloc[best_positions].reset_index()
    df['divergence_score'] = best_scores
    return df

### Tuning for n_topics and alpha

A small alpha favours documents made up of few topics; a large alpha favours documents spread over many topics.

In [40]:
# Bag-of-words representation of the documents.
filtered_corpus = [filtered_dictionary.doc2bow(doc) for doc in filtered_docs]
print(len(filtered_corpus))

13086


In [41]:
%%time 

# Coarse sweep of the `alpha` training parameter across several orders of
# magnitude, always with 300 topics.
alpha_params = [0.01, 0.1, 1, 10, 50, 100]
# scores[k] is the summed best-match Jensen-Shannon distance obtained with alpha_params[k]
scores = []

for i in alpha_params:
    # one full model is trained per alpha value, which dominates this cell's run time
    model = train_lda_mallet(filtered_corpus, filtered_dictionary, 300, 
                         params={
                             'mallet_path': '../mallet-2.0.8/bin/mallet',
                             'prefix_path': '../artefacts/mallet_tmp/',
                             'alpha': i
                         })
    
    scores_array = get_divergence_scores(filtered_dictionary, model, associated_words)
    scores.append(get_min_divergence_scores(scores_array))

CPU times: user 55.1 s, sys: 439 ms, total: 55.5 s
Wall time: 7min 8s


In [42]:
scores

[24.203720396478587,
 23.85662612462584,
 22.957363907239817,
 21.915334469860646,
 21.60336477097165,
 21.36322391403247]

In [43]:
%%time 

# Finer sweep of `alpha` between 50 and 200, always with 300 topics.
alpha_params = [50, 70, 100, 120, 150, 200]
# scores[k] is the summed best-match Jensen-Shannon distance obtained with alpha_params[k];
# this rebinding discards the list built by any earlier sweep
scores = []

for i in alpha_params:
    # one full model is trained per alpha value, which dominates this cell's run time
    model = train_lda_mallet(filtered_corpus, filtered_dictionary, 300, 
                         params={
                             'mallet_path': '../mallet-2.0.8/bin/mallet',
                             'prefix_path': '../artefacts/mallet_tmp/',
                             'alpha': i
                         })
    
    scores_array = get_divergence_scores(filtered_dictionary, model, associated_words)
    scores.append(get_min_divergence_scores(scores_array))

CPU times: user 55.9 s, sys: 607 ms, total: 56.5 s
Wall time: 5min 49s


In [44]:
scores

[21.60336477097165,
 21.511106322058342,
 21.36322391403247,
 21.347289505726124,
 21.23431997151852,
 21.173681642711934]

In [45]:
%%time 

# Sweep of `alpha` extended to the 200-500 range, always with 300 topics.
alpha_params = [200, 250, 300, 400, 500]
# scores[k] is the summed best-match Jensen-Shannon distance obtained with alpha_params[k];
# this rebinding discards the list built by any earlier sweep
scores = []

for i in alpha_params:
    # one full model is trained per alpha value, which dominates this cell's run time
    model = train_lda_mallet(filtered_corpus, filtered_dictionary, 300, 
                         params={
                             'mallet_path': '../mallet-2.0.8/bin/mallet',
                             'prefix_path': '../artefacts/mallet_tmp/',
                             'alpha': i
                         })
    
    scores_array = get_divergence_scores(filtered_dictionary, model, associated_words)
    scores.append(get_min_divergence_scores(scores_array))

CPU times: user 45.2 s, sys: 364 ms, total: 45.6 s
Wall time: 4min 52s


In [46]:
scores

[21.173681642711934,
 21.1034699289655,
 21.282616113016783,
 21.33287213236462,
 21.135819968729002]

### Optimized Model

In [10]:
with open('../artefacts/filtered_top_1_docs.pkl', 'rb') as f:
    filtered_docs = pickle.load(f)

print(len(filtered_docs))

filtered_dictionary = Dictionary.load('../artefacts/filtered_top_1_dictionary')

13086


In [11]:
# Bag-of-words representation of the documents.
filtered_corpus = [filtered_dictionary.doc2bow(doc) for doc in filtered_docs]
print(len(filtered_corpus))

optimized_model = train_lda_mallet(filtered_corpus, filtered_dictionary, 300, 
                         params={
                             'mallet_path': '../mallet-2.0.8/bin/mallet',
                             'prefix_path': '../artefacts/mallet_tmp/',
                             'alpha': 250
                         })
optimized_score_array = get_divergence_scores(filtered_dictionary, optimized_model, associated_words)

# for reproducibility. should equal 21.1034699289655
print(get_min_divergence_scores(optimized_score_array))

13086
21.1034699289655


In [12]:
# Build one row per topic: [topic id, word 1, prob 1, word 2, prob 2, ...].
# Rows are assembled directly from the (word, probability) pairs. The earlier
# numpy round-trip coerced every cell of a mixed word/number list to a string,
# so the Prob columns were exported as text instead of numbers.
topics_result = []

for topic_id, word_probs in optimized_model.show_topics(-1, formatted=False):
    row = [topic_id]
    for word, prob in word_probs:
        row.extend([word, round(float(prob), 3)])
    topics_result.append(row)

# Derive the number of Word/Prob column pairs from the data instead of assuming 10.
n_words = (len(topics_result[0]) - 1) // 2 if topics_result else 0

columns = ['Topic']
for position in range(1, n_words + 1):
    columns += [f'Word{position}', f'Prob{position}']

df = pd.DataFrame(data=topics_result, columns=columns)

In [13]:
df.head()

Unnamed: 0,Topic,Word1,Prob1,Word2,Prob2,Word3,Prob3,Word4,Prob4,Word5,...,Word6,Prob6,Word7,Prob7,Word8,Prob8,Word9,Prob9,Word10,Prob10
0,0,aspect,0.116,modern,0.109,part,0.095,idea,0.079,main,...,goal,0.049,practical,0.047,proposed,0.036,technique,0.03,general,0.018
1,1,life,0.285,infant,0.056,period,0.043,developmental,0.039,birth,...,maternal,0.034,pregnancy,0.032,mother,0.024,baby,0.02,born,0.019
2,2,cell,0.293,differentiation,0.07,fate,0.049,niche,0.047,heterogeneity,...,lineage,0.033,cellular,0.031,developmental,0.029,progenitor,0.025,somatic,0.016
3,3,enable,0.154,micro,0.117,capable,0.08,perform,0.066,capability,...,inside,0.055,progress,0.05,develop,0.042,made,0.041,patch,0.033
4,4,protein,0.558,substrate,0.039,spectrometry,0.031,mass,0.03,ubiquitin,...,proteome,0.018,degradation,0.014,biochemical,0.013,phosphorylation,0.012,proteasome,0.01


In [28]:
df.to_excel('../output/optimized_300_topics_with_word_prob.xlsx', index=False)

In [17]:
# get_all_topics(optimized_model).to_csv('../output/optimized_300_topics.csv', index=False)
get_all_topics(optimized_model).head()

Unnamed: 0,Topic_Id,Topic_Keywords
0,0,"aspect, modern, part, idea, main, goal, practical, proposed, technique, general"
1,1,"life, infant, period, developmental, birth, maternal, pregnancy, mother, baby, born"
2,2,"cell, differentiation, fate, niche, heterogeneity, lineage, cellular, developmental, progenitor, somatic"
3,3,"enable, micro, capable, perform, capability, inside, progress, develop, made, patch"
4,4,"protein, substrate, spectrometry, mass, ubiquitin, proteome, degradation, biochemical, phosphorylation, proteasome"


In [18]:
temp_df = format_divergence_scores(optimized_model, optimized_score_array)
temp_df.insert(0, 'associated_words', [set(i) for i in associated_words])
temp_df

Unnamed: 0,associated_words,Topic_Id,Topic_Keywords,divergence_score
0,"{base, cas9, crispr, editing}",139,"dna, genome, epigenetic, crispr, repair, editing, histone, cas9, genomic, damage",0.734616
1,"{p53, carcinoma, cancer, tumorigenesis}",12,"cancer, tumor, tumour, metastasis, metastatic, breast, progression, chemotherapy, anti, survival",0.676048
2,"{immunotherapy, cancer, leukemia, solid}",12,"cancer, tumor, tumour, metastasis, metastatic, breast, progression, chemotherapy, anti, survival",0.690754
3,"{microbiome, bacteria, antibiotic, probiotic}",80,"resistance, antibiotic, resistant, bacteria, antimicrobial, bacterial, amr, global, threat, emergence",0.704846
4,"{coding, non, rna, oligonucleotide}",122,"rna, mrna, sequence, coding, nmr, sequencing, splicing, motif, seq, binding",0.670497
5,"{parkinson, alzheimer, neurodegeneration, huntington}",289,"disease, signaling, pathology, al, neurodegenerative, parkinson, therapeutic, aggregation, neurodegeneration, alzheimer",0.754143
6,"{vector, aav, gene, immunity, delivery, immune}",17,"gene, expression, regulatory, genome, transcriptional, transcription, regulation, enhancer, promoter, silencing",0.72558
7,"{polymer, viral, nanoparticle, non, gene, delivery, liposome}",17,"gene, expression, regulatory, genome, transcriptional, transcription, regulation, enhancer, promoter, silencing",0.735572
8,"{genomics, data, multiomics, prevention, precision}",46,"data, big, analytics, datasets, mining, collection, collected, sharing, repository, query",0.676922
9,"{pregnancy, reproductive, obstetric, woman, prenatal}",57,"breast, positive, woman, common, result, benefit, potential, lesion, significant, targeted",0.782717


In [21]:
temp_df.to_csv('../output/associated_words_divergence_score_optimized.csv', index=False)

## Predict and Format

In [16]:
# The second positional argument of read_excel is sheet_name; None loads every
# sheet into a dict keyed by sheet name.
sheets_dict = pd.read_excel('../data/cordis-h2020projects.xlsx', None)
# NOTE(review): rebinding `df` here discards the topic/word table built under
# the same name in the "Optimized Model" section.
df = sheets_dict['cordis-h2020projects']

# combine title and objective
# NOTE(review): if either column has a missing value the concatenation is NaN
# for that row - TODO confirm process_data copes with non-string entries.
data = (df['title'] + ' ' + df['objective']).values.tolist()

docs = process_data(data)
# bag-of-words ids are taken from `filtered_dictionary`, whichever one is currently loaded
corpus = [filtered_dictionary.doc2bow(doc) for doc in docs]

### 30k grants prediction

In [17]:
%%time
predict_df = predict_and_format_topics(optimized_model, corpus, data, n_topics=10)

CPU times: user 16min 55s, sys: 14.4 s, total: 17min 10s
Wall time: 18min 8s


In [96]:
rename_columns_dict = dict([(f'Dominant_Topic_{i+1}', f'Topic_Id_{i+1}') for i in range(5)])
rename_columns_dict

{'Dominant_Topic_1': 'Topic_Id_1',
 'Dominant_Topic_2': 'Topic_Id_2',
 'Dominant_Topic_3': 'Topic_Id_3',
 'Dominant_Topic_4': 'Topic_Id_4',
 'Dominant_Topic_5': 'Topic_Id_5'}

In [97]:
# NOTE(review): this renames columns on `predict_df` itself. Only Dominant_Topic_1..5
# are in the mapping, and code further down (display_topic_distribution_statistics)
# still reads `Dominant_Topic_*` columns from a frame named `predict_df`; the
# "Topic Distribution" output also still shows the original names. On a
# top-to-bottom run those lookups may fail - consider renaming into a new frame.
predict_df.rename(columns=rename_columns_dict, inplace=True)

In [98]:
predict_df.head(2)

Unnamed: 0,Document_No,Topic_Id_1,Topic_Prob_1,Topic Keywords,Topic_Id_2,Topic_Prob_2,Topic Keywords.1,Topic_Id_3,Topic_Prob_3,Topic Keywords.2,Topic_Id_4,Topic_Prob_4,Topic Keywords.3,Topic_Id_5,Topic_Prob_5,Topic Keywords.4
0,0,0.0,0.0416,"aspect, modern, part, idea, main, goal, practical, proposed, technique, general",86.0,0.0326,"translation, post, translational, identify, basic, target, profiling, biology, translate, previously",181.0,0.0198,"scientific, community, scientist, knowledge, collaboration, enhance, dissemination, programme, excellence, achievement",231.0,0.0198,"language, linguistic, semantic, word, meaning, acquisition, reading, text, representation, speaker",141.0,0.0173,"component, communication, sensing, key, ph, sense, combine, versatile, typically, dedicated"
1,1,181.0,0.0435,"scientific, community, scientist, knowledge, collaboration, enhance, dissemination, programme, excellence, achievement",166.0,0.0162,"experience, virtual, reality, team, tracking, interactive, phage, ar, vr, sport",215.0,0.0156,"technology, enabling, technological, demonstrate, breakthrough, disruptive, enabled, radically, realising, world",290.0,0.0145,"brain, connectivity, functional, function, behavioural, neuroimaging, region, cerebral, causal, underlying",217.0,0.0111,"training, research, researcher, programme, academic, train, institution, trained, esr, expertise"


In [88]:
data_df_with_predict = df.merge(predict_df.drop(columns=['Document_No']), left_index=True, right_index=True)
# data_df_with_predict.to_excel('../output/optimized_30k_top_5_topics_predictions.xlsx', index=False)

### Filtered grants prediction

In [42]:
with open('../artefacts/filtered_top_doc_id.pkl', 'rb') as f:
    filtered_doc_id = pickle.load(f)

print(len(filtered_doc_id))

13086


In [90]:
filtered_data_df_with_predict = data_df_with_predict.iloc[filtered_doc_id]
# filtered_data_df_with_predict.to_excel('../output/optimized_13k_top_5_topics_predictions.xlsx', index=False)

## Topic Distribution

In [18]:
predict_df.head(3)

Unnamed: 0,Document_No,Dominant_Topic_1,Topic_Prob_1,Topic Keywords,Dominant_Topic_2,Topic_Prob_2,Topic Keywords.1,Dominant_Topic_3,Topic_Prob_3,Topic Keywords.2,...,Topic Keywords.3,Dominant_Topic_8,Topic_Prob_8,Topic Keywords.4,Dominant_Topic_9,Topic_Prob_9,Topic Keywords.5,Dominant_Topic_10,Topic_Prob_10,Topic Keywords.6
0,0,0.0,0.0416,"aspect, modern, part, idea, main, goal, practical, proposed, technique, general",86.0,0.0326,"translation, post, translational, identify, basic, target, profiling, biology, translate, previously",181.0,0.0198,"scientific, community, scientist, knowledge, collaboration, enhance, dissemination, programme, excellence, achievement",...,"partner, consortium, industrial, academic, smes, expertise, implement, provided, industry, board",211.0,0.0135,"innovative, international, contribute, future, good, aim, leading, profile, providing, multidisciplinary",217.0,0.013,"training, research, researcher, programme, academic, train, institution, trained, esr, expertise",140.0,0.0124,"social, society, cooperation, cultural, context, framework, practice, engagement, engage, gender"
1,1,181.0,0.0435,"scientific, community, scientist, knowledge, collaboration, enhance, dissemination, programme, excellence, achievement",166.0,0.0162,"experience, virtual, reality, team, tracking, interactive, phage, ar, vr, sport",215.0,0.0156,"technology, enabling, technological, demonstrate, breakthrough, disruptive, enabled, radically, realising, world",...,"software, hardware, code, error, programming, computing, verification, execution, developer, run",211.0,0.0105,"innovative, international, contribute, future, good, aim, leading, profile, providing, multidisciplinary",253.0,0.0094,"interaction, understanding, understand, investigation, interacting, investigate, fundamentally, explored, interplay, studied",236.0,0.0091,"mobility, public, citizen, private, society, urban, crc, city, support, supporting"
2,2,237.0,0.0527,"health, intervention, public, country, prevention, effectiveness, setting, fire, income, implementation",182.0,0.0473,"policy, stakeholder, sustainable, economic, maker, socio, sustainability, support, actor, framework",176.0,0.0316,"outcome, assessment, implementation, evaluate, ass, cns, successful, nervous, facilitate, assessing",...,"strategy, major, strain, represent, responsible, ability, tb, spread, escape, combat",102.0,0.0138,"scale, large, local, exosomes, multi, exosome, spanning, picture, mesoscale, covering",39.0,0.0125,"region, key, transmission, including, integrating, emerging, address, africa, african, fly",188.0,0.0117,"impact, major, ass, including, societal, assessed, toolkit, innovative, huge, provide"


In [30]:
get_topics_distribution(optimized_model, predict_df, n_topics=5).head()

Unnamed: 0,Topic_Id,Topic_Keywords,top_1_topics,top_2_topics,top_3_topics,top_4_topics,top_5_topics
0,0,"aspect, modern, part, idea, main, goal, practical, proposed, technique, general",10.0,52.0,131.0,215.0,302.0
1,1,"life, infant, period, developmental, birth, maternal, pregnancy, mother, baby, born",80.0,136.0,203.0,247.0,299.0
2,2,"cell, differentiation, fate, niche, heterogeneity, lineage, cellular, developmental, progenitor, somatic",73.0,181.0,255.0,332.0,407.0
3,3,"enable, micro, capable, perform, capability, inside, progress, develop, made, patch",25.0,62.0,121.0,179.0,249.0
4,4,"protein, substrate, spectrometry, mass, ubiquitin, proteome, degradation, biochemical, phosphorylation, proteasome",121.0,244.0,371.0,472.0,562.0


In [115]:
get_topics_distribution(optimized_model, predict_df, n_topics=5).to_csv('../output/topics_distribution.csv', index=False)

## Eyeballing some specific keywords

In [62]:
def display_topic_distribution_statistics(docs: list, predict_df: pd.DataFrame, keyword: str, n_topics, return_fp=False):
    """Print how well the topics associated with ``keyword`` recover the documents that contain it.

    A document counts as *positive* when ``keyword in doc`` is true, and as *classified under a
    topic* when that topic id appears in any of its ``Dominant_Topic_1`` .. ``Dominant_Topic_<n_topics>``
    columns. For every topic returned by ``get_term_topics`` for the keyword, the topic itself and the
    resulting true/false positive/negative counts and rates are printed.

    Relies on the notebook-level globals ``optimized_model`` and ``filtered_dictionary``.

    Args:
        docs: Documents, positionally aligned with ``predict_df`` (document ``i`` must correspond to
            index label ``i``, i.e. ``predict_df`` is expected to carry a 0..n-1 index).
            Presumably each document is a list of tokens; with plain strings ``keyword in doc``
            becomes a substring test.
        predict_df: Predictions with ``Dominant_Topic_<k>`` columns for ``k`` in ``1..n_topics``.
        keyword: Term to look up.
        n_topics: Number of top-ranked dominant topics per document to consider.
        return_fp: When true, return the false-positive document ids instead of ``None``.

    Returns:
        ``None`` when ``return_fp`` is false. Otherwise a list with the ids of documents that are
        classified under at least one of the keyword's topics but do not contain the keyword
        (the union over all of the keyword's topics; empty when no topic is found).
    """
    print(f'Looking at the top {n_topics} topics that documents are classified under')
    print(f'keyword: {keyword}')
    keyword_ids = {doc_id for doc_id, doc in enumerate(docs) if keyword in doc}
    print(f'number of unique documents with \'{keyword}\': {len(keyword_ids)}')

    temp_model = ldamallet.malletmodel2ldamodel(optimized_model)
    keywords_topics = get_term_topics(temp_model, filtered_dictionary, keyword)

    if not keywords_topics:
        print('No topics Found for keyword')
        # Keep the return type stable for callers that immediately call len() on the result.
        return [] if return_fp else None

    def _rate(count, total):
        # Avoid ZeroDivisionError when the keyword is in no document (or in every document).
        return round(count / total, 4) if total else float('nan')

    n_positive = len(keyword_ids)
    n_negative = len(predict_df) - n_positive
    all_false_positives = set()

    for topic_id, topic_prob in keywords_topics:
        print(f'topic_id: {topic_id}')
        print(f'topic_prob: {topic_prob}')
        print('topic:', optimized_model.print_topic(topic_id))

        # Rows whose top-`n_topics` dominant topics include this topic.
        in_topic = predict_df['Dominant_Topic_1'] == topic_id
        for rank in range(2, n_topics + 1):
            in_topic = in_topic | (predict_df[f'Dominant_Topic_{rank}'] == topic_id)
        classified_ids = set(predict_df[in_topic].index)

        print()
        print(f"Number of unique documents classified under topic: {len(classified_ids)}")

        true_positive = classified_ids & keyword_ids
        false_negative = keyword_ids - classified_ids
        false_positive = classified_ids - keyword_ids
        n_true_negative = n_negative - len(false_positive)

        print(f"True Positive: {len(true_positive)} / {n_positive} = {_rate(len(true_positive), n_positive)}")
        print(f"False Negative: {len(false_negative)} / {n_positive} = {_rate(len(false_negative), n_positive)}")
        print(f"False Positive: {len(false_positive)} / {n_negative} = {_rate(len(false_positive), n_negative)}")
        print(f"True Negative: {n_true_negative} / {n_negative} = {_rate(n_true_negative, n_negative)}")
        print()

        # Accumulate instead of returning here, so later topics are still reported.
        all_false_positives |= false_positive

    if return_fp:
        return list(all_false_positives)

### Unfiltered 30k grants

In [32]:
display_topic_distribution_statistics(docs, predict_df, 'crispr', 5)

Looking at the top 5 topics that documents are classified under
keyword: crispr
number of unique documents with 'crispr': 258
topic_id: 139
topic_prob: 0.04623544156758605
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 536
True Positive: 107 / 258 = 0.4147
False Negative: 151 / 258 = 0.5853
False Positive: 429 / 29826 = 0.0144
True Negative: 29397 / 29826 = 0.9856



In [33]:
display_topic_distribution_statistics(docs, predict_df, 'crispr', 3)

Looking at the top 3 topics that documents are classified under
keyword: crispr
number of unique documents with 'crispr': 258
topic_id: 139
topic_prob: 0.04623544156758605
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 392
True Positive: 75 / 258 = 0.2907
False Negative: 183 / 258 = 0.7093
False Positive: 317 / 29826 = 0.0106
True Negative: 29509 / 29826 = 0.9894



In [34]:
display_topic_distribution_statistics(docs, predict_df, 'crispr', 2)

Looking at the top 2 topics that documents are classified under
keyword: crispr
number of unique documents with 'crispr': 258
topic_id: 139
topic_prob: 0.04623544156758605
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 299
True Positive: 60 / 258 = 0.2326
False Negative: 198 / 258 = 0.7674
False Positive: 239 / 29826 = 0.008
True Negative: 29587 / 29826 = 0.992



In [35]:
display_topic_distribution_statistics(docs, predict_df, 'rna', 5)

Looking at the top 5 topics that documents are classified under
keyword: rna
number of unique documents with 'rna': 610
topic_id: 122
topic_prob: 0.23486118551768878
topic: 0.235*"rna" + 0.061*"mrna" + 0.061*"sequence" + 0.054*"coding" + 0.046*"nmr" + 0.023*"sequencing" + 0.020*"splicing" + 0.017*"motif" + 0.014*"seq" + 0.013*"binding"

Number of unique documents classified under topic: 362
True Positive: 237 / 610 = 0.3885
False Negative: 373 / 610 = 0.6115
False Positive: 125 / 29474 = 0.0042
True Negative: 29349 / 29474 = 0.9958



In [36]:
display_topic_distribution_statistics(docs, predict_df, 'mrna', 5)

Looking at the top 5 topics that documents are classified under
keyword: mrna
number of unique documents with 'mrna': 168
topic_id: 122
topic_prob: 0.06070102742422468
topic: 0.235*"rna" + 0.061*"mrna" + 0.061*"sequence" + 0.054*"coding" + 0.046*"nmr" + 0.023*"sequencing" + 0.020*"splicing" + 0.017*"motif" + 0.014*"seq" + 0.013*"binding"

Number of unique documents classified under topic: 362
True Positive: 90 / 168 = 0.5357
False Negative: 78 / 168 = 0.4643
False Positive: 272 / 29916 = 0.0091
True Negative: 29644 / 29916 = 0.9909



In [37]:
display_topic_distribution_statistics(docs, predict_df, 'aav', 5)

Looking at the top 5 topics that documents are classified under
keyword: aav
number of unique documents with 'aav': 18
No topics Found for keyword


In [38]:
display_topic_distribution_statistics(docs, predict_df, 'alzheimer', 5)

Looking at the top 5 topics that documents are classified under
keyword: alzheimer
number of unique documents with 'alzheimer': 236
topic_id: 13
topic_prob: 0.025342347922701028
topic: 0.152*"disease" + 0.129*"biomarkers" + 0.117*"assay" + 0.086*"ad" + 0.083*"clinical" + 0.064*"biomarker" + 0.045*"validate" + 0.037*"validation" + 0.025*"alzheimer" + 0.025*"panel"

Number of unique documents classified under topic: 323
True Positive: 89 / 236 = 0.3771
False Negative: 147 / 236 = 0.6229
False Positive: 234 / 29848 = 0.0078
True Negative: 29614 / 29848 = 0.9922

topic_id: 289
topic_prob: 0.018681563217691884
topic: 0.240*"disease" + 0.088*"signaling" + 0.055*"pathology" + 0.047*"al" + 0.046*"neurodegenerative" + 0.030*"parkinson" + 0.025*"therapeutic" + 0.024*"aggregation" + 0.020*"neurodegeneration" + 0.019*"alzheimer"

Number of unique documents classified under topic: 396
True Positive: 119 / 236 = 0.5042
False Negative: 117 / 236 = 0.4958
False Positive: 277 / 29848 = 0.0093
True Nega

In [39]:
display_topic_distribution_statistics(docs, predict_df, 'cas9', 5)

Looking at the top 5 topics that documents are classified under
keyword: cas9
number of unique documents with 'cas9': 164
topic_id: 139
topic_prob: 0.024471102910912392
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 536
True Positive: 76 / 164 = 0.4634
False Negative: 88 / 164 = 0.5366
False Positive: 460 / 29920 = 0.0154
True Negative: 29460 / 29920 = 0.9846



### Filtered 13K Grants

In [43]:
# Select the prediction rows at the positions listed in filtered_doc_id and renumber them 0..n-1.
# display_topic_distribution_statistics matches positions from enumerate(docs) against
# predict_df.index, so the index has to be reset for the comparison with filtered_docs to line up.
# Assumes filtered_docs is ordered the same way as filtered_doc_id -- TODO confirm.
filtered_predict_df = predict_df.iloc[filtered_doc_id].reset_index(drop=True)

In [44]:
display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'crispr', 5)

Looking at the top 5 topics that documents are classified under
keyword: crispr
number of unique documents with 'crispr': 246
topic_id: 139
topic_prob: 0.04623544156758605
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 487
True Positive: 103 / 246 = 0.4187
False Negative: 143 / 246 = 0.5813
False Positive: 384 / 12840 = 0.0299
True Negative: 12456 / 12840 = 0.9701



In [45]:
display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'crispr', 10)

Looking at the top 10 topics that documents are classified under
keyword: crispr
number of unique documents with 'crispr': 246
topic_id: 139
topic_prob: 0.04623544156758605
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 701
True Positive: 144 / 246 = 0.5854
False Negative: 102 / 246 = 0.4146
False Positive: 557 / 12840 = 0.0434
True Negative: 12283 / 12840 = 0.9566



In [46]:
display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'crispr', 3)

Looking at the top 3 topics that documents are classified under
keyword: crispr
number of unique documents with 'crispr': 246
topic_id: 139
topic_prob: 0.04623544156758605
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 362
True Positive: 72 / 246 = 0.2927
False Negative: 174 / 246 = 0.7073
False Positive: 290 / 12840 = 0.0226
True Negative: 12550 / 12840 = 0.9774



In [49]:
display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'cas9', 5)

Looking at the top 5 topics that documents are classified under
keyword: cas9
number of unique documents with 'cas9': 158
topic_id: 139
topic_prob: 0.024471102910912392
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 487
True Positive: 73 / 158 = 0.462
False Negative: 85 / 158 = 0.538
False Positive: 414 / 12928 = 0.032
True Negative: 12514 / 12928 = 0.968



In [58]:
display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'cas9', 3)

Looking at the top 3 topics that documents are classified under
keyword: cas9
number of unique documents with 'cas9': 158
topic_id: 139
topic_prob: 0.024471102910912392
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 362
True Positive: 51 / 158 = 0.3228
False Negative: 107 / 158 = 0.6772
False Positive: 311 / 12928 = 0.0241
True Negative: 12617 / 12928 = 0.9759



In [48]:
display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'rna', 5)

Looking at the top 5 topics that documents are classified under
keyword: rna
number of unique documents with 'rna': 580
topic_id: 122
topic_prob: 0.23486118551768878
topic: 0.235*"rna" + 0.061*"mrna" + 0.061*"sequence" + 0.054*"coding" + 0.046*"nmr" + 0.023*"sequencing" + 0.020*"splicing" + 0.017*"motif" + 0.014*"seq" + 0.013*"binding"

Number of unique documents classified under topic: 326
True Positive: 229 / 580 = 0.3948
False Negative: 351 / 580 = 0.6052
False Positive: 97 / 12506 = 0.0078
True Negative: 12409 / 12506 = 0.9922



### Examining False Positives

#### `cas9`

In [115]:
false_positives = display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'cas9', 3, return_fp=True)

Looking at the top 3 topics that documents are classified under
keyword: cas9
number of unique documents with 'cas9': 158
topic_id: 139
topic_prob: 0.024471102910912392
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 362
True Positive: 51 / 158 = 0.3228
False Negative: 107 / 158 = 0.6772
False Positive: 311 / 12928 = 0.0241
True Negative: 12617 / 12928 = 0.9759



In [116]:
len(false_positives)

311

In [118]:
# Draw a single false-positive document id at random for manual inspection.
(random_id,) = sample(false_positives, k=1)
print(f'random_id {random_id}')

# A genuine false positive must not contain the keyword it was flagged for.
print(f"Is keyword in document: {'cas9' in filtered_docs[random_id]}")
print()

# Uncomment to read the processed tokens of the sampled document:
# print(' '.join(filtered_docs[random_id]))
filtered_data_df_with_predict.iloc[[random_id]].transpose()

random_id 2185
Is keyword in document: False



Unnamed: 0,5207
rcn,222128
id,839440
acronym,EPIOBESITY
status,TERMINATED
programme,H2020-EU.1.3.2.
topics,MSCA-IF-2018
frameworkProgramme,H2020
title,Unravelling the hypothalamic epigenetic code behind obesity.
startDate,2020-01-01
endDate,2021-12-31


#### `crispr`

In [106]:
false_positives = display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'crispr', 3, return_fp=True)

Looking at the top 3 topics that documents are classified under
keyword: crispr
number of unique documents with 'crispr': 246
topic_id: 139
topic_prob: 0.04623544156758605
topic: 0.263*"dna" + 0.107*"genome" + 0.090*"epigenetic" + 0.046*"crispr" + 0.043*"repair" + 0.030*"editing" + 0.029*"histone" + 0.025*"cas9" + 0.023*"genomic" + 0.018*"damage"

Number of unique documents classified under topic: 362
True Positive: 72 / 246 = 0.2927
False Negative: 174 / 246 = 0.7073
False Positive: 290 / 12840 = 0.0226
True Negative: 12550 / 12840 = 0.9774



In [107]:
len(false_positives)

290

In [108]:
# Draw a single false-positive document id at random for manual inspection.
random_id = sample(false_positives, 1)[0]
print('random_id', random_id)

# The check must use the keyword `false_positives` was computed for in this section ('crispr');
# testing a different keyword would report False without verifying anything.
print('Is keyword in document:', 'crispr' in filtered_docs[random_id])
print()

# print(' '.join(filtered_docs[random_id]))
# Positional lookup: assumes filtered_data_df_with_predict rows are aligned with filtered_docs.
filtered_data_df_with_predict.iloc[[random_id]].T

random_id 2742
Is keyword in document: False



Unnamed: 0,6578
rcn,206324
id,715975
acronym,DPC_REPAIR
status,SIGNED
programme,H2020-EU.1.1.
topics,ERC-2016-STG
frameworkProgramme,H2020
title,Mechanism of DNA-protein cross-link repair in S phase
startDate,2017-01-01
endDate,2021-12-31


#### `rna`

In [119]:
false_positives = display_topic_distribution_statistics(filtered_docs, filtered_predict_df, 'rna', 3, return_fp=True)

Looking at the top 3 topics that documents are classified under
keyword: rna
number of unique documents with 'rna': 580
topic_id: 122
topic_prob: 0.23486118551768878
topic: 0.235*"rna" + 0.061*"mrna" + 0.061*"sequence" + 0.054*"coding" + 0.046*"nmr" + 0.023*"sequencing" + 0.020*"splicing" + 0.017*"motif" + 0.014*"seq" + 0.013*"binding"

Number of unique documents classified under topic: 238
True Positive: 187 / 580 = 0.3224
False Negative: 393 / 580 = 0.6776
False Positive: 51 / 12506 = 0.0041
True Negative: 12455 / 12506 = 0.9959



In [120]:
len(false_positives)

51

In [121]:
# Draw a single false-positive document id at random for manual inspection.
random_id = sample(false_positives, 1)[0]
print('random_id', random_id)

# The check must use the keyword `false_positives` was computed for in this section ('rna');
# testing a different keyword would report False without verifying anything.
print('Is keyword in document:', 'rna' in filtered_docs[random_id])
print()

# print(' '.join(filtered_docs[random_id]))
# Positional lookup: assumes filtered_data_df_with_predict rows are aligned with filtered_docs.
filtered_data_df_with_predict.iloc[[random_id]].T

random_id 4434
Is keyword in document: False



Unnamed: 0,10496
rcn,201074
id,702726
acronym,Expectancy
status,SIGNED
programme,H2020-EU.1.3.2.
topics,MSCA-IF-2015-EF
frameworkProgramme,H2020
title,Unveiling expectancy neuronal coding in the cerebral cortex induced by naturalistic tactile stimuli
startDate,2017-03-01
endDate,2019-02-28
