In [1]:
!ls ../data

'Associated words.xlsx'     'Text mining word list test 200823.xlsx'
 cordis-h2020projects.xlsx   topics_300_SYinput_LW.csv


In [2]:
import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

## Imports

In [3]:
import pandas as pd
from random import sample, shuffle
import pickle

from gensim.models.wrappers import ldamallet

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from src.artefacts_helper import load_mallet_model
from src.gensim_helper import create_dictionary, get_coherence
from src.artefacts_helper import save_model
from src.process_data import process_data
from src.train import train_lda_mallet
from src.predict import get_all_topics

[nltk_data] Downloading package wordnet to /home/kohkb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kohkb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import Data

In [5]:
sheets_dict = pd.read_excel('../data/cordis-h2020projects.xlsx', None)
df = sheets_dict['cordis-h2020projects']

df.shape

(30084, 21)

In [6]:
# combine title and objective
data = (df['title'] + ' ' + df['objective']).values.tolist()
docs = process_data(data)
dictionary = create_dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 22552
Number of documents: 30084


In [7]:
labelled_topics_df = pd.read_csv('../data/topics_300_SYinput_LW.csv')
labelled_topics_df.head()

Unnamed: 0,Topic_Id,Topic_Keywords,Num_Documents,Perc_Documents,Relevance (SY),LW comments
0,0,"political, study, social, politics, discourse,...",249.0,0.0083,No,
1,1,"disease, inflammatory, inflammation, therapeut...",122.0,0.0041,Yes,
2,2,"delivery, release, deliver, based, develop, oa...",55.0,0.0018,Yes,
3,3,"emission, fuel, gas, co2, carbon, reduction, c...",211.0,0.007,No,
4,4,"liver, development, sport, aim, major, event, ...",47.0,0.0016,Yes,


## Preprocessing

In [8]:
labelled_topics_df['Relevance (SY)'].value_counts()

No     197
Yes     99
NO       2
no       1
yes      1
Name: Relevance (SY), dtype: int64

In [9]:
labelled_topics_df['Relevance (SY)'] = [i.strip().lower() == 'yes' for i in labelled_topics_df['Relevance (SY)']]
labelled_topics_df['Relevance (SY)'].value_counts()

False    200
True     100
Name: Relevance (SY), dtype: int64

In [10]:
relevant_topics = set(labelled_topics_df[labelled_topics_df['Relevance (SY)']]['Topic_Id'].values.tolist())
list(relevant_topics)[:5]

[1, 2, 4, 5, 7]

## Load Baseline Model

In [11]:
model = load_mallet_model(artefacts_path='../artefacts', suffix='300_topics_mallet_alpha_50')
model.mallet_path = '../mallet-2.0.8/bin/mallet'
model.prefix = '../artefacts/mallet_tmp/'

In [16]:
%%time
# for reproducibility: 0.5301726081250121
get_coherence(model, docs, dictionary)

CPU times: user 1min 3s, sys: 3.53 s, total: 1min 7s
Wall time: 9min 43s


0.5301726081250121

## Filtering Documents

In [12]:
predictions = model[corpus]

__If a document does not have any relevant topics in its top n topics, remove it__

In [13]:
print('original corpus size:', len(corpus))
filtered_doc_id = []
top_topics = 5

for idx, i in enumerate(predictions):
    top_n_topics = set([j[0] for j in sorted(i, key=lambda x: x[1], reverse=True)[:top_topics]])
    if top_n_topics.intersection(relevant_topics) != set():
        filtered_doc_id.append(idx)

print('filtered corpus size:', len(filtered_doc_id))

original corpus size: 30084
filtered corpus size: 23029


In [14]:
print('original corpus size:', len(corpus))
filtered_doc_id = []
top_topics = 3

for idx, i in enumerate(predictions):
    top_n_topics = set([j[0] for j in sorted(i, key=lambda x: x[1], reverse=True)[:top_topics]])
    if top_n_topics.intersection(relevant_topics) != set():
        filtered_doc_id.append(idx)

print('filtered corpus size:', len(filtered_doc_id))

original corpus size: 30084
filtered corpus size: 19702


In [15]:
print('original corpus size:', len(corpus))
filtered_doc_id = []
top_topics = 1

for idx, i in enumerate(predictions):
    top_n_topics = set([j[0] for j in sorted(i, key=lambda x: x[1], reverse=True)[:top_topics]])
    if top_n_topics.intersection(relevant_topics) != set():
        filtered_doc_id.append(idx)

print('filtered corpus size:', len(filtered_doc_id))

original corpus size: 30084
filtered corpus size: 13086


__Filtering by threshold__

In [44]:
print('original corpus size:', len(corpus))
filtered_doc_id = []
top_topics = 1
prob_threshold = 0.1

for idx, i in enumerate(predictions):
    top_n_topics = set([j[0] for j in sorted(i, key=lambda x: x[1], reverse=True)[:top_topics] if j[1] >= prob_threshold])
    if top_n_topics.intersection(relevant_topics) != set():
        filtered_doc_id.append(idx)

print('filtered corpus size:', len(filtered_doc_id))

original corpus size: 30084
filtered corpus size: 10572


Since using `top_topics` = 1 since to work well, current filtering method will be that

## Validation

### Sampling Filtered Corpus

In [20]:
# pd.DataFrame([(i, data[i]) for i in sample(filtered_doc_id, 100)], columns=['doc_id', 'data']).to_csv('../output/temp.csv')

### Eyeballing some keywords¶

In [16]:
def display_keyword_stats(docs, filtered_doc_id, keyword):
    print(f'keyword: {keyword}')
    keyword_ids = set([idx for idx, i in enumerate(docs) if keyword in i])
    print(f'number of unique documents with \'{keyword}\': {len(keyword_ids)}')
    num_intersection = len(set(filtered_doc_id).intersection(keyword_ids))
    print(f'number of intersection with filtered corpus: {num_intersection} = {round(num_intersection/len(keyword_ids), 4)}')

In [17]:
display_keyword_stats(docs, filtered_doc_id, 'crispr')

keyword: crispr
number of unique documents with 'crispr': 258
number of intersection with filtered corpus: 246 = 0.9535


In [18]:
display_keyword_stats(docs, filtered_doc_id, 'cas9')

keyword: cas9
number of unique documents with 'cas9': 164
number of intersection with filtered corpus: 158 = 0.9634


In [50]:
display_keyword_stats(docs, filtered_doc_id, 'rna')

keyword: rna
number of unique documents with 'rna': 610
number of intersection with filtered corpus: 580 = 0.9508


In [51]:
display_keyword_stats(docs, filtered_doc_id, 'mrna')

keyword: mrna
number of unique documents with 'mrna': 168
number of intersection with filtered corpus: 164 = 0.9762


In [52]:
display_keyword_stats(docs, filtered_doc_id, 'cancer')

keyword: cancer
number of unique documents with 'cancer': 1831
number of intersection with filtered corpus: 1658 = 0.9055


In [53]:
display_keyword_stats(docs, filtered_doc_id, 'vaccine')

keyword: vaccine
number of unique documents with 'vaccine': 290
number of intersection with filtered corpus: 257 = 0.8862


In [19]:
display_keyword_stats(docs, filtered_doc_id, 'alzheimer')

keyword: alzheimer
number of unique documents with 'alzheimer': 236
number of intersection with filtered corpus: 213 = 0.9025


In [54]:
display_keyword_stats(docs, filtered_doc_id, 'insulin')

keyword: insulin
number of unique documents with 'insulin': 116
number of intersection with filtered corpus: 111 = 0.9569


In [55]:
display_keyword_stats(docs, filtered_doc_id, 'diabetes')

keyword: diabetes
number of unique documents with 'diabetes': 391
number of intersection with filtered corpus: 355 = 0.9079


In [56]:
display_keyword_stats(docs, filtered_doc_id, 'gene')

keyword: gene
number of unique documents with 'gene': 1633
number of intersection with filtered corpus: 1520 = 0.9308


In [57]:
display_keyword_stats(docs, filtered_doc_id, 'bacteria')

keyword: bacteria
number of unique documents with 'bacteria': 643
number of intersection with filtered corpus: 508 = 0.79


In [58]:
display_keyword_stats(docs, filtered_doc_id, 'leukemia')

keyword: leukemia
number of unique documents with 'leukemia': 84
number of intersection with filtered corpus: 79 = 0.9405


In [59]:
display_keyword_stats(docs, filtered_doc_id, 'immunotherapy')

keyword: immunotherapy
number of unique documents with 'immunotherapy': 208
number of intersection with filtered corpus: 198 = 0.9519


## Save Artefacts

In [60]:
filtered_data = [i for idx, i in enumerate(data) if idx in filtered_doc_id]
filtered_docs = [i for idx, i in enumerate(docs) if idx in filtered_doc_id]
len(filtered_docs)

13086

In [39]:
filtered_dict = create_dictionary(filtered_docs)

In [40]:
# Bag-of-words representation of the documents.
filtered_corpus = [filtered_dict.doc2bow(doc) for doc in filtered_docs]

In [63]:
# other artefacts
filtered_dict.save('../artefacts/filtered_top_1_dictionary')

with open('../artefacts/filtered_top_1_docs.pkl', 'wb') as f:
    pickle.dump(filtered_docs, f)

with open('../artefacts/filtered_top_doc_id.pkl', 'wb') as f:
    pickle.dump(filtered_doc_id, f)

## Train

In [64]:
%%time 

filtered_model = train_lda_mallet(filtered_corpus, filtered_dict, 300, 
                         params={
                             'mallet_path': '../mallet-2.0.8/bin/mallet',
                             'prefix_path': '../artefacts/mallet_tmp/',
                             'prefix': 'filtered_top_1'
                         })

CPU times: user 7.45 s, sys: 100 ms, total: 7.55 s
Wall time: 1min 7s


In [66]:
get_all_topics(filtered_model).head()

Unnamed: 0,Topic_Id,Topic_Keywords
0,0,"circuit, brain, neuron, cortical, cortex, acti..."
1,1,"child, adult, young, parent, study, childhood,..."
2,2,"process, multiple, stability, provide, stable,..."
3,3,"paradigm, bio, shift, goal, develop, breakthro..."
4,4,"function, regulation, homeostasis, role, regul..."


In [68]:
# check for reproducibility, should be 0.505769846580301
get_coherence(filtered_model, filtered_docs, filtered_dict)

0.505769846580301

In [72]:
save_model(filtered_model, suffix='filtered_top_1')

model saved at ../artefacts/model_filtered_top_1
