In [1]:
!ls ../data

 cordis-h2020projects.xlsx  'Text mining word list test 200823.xlsx'


In [2]:
import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

## Imports

In [65]:
import pandas as pd
from random import sample, shuffle
import pickle

from gensim.models.wrappers import ldamallet

import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
from src.gensim_helper import create_dictionary, get_coherence
from src.train import train_lda_single_core, train_lda_multi_core, train_lda_mallet
from src.process_data import process_data
from src.visualize import generate_ldavis
from src.artefacts_helper import save_model, load_model, load_mallet_model
from src.predict import predict_and_format_topics, get_topic_most_dominant_document, get_topics_distribution
from src.predict import get_term_topics, format_term_search_results, get_all_topics

## Import data

In [42]:
# Read only the sheet that holds the project records instead of parsing
# every sheet of the workbook and then picking one out of the resulting dict.
df = pd.read_excel('../data/cordis-h2020projects.xlsx', sheet_name='cordis-h2020projects')

In [43]:
df.shape

(30084, 21)

In [44]:
df.head()

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,207037,734211,INTERACT,SIGNED,H2020-EU.1.3.3.,MSCA-RISE-2016,H2020,The INTERnAtional network on Crisis Translation,2017-04-01,2020-03-31,...,"We propose to establish an interdisciplinary, ...",279000.0,229500.0,H2020-MSCA-RISE-2016,MSCA-RISE,DUBLIN CITY UNIVERSITY,IE,THE COCHRANE COLLABORATION;UNIVERSITY COLLEGE ...,UK;PT,
1,199028,686987,BrainHack,SIGNED,H2020-EU.1.2.1.,FETOPEN-CSA-FETEXCHANGE-2015,H2020,BrainHack: Bringing the arts and sciences of b...,2016-01-01,2017-12-31,...,We witness a rapid development of Brain/Neural...,567352.5,549727.0,H2020-FETOPEN-2015-CSA,CSA,TECHNISCHE UNIVERSITEIT DELFT,NL,"THE PROVOST, FELLOWS, FOUNDATION SCHOLARS & TH...",IE;PT;CZ;NL;IT;EE,
2,207221,733174,IMPACT TB,SIGNED,H2020-EU.3.1.6.,SC1-PM-21-2016,H2020,IMPACT TB: Implementing proven community-based...,2017-01-01,2019-12-31,...,The aim of this project is to assess the facil...,4912423.75,4912423.75,H2020-SC1-2016-RTD,RIA,LIVERPOOL SCHOOL OF TROPICAL MEDICINE,UK,KONINKLIJKE NEDERLANDSE CENTRALE VERENIGING TO...,NL;NP;SE;DE,
3,207786,700512,CortIMod,CLOSED,H2020-EU.1.3.2.,MSCA-IF-2015-EF,H2020,Implementation and Preliminary Validation of a...,2016-11-01,2018-11-27,...,Stroke is a leading cause of adult chronic dis...,195454.8,195454.8,H2020-MSCA-IF-2015,MSCA-IF-EF-ST,UNIVERSITY COLLEGE LONDON,UK,,,
4,198320,676144,SyDAD,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2015-ETN,H2020,Synaptic Dysfunction in Alzheimer Disease,2015-11-01,2019-10-31,...,Given an overwhelming increase of dementia cos...,3846736.44,3846736.44,H2020-MSCA-ITN-2015,MSCA-ITN-ETN,KAROLINSKA INSTITUTET,SE,AXON NEUROSCIENCE SE;DEUTSCHES ZENTRUM FUR NEU...,SK;DE;FR;IT;BE,


Only the `title` and `objective` fields are used for topic modelling; they are combined into a single text per project.

In [8]:
# Combine title and objective into one text per project.
# Missing values are replaced by an empty string first: `str + NaN` evaluates
# to NaN, which would put a float (and drop the other field) into `data`.
data = (df['title'].fillna('') + ' ' + df['objective'].fillna('')).tolist()

In [9]:
len(data)

30084

In [10]:
data[0]

'The INTERnAtional network on Crisis Translation We propose to establish an interdisciplinary, intersectoral and international research and innovation network in Crisis Translation, called INTERACT. Crisis Translation is understood here as the translation of written information from one linguistic and cultural system to another in the context of a crisis scenario, with a view to enabling affected communities and responders to be prepared for crises, improve resilience and reduce the loss of lives. Due to the transboundary nature of modern day crises, crisis communication must be multilingual and multilingual crisis communication is enabled through translation. Multilingual information access through translation addresses work programme aims such as social fairness and democratic access to essential information for all. The primary focus of INTERACT is on health-related crisis content. The main objectives of the project are 1) to make meaningful and effective contributions to knowledge,

## Processing

In [11]:
docs = process_data(data)

In [12]:
docs[0][:5]

['international', 'network', 'crisis', 'translation', 'propose']

In [13]:
dictionary = create_dictionary(docs)

In [14]:
dictionary.doc2idx(docs[0])[:5]

[55, 73, 23, 106, 86]

In [15]:
# round trip: look the first five token ids of document 0 up in the dictionary
first_token_ids = dictionary.doc2idx(docs[0])[:5]
[dictionary[token_id] for token_id in first_token_ids]

['international', 'network', 'crisis', 'translation', 'propose']

In [16]:
# Convert every tokenised document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

In [17]:
corpus[0][:5]

[(0, 2), (1, 2), (2, 1), (3, 1), (4, 2)]

In [18]:
# size of the vocabulary and of the corpus
print('Number of unique tokens: {:d}'.format(len(dictionary)))
print('Number of documents: {:d}'.format(len(corpus)))

Number of unique tokens: 22552
Number of documents: 30084


## Training

In [24]:
%%time 

model = train_lda_single_core(corpus, dictionary, 10, params={})

CPU times: user 6min, sys: 39.6 ms, total: 6min
Wall time: 6min


In [26]:
get_coherence(model, docs, dictionary)

0.5068668673180966

In [19]:
%%time 
model = train_lda_multi_core(corpus, dictionary, 10, params={})

CPU times: user 3min 48s, sys: 48 s, total: 4min 36s
Wall time: 3min 51s


In [20]:
get_coherence(model, docs, dictionary)

0.47056989952123873

## pyLDAvis Visualization

In [21]:
generate_ldavis(model, corpus, dictionary, '10_topics')

vis generated to ../references/10_topics.html


## Save Model and Dictionary

In [39]:
# NOTE(review): the training cells visible in this notebook fit 10 topics, yet
# the artefact is tagged '200_topics' — confirm that `model` really holds the
# 200-topic fit at the time this cell runs (execution counts are out of order).
save_model(model, suffix='200_topics')

In [73]:
# Persist the remaining artefacts: the gensim dictionary via its own save
# method, the bag-of-words corpus and the tokenised documents as pickles.
dictionary.save('../artefacts/dictionary')

for pickle_path, payload in (('../artefacts/corpus.pkl', corpus),
                             ('../artefacts/docs.pkl', docs)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle)

## Load 

In [24]:
lda_model = load_model(suffix='200_topics')

In [25]:
# NOTE(review): this rebinds `model`, replacing the LDA model trained above;
# every later cell that uses `model` operates on this loaded Mallet model
# (presumably 300 topics, judging by the suffix — confirm).
# `lda_model` from the previous cell is not referenced again in the visible cells.
model = load_mallet_model(artefacts_path='../artefacts', suffix='300_topics_mallet_alpha_50')

In [26]:
model.mallet_path = '../mallet-2.0.8/bin/mallet'

## Predict and Format

In [27]:
# Predict the topic distribution of the first document and show the ten
# entries with the highest probability, most probable first.
first_doc_topics = model[corpus[0]]
sorted(first_doc_topics, key=lambda topic_prob: topic_prob[1], reverse=True)[:10]

[(280, 0.13007478632478592),
 (139, 0.09054487179487151),
 (15, 0.065438034188034),
 (8, 0.03018162393162383),
 (19, 0.027510683760683673),
 (66, 0.026976495726495638),
 (39, 0.022702991452991383),
 (63, 0.014155982905982864),
 (247, 0.013087606837606798),
 (299, 0.010950854700854669)]

In [41]:
predict_df = predict_and_format_topics(ldamodel=model, corpus=corpus, texts=data)

In [45]:
# Copy the per-document prediction columns onto the project table
# (target column in df, source column in predict_df).
for target_col, source_col in (('Topic_Id', 'Dominant_Topic'),
                               ('Topic_Prob', 'Topic_Prob'),
                               ('Topic_Keywords', 'Topic_Keywords')):
    df.loc[:, target_col] = predict_df[source_col]

In [46]:
df.head()

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects,Topic_Id,Topic_Prob,Topic_Keywords
0,207037,734211,INTERACT,SIGNED,H2020-EU.1.3.3.,MSCA-RISE-2016,H2020,The INTERnAtional network on Crisis Translation,2017-04-01,2020-03-31,...,H2020-MSCA-RISE-2016,MSCA-RISE,DUBLIN CITY UNIVERSITY,IE,THE COCHRANE COLLABORATION;UNIVERSITY COLLEGE ...,UK;PT,,280.0,0.1301,"language, speech, linguistic, word, natural, m..."
1,199028,686987,BrainHack,SIGNED,H2020-EU.1.2.1.,FETOPEN-CSA-FETEXCHANGE-2015,H2020,BrainHack: Bringing the arts and sciences of b...,2016-01-01,2017-12-31,...,H2020-FETOPEN-2015-CSA,CSA,TECHNISCHE UNIVERSITEIT DELFT,NL,"THE PROVOST, FELLOWS, FOUNDATION SCHOLARS & TH...",IE;PT;CZ;NL;IT;EE,,183.0,0.1108,"science, research, scientific, conference, sci..."
2,207221,733174,IMPACT TB,SIGNED,H2020-EU.3.1.6.,SC1-PM-21-2016,H2020,IMPACT TB: Implementing proven community-based...,2017-01-01,2019-12-31,...,H2020-SC1-2016-RTD,RIA,LIVERPOOL SCHOOL OF TROPICAL MEDICINE,UK,KONINKLIJKE NEDERLANDSE CENTRALE VERENIGING TO...,NL;NP;SE;DE,,63.0,0.1312,"health, intervention, care, mental, evidence, ..."
3,207786,700512,CortIMod,CLOSED,H2020-EU.1.3.2.,MSCA-IF-2015-EF,H2020,Implementation and Preliminary Validation of a...,2016-11-01,2018-11-27,...,H2020-MSCA-IF-2015,MSCA-IF-EF-ST,UNIVERSITY COLLEGE LONDON,UK,,,,150.0,0.0991,"brain, cognitive, disorder, neural, neuroscien..."
4,198320,676144,SyDAD,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2015-ETN,H2020,Synaptic Dysfunction in Alzheimer Disease,2015-11-01,2019-10-31,...,H2020-MSCA-ITN-2015,MSCA-ITN-ETN,KAROLINSKA INSTITUTET,SE,AXON NEUROSCIENCE SE;DEUTSCHES ZENTRUM FUR NEU...,SK;DE;FR;IT;BE,,15.0,0.1909,"training, research, academic, researcher, esr,..."


In [47]:
# df.to_csv('../output/cordis-h2020projects_with_topics.csv', index=False)

In [48]:
get_topic_most_dominant_document(predict_df).head()

Unnamed: 0,Topic_Num,Document_No,Topic_Prob,Topic_Keywords,Text
0,0.0,30013,0.4259,"political, study, social, politics, discourse,...","Protest and Order. Democratic theory, contenti..."
1,1.0,6696,0.3239,"disease, inflammatory, inflammation, therapeut...",Mechanisms and regulation of inflammasome-asso...
2,2.0,7298,0.2369,"delivery, release, deliver, based, develop, oa...",Biocompatible nanoparticles for T cell targete...
3,3.0,21361,0.3374,"emission, fuel, gas, co2, carbon, reduction, c...",Heavy Duty Gas Engines integrated into Vehicle...
4,4.0,17184,0.328,"liver, development, sport, aim, major, event, ...",EXercise as a regulator of hepatic NAD metabol...


In [49]:
get_topics_distribution(predict_df).head()

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,0.0,"political, study, social, politics, discourse,...",249,0.0083
1,1.0,"disease, inflammatory, inflammation, therapeut...",122,0.0041
2,2.0,"delivery, release, deliver, based, develop, oa...",55,0.0018
3,3.0,"emission, fuel, gas, co2, carbon, reduction, c...",211,0.007
4,4.0,"liver, development, sport, aim, major, event, ...",47,0.0016


In [50]:
# Topic ids that are not the dominant topic of any document.
# A named set is used instead of `_`, which IPython reserves for the last
# cell output, and the topic count comes from the model rather than a literal.
dominant_topic_ids = set(get_topics_distribution(predict_df)['Dominant_Topic'].tolist())

print([topic_id for topic_id in range(model.num_topics) if topic_id not in dominant_topic_ids])

[13, 65, 135, 158]


In [51]:
get_all_topics(model).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Topic_Id,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Topic_Keywords,"political, study, social, politics, discourse,...","disease, inflammatory, inflammation, therapeut...","delivery, release, deliver, based, develop, oa...","emission, fuel, gas, co2, carbon, reduction, c...","liver, development, sport, aim, major, event, ...","movement, home, people, rehabilitation, life, ...","chain, supply, business, market, sector, deman...","receptor, signalling, signaling, role, signal,...","public, citizen, community, engagement, privat...","ocean, marine, sea, ice, coastal, atlantic, sh...",...,"approach, diversity, understanding, diverse, r...","skin, pain, regeneration, wound, healing, inju...","oil, friendly, eco, industry, innovative, envi...","strategy, needed, target, anti, effective, ide...","data, database, information, web, tool, open, ...","primary, mining, secondary, rare, critical, ex...","magnetic, field, spin, magnet, resonance, elec...","virus, vaccine, disease, infection, viral, vec...","light, led, shed, lighting, colour, display, o...","research, science, philosophy, practice, relat..."


In [52]:
# Document counts/shares per dominant topic, joined onto the full topic list.
# The left join keeps topics that are never dominant.
dist_df = get_topics_distribution(predict_df)[['Dominant_Topic', 'Num_Documents', 'Perc_Documents']]
topics_df = (
    get_all_topics(model)
    .merge(dist_df, how='left', left_on='Topic_Id', right_on='Dominant_Topic')
    .drop(columns='Dominant_Topic')
)

In [53]:
topics_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Topic_Id,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Topic_Keywords,"political, study, social, politics, discourse,...","disease, inflammatory, inflammation, therapeut...","delivery, release, deliver, based, develop, oa...","emission, fuel, gas, co2, carbon, reduction, c...","liver, development, sport, aim, major, event, ...","movement, home, people, rehabilitation, life, ...","chain, supply, business, market, sector, deman...","receptor, signalling, signaling, role, signal,...","public, citizen, community, engagement, privat...","ocean, marine, sea, ice, coastal, atlantic, sh...",...,"approach, diversity, understanding, diverse, r...","skin, pain, regeneration, wound, healing, inju...","oil, friendly, eco, industry, innovative, envi...","strategy, needed, target, anti, effective, ide...","data, database, information, web, tool, open, ...","primary, mining, secondary, rare, critical, ex...","magnetic, field, spin, magnet, resonance, elec...","virus, vaccine, disease, infection, viral, vec...","light, led, shed, lighting, colour, display, o...","research, science, philosophy, practice, relat..."
Num_Documents,249,122,55,211,47,139,58,126,103,151,...,6,98,42,22,88,55,138,226,76,100
Perc_Documents,0.0083,0.0041,0.0018,0.007,0.0016,0.0046,0.0019,0.0042,0.0034,0.005,...,0.0002,0.0033,0.0014,0.0007,0.0029,0.0018,0.0046,0.0075,0.0025,0.0033


In [166]:
# topics_df.to_csv('../output/topics_300.csv', index=False)

## Search Terms

In [56]:
# Read only 'Sheet2' (no header row) instead of parsing every sheet, and keep
# the raw table and the flattened list under separate names.
search_terms_raw = pd.read_excel('../data/Text mining word list test 200823.xlsx',
                                 sheet_name='Sheet2', header=None)
# Flatten all cells of the sheet into a single list of search terms.
search_terms = search_terms_raw.values.flatten().tolist()

In [58]:
len(search_terms)

632

In [59]:
print(sample(search_terms, 10))

['gene regulatory', 'microbial', 'newborns', 'endonuclease', 'lymphoblastic leukemia', 'hemorrhage', 'mast cell', 'cardiac', 'autoimmune', 'medical device']


In [60]:
search_docs = process_data(search_terms)
search_docs[:5]

[['3d'], ['3d', 'genome'], ['3d', 'bioprinting'], ['acne'], ['acne', 'skin']]

In [61]:
# use list of vocabs as input to a dictionary
search_dict = create_dictionary(search_docs, filter_extreme=False)

In [62]:
# Search tokens that do not occur in the corpus dictionary.
# A plain set difference is enough: A - (B & A) is the same set as A - B.
set(search_dict.values()) - set(dictionary.values())

{'adipogenesis',
 'amnesic',
 'analgesia',
 'autoinflammation',
 'colo',
 'exonuclease',
 'flagellin',
 'funtional',
 'gluconeogenesis',
 'jaundice',
 'mait',
 'micropeptides',
 'multiomics',
 'muscoloskeletal',
 'myeloablation',
 'neuromotor',
 'nutrigenomics',
 'postpartum',
 'psychomotor',
 'rheumatic'}

In [63]:
# number of search tokens that do not occur in the corpus dictionary
len(set(search_dict.values()) - set(dictionary.values()))

20

In [64]:
dictionary.doc2idx(['amnesic'])

[-1]

### Examining topics extracted from searching words in dictionary

In [66]:
# Look up the topics associated with every search token, using the Mallet
# model converted to a gensim LDA model. Tokens with an empty result are left out.
temp_model = ldamallet.malletmodel2ldamodel(model)
search_term_predictions = {}
for token in search_dict.values():
    term_topics = get_term_topics(temp_model, dictionary, token)
    if not term_topics:
        continue
    search_term_predictions[token] = term_topics

In [67]:
len(search_term_predictions)

269

In [68]:
# Share of search tokens for which the model returned at least one topic:
# 269 of the 569 tokens in search_dict. The denominator still includes the
# 20 tokens that do not occur in the corpus dictionary at all.
len(search_term_predictions) / len(search_dict)

0.4727592267135325

__Get topics and keywords__

In [69]:
search_terms_topic_df = format_term_search_results(model, search_term_predictions)
search_terms_topic_df.head(10)

Unnamed: 0,Search_Term,Topic_ID,Topic_Prob,Topic_Keywords
0,3d,44,0.257112,"3d, printing, technology, printed, based, ink,..."
1,genome,54,0.052382,"gene, genetic, genome, expression, mutation, m..."
2,genome,103,0.030095,"dna, epigenetic, chromatin, replication, genom..."
3,skin,291,0.063733,"skin, pain, regeneration, wound, healing, inju..."
4,acoustic,226,0.051384,"sound, music, acoustic, performance, creative,..."
5,biomaterials,111,0.015502,"bone, implant, joint, tissue, cartilage, bioma..."
6,active,101,0.200965,"active, based, aim, passive, develop, proposed..."
7,ageing,266,0.066623,"life, ageing, people, age, population, aging, ..."
8,adverse,86,0.010158,"effect, impact, side, study, affect, outcome, ..."
9,effect,86,0.305103,"effect, impact, side, study, affect, outcome, ..."


In [165]:
search_terms_topic_df.to_csv('../output/search_terms_to_topic.csv', index=False)

__More important keywords highlighted by Wenyu__

In [70]:
search_dict.token2id['crispr']

121

In [71]:
search_dict.token2id['rna']

125

In [72]:
search_terms_topic_df[search_terms_topic_df['Search_Term'] == 'crispr']

Unnamed: 0,Search_Term,Topic_ID,Topic_Prob,Topic_Keywords
129,crispr,54,0.017155,"gene, genetic, genome, expression, mutation, m..."


In [73]:
search_terms_topic_df[search_terms_topic_df['Search_Term'] == 'rna']

Unnamed: 0,Search_Term,Topic_ID,Topic_Prob,Topic_Keywords
133,rna,189,0.077636,"protein, rna, mrna, function, gene, expression..."


In [74]:
model.show_topic(189)

[('protein', 0.1646944316377235),
 ('rna', 0.07765995594487987),
 ('mrna', 0.02110547615388556),
 ('function', 0.021054249270016905),
 ('gene', 0.01946621587008862),
 ('expression', 0.019005173915270734),
 ('modification', 0.01849290507658419),
 ('cell', 0.01654628348957533),
 ('translation', 0.016034014650888787),
 ('cellular', 0.014343527483223196)]

In [76]:
model.show_topic(54)

[('gene', 0.12105997210599721),
 ('genetic', 0.06606893803546524),
 ('genome', 0.05240087666865909),
 ('expression', 0.036182506475393506),
 ('mutation', 0.02976688583383144),
 ('molecular', 0.02327156804144252),
 ('genomic', 0.020800956365809923),
 ('crispr', 0.017174736003187887),
 ('identify', 0.01701534170153417),
 ('sequencing', 0.016497310221159595)]