In [2]:
!ls ../data

'Associated words.xlsx'     'Text mining word list test 200823.xlsx'
 cordis-h2020projects.xlsx   topics_300_SYinput_LW.csv


In [3]:
import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

## Imports

In [4]:
import pandas as pd
from random import sample, shuffle
import pickle

from gensim.models.wrappers import ldamallet

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from src.gensim_helper import create_dictionary, get_coherence
from src.train import train_lda_single_core, train_lda_multi_core, train_lda_mallet
from src.process_data import process_data
from src.visualize import generate_ldavis
from src.artefacts_helper import save_model, load_model, load_mallet_model
from src.predict import predict_and_format_topics, get_topic_most_dominant_document, get_topics_distribution
from src.predict import get_term_topics, format_term_search_results, get_all_topics

[nltk_data] Downloading package wordnet to /home/kohkb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Import data

In [6]:
# Load only the sheet that is actually used, instead of parsing every sheet of
# the workbook into a dict and then keeping a single entry.
df = pd.read_excel('../data/cordis-h2020projects.xlsx', sheet_name='cordis-h2020projects')

In [7]:
df.shape

(30084, 21)

In [8]:
df.head()

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,207037,734211,INTERACT,SIGNED,H2020-EU.1.3.3.,MSCA-RISE-2016,H2020,The INTERnAtional network on Crisis Translation,2017-04-01,2020-03-31,...,"We propose to establish an interdisciplinary, ...",279000.0,229500.0,H2020-MSCA-RISE-2016,MSCA-RISE,DUBLIN CITY UNIVERSITY,IE,THE COCHRANE COLLABORATION;UNIVERSITY COLLEGE ...,UK;PT,
1,199028,686987,BrainHack,SIGNED,H2020-EU.1.2.1.,FETOPEN-CSA-FETEXCHANGE-2015,H2020,BrainHack: Bringing the arts and sciences of b...,2016-01-01,2017-12-31,...,We witness a rapid development of Brain/Neural...,567352.5,549727.0,H2020-FETOPEN-2015-CSA,CSA,TECHNISCHE UNIVERSITEIT DELFT,NL,"THE PROVOST, FELLOWS, FOUNDATION SCHOLARS & TH...",IE;PT;CZ;NL;IT;EE,
2,207221,733174,IMPACT TB,SIGNED,H2020-EU.3.1.6.,SC1-PM-21-2016,H2020,IMPACT TB: Implementing proven community-based...,2017-01-01,2019-12-31,...,The aim of this project is to assess the facil...,4912423.75,4912423.75,H2020-SC1-2016-RTD,RIA,LIVERPOOL SCHOOL OF TROPICAL MEDICINE,UK,KONINKLIJKE NEDERLANDSE CENTRALE VERENIGING TO...,NL;NP;SE;DE,
3,207786,700512,CortIMod,CLOSED,H2020-EU.1.3.2.,MSCA-IF-2015-EF,H2020,Implementation and Preliminary Validation of a...,2016-11-01,2018-11-27,...,Stroke is a leading cause of adult chronic dis...,195454.8,195454.8,H2020-MSCA-IF-2015,MSCA-IF-EF-ST,UNIVERSITY COLLEGE LONDON,UK,,,
4,198320,676144,SyDAD,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2015-ETN,H2020,Synaptic Dysfunction in Alzheimer Disease,2015-11-01,2019-10-31,...,Given an overwhelming increase of dementia cos...,3846736.44,3846736.44,H2020-MSCA-ITN-2015,MSCA-ITN-ETN,KAROLINSKA INSTITUTET,SE,AXON NEUROSCIENCE SE;DEUTSCHES ZENTRUM FUR NEU...,SK;DE;FR;IT;BE,


Only the `title` and `objective` fields are used for topic modelling; they are concatenated into one text per project.

In [9]:
# Combine title and objective into one text per project.
# Missing values are replaced by '' first: `str + NaN` evaluates to NaN, so a
# project lacking either field would otherwise lose the text of the other
# field as well and end up as a float in the list of documents.
data = (df['title'].fillna('') + ' ' + df['objective'].fillna('')).values.tolist()

In [10]:
len(data)

30084

In [11]:
data[0]

'The INTERnAtional network on Crisis Translation We propose to establish an interdisciplinary, intersectoral and international research and innovation network in Crisis Translation, called INTERACT. Crisis Translation is understood here as the translation of written information from one linguistic and cultural system to another in the context of a crisis scenario, with a view to enabling affected communities and responders to be prepared for crises, improve resilience and reduce the loss of lives. Due to the transboundary nature of modern day crises, crisis communication must be multilingual and multilingual crisis communication is enabled through translation. Multilingual information access through translation addresses work programme aims such as social fairness and democratic access to essential information for all. The primary focus of INTERACT is on health-related crisis content. The main objectives of the project are 1) to make meaningful and effective contributions to knowledge,

## Processing

In [22]:
docs = process_data(data)

In [23]:
docs[0][:5]

['international', 'network', 'crisis', 'translation', 'propose']

In [24]:
dictionary = create_dictionary(docs)

In [25]:
dictionary.doc2idx(docs[0])[:5]

[55, 73, 23, 106, 86]

In [26]:
# Map the first five token ids of the first document back to their tokens.
first_ids = dictionary.doc2idx(docs[0])[:5]
[dictionary[token_id] for token_id in first_ids]

['international', 'network', 'crisis', 'translation', 'propose']

In [27]:
# Bag-of-words representation: one doc2bow vector per tokenised document.
corpus = list(map(dictionary.doc2bow, docs))

In [28]:
corpus[0][:5]

[(0, 2), (1, 2), (2, 1), (3, 1), (4, 2)]

In [29]:
# Report vocabulary size and document count.
n_tokens, n_docs = len(dictionary), len(corpus)
print('Number of unique tokens: %d' % n_tokens)
print('Number of documents: %d' % n_docs)

Number of unique tokens: 22552
Number of documents: 30084


## Training

In [24]:
%%time 

model = train_lda_single_core(corpus, dictionary, 10, params={})

CPU times: user 6min, sys: 39.6 ms, total: 6min
Wall time: 6min


In [26]:
get_coherence(model, docs, dictionary)

0.5068668673180966

In [19]:
%%time 
model = train_lda_multi_core(corpus, dictionary, 10, params={})

CPU times: user 3min 48s, sys: 48 s, total: 4min 36s
Wall time: 3min 51s


In [20]:
get_coherence(model, docs, dictionary)

0.47056989952123873

In [24]:
%%time 

model = train_lda_mallet(corpus, dictionary, 300, 
                         params={
                             'mallet_path': '../mallet-2.0.8/bin/mallet',
                             'prefix_path': '../artefacts/mallet_tmp/',
                             'prefix': '300_topics_mallet_alpha_50'
                         })

CPU times: user 16.5 s, sys: 137 ms, total: 16.6 s
Wall time: 2min 15s


In [25]:
get_coherence(model, docs, dictionary)

0.5301726081250121

## pyLDAvis Visualization

In [21]:
generate_ldavis(model, corpus, dictionary, '10_topics')

vis generated to ../references/10_topics.html


## Save Model and Dictionary

In [39]:
# Persist the current `model` under the given suffix.
# NOTE(review): the suffix says 200 topics, but the training cells in this
# notebook build 10-topic (single/multi core) or 300-topic (MALLET) models —
# confirm which model this artefact really holds.
save_model(model, suffix='200_topics')

In [26]:
# other artefacts
dictionary.save('../artefacts/dictionary')

In [73]:
# Pickle the bag-of-words corpus and the tokenised documents under ../artefacts.
for artefact_path, artefact in (
    ('../artefacts/corpus.pkl', corpus),
    ('../artefacts/docs.pkl', docs),
):
    with open(artefact_path, 'wb') as f:
        pickle.dump(artefact, f)

## Load 

In [24]:
lda_model = load_model(suffix='200_topics')

In [18]:
model = load_mallet_model(artefacts_path='../artefacts', suffix='300_topics_mallet_alpha_50')

In [19]:
# Override the path attributes on the loaded MALLET model with the same
# binary path and temp directory that were passed at training time.
# NOTE(review): training passed `prefix_path` and `prefix` separately, while
# only the directory is assigned to `prefix` here — confirm this matches the
# prefix the wrapper expects for its temp files.
model.mallet_path = '../mallet-2.0.8/bin/mallet'
model.prefix = '../artefacts/mallet_tmp/'

In [22]:
# for reproducibility: 0.5301726081250121
get_coherence(model, docs, dictionary)

0.5301726081250121

In [20]:
get_all_topics(model).head()

Unnamed: 0,Topic_Id,Topic_Keywords
0,0,"political, study, social, politics, discourse,..."
1,1,"disease, inflammatory, inflammation, therapeut..."
2,2,"delivery, release, deliver, based, develop, oa..."
3,3,"emission, fuel, gas, co2, carbon, reduction, c..."
4,4,"liver, development, sport, aim, major, event, ..."


## Predict and Format

In [27]:
# Topic distribution of the first document, ten most probable topics first.
first_doc_topics = model[corpus[0]]
sorted(first_doc_topics, key=lambda topic_prob: topic_prob[1], reverse=True)[:10]

[(280, 0.13007478632478592),
 (139, 0.09054487179487151),
 (15, 0.065438034188034),
 (8, 0.03018162393162383),
 (19, 0.027510683760683673),
 (66, 0.026976495726495638),
 (39, 0.022702991452991383),
 (63, 0.014155982905982864),
 (247, 0.013087606837606798),
 (299, 0.010950854700854669)]

In [41]:
predict_df = predict_and_format_topics(ldamodel=model, corpus=corpus, texts=data)

In [45]:
# Copy the per-document prediction columns onto the project table
# (target column in `df`, source column in `predict_df`).
for target_col, source_col in (
    ('Topic_Id', 'Dominant_Topic'),
    ('Topic_Prob', 'Topic_Prob'),
    ('Topic_Keywords', 'Topic_Keywords'),
):
    df.loc[:, target_col] = predict_df[source_col]

In [46]:
df.head()

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects,Topic_Id,Topic_Prob,Topic_Keywords
0,207037,734211,INTERACT,SIGNED,H2020-EU.1.3.3.,MSCA-RISE-2016,H2020,The INTERnAtional network on Crisis Translation,2017-04-01,2020-03-31,...,H2020-MSCA-RISE-2016,MSCA-RISE,DUBLIN CITY UNIVERSITY,IE,THE COCHRANE COLLABORATION;UNIVERSITY COLLEGE ...,UK;PT,,280.0,0.1301,"language, speech, linguistic, word, natural, m..."
1,199028,686987,BrainHack,SIGNED,H2020-EU.1.2.1.,FETOPEN-CSA-FETEXCHANGE-2015,H2020,BrainHack: Bringing the arts and sciences of b...,2016-01-01,2017-12-31,...,H2020-FETOPEN-2015-CSA,CSA,TECHNISCHE UNIVERSITEIT DELFT,NL,"THE PROVOST, FELLOWS, FOUNDATION SCHOLARS & TH...",IE;PT;CZ;NL;IT;EE,,183.0,0.1108,"science, research, scientific, conference, sci..."
2,207221,733174,IMPACT TB,SIGNED,H2020-EU.3.1.6.,SC1-PM-21-2016,H2020,IMPACT TB: Implementing proven community-based...,2017-01-01,2019-12-31,...,H2020-SC1-2016-RTD,RIA,LIVERPOOL SCHOOL OF TROPICAL MEDICINE,UK,KONINKLIJKE NEDERLANDSE CENTRALE VERENIGING TO...,NL;NP;SE;DE,,63.0,0.1312,"health, intervention, care, mental, evidence, ..."
3,207786,700512,CortIMod,CLOSED,H2020-EU.1.3.2.,MSCA-IF-2015-EF,H2020,Implementation and Preliminary Validation of a...,2016-11-01,2018-11-27,...,H2020-MSCA-IF-2015,MSCA-IF-EF-ST,UNIVERSITY COLLEGE LONDON,UK,,,,150.0,0.0991,"brain, cognitive, disorder, neural, neuroscien..."
4,198320,676144,SyDAD,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2015-ETN,H2020,Synaptic Dysfunction in Alzheimer Disease,2015-11-01,2019-10-31,...,H2020-MSCA-ITN-2015,MSCA-ITN-ETN,KAROLINSKA INSTITUTET,SE,AXON NEUROSCIENCE SE;DEUTSCHES ZENTRUM FUR NEU...,SK;DE;FR;IT;BE,,15.0,0.1909,"training, research, academic, researcher, esr,..."


In [47]:
# df.to_csv('../output/cordis-h2020projects_with_topics.csv', index=False)

In [48]:
get_topic_most_dominant_document(predict_df).head()

Unnamed: 0,Topic_Num,Document_No,Topic_Prob,Topic_Keywords,Text
0,0.0,30013,0.4259,"political, study, social, politics, discourse,...","Protest and Order. Democratic theory, contenti..."
1,1.0,6696,0.3239,"disease, inflammatory, inflammation, therapeut...",Mechanisms and regulation of inflammasome-asso...
2,2.0,7298,0.2369,"delivery, release, deliver, based, develop, oa...",Biocompatible nanoparticles for T cell targete...
3,3.0,21361,0.3374,"emission, fuel, gas, co2, carbon, reduction, c...",Heavy Duty Gas Engines integrated into Vehicle...
4,4.0,17184,0.328,"liver, development, sport, aim, major, event, ...",EXercise as a regulator of hepatic NAD metabol...


In [49]:
get_topics_distribution(predict_df).head()

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,0.0,"political, study, social, politics, discourse,...",249,0.0083
1,1.0,"disease, inflammatory, inflammation, therapeut...",122,0.0041
2,2.0,"delivery, release, deliver, based, develop, oa...",55,0.0018
3,3.0,"emission, fuel, gas, co2, carbon, reduction, c...",211,0.007
4,4.0,"liver, development, sport, aim, major, event, ...",47,0.0016


In [50]:
# List the topic ids that are not the dominant topic of any document.
# The ids are collected into a set for O(1) membership tests and bound to a
# descriptive name rather than `_`, which IPython uses for the last output.
# Dominant_Topic values are floats (e.g. 0.0); they compare and hash equal to
# the matching ints produced by range().
N_TOPICS = 300  # topic count passed to train_lda_mallet
dominant_topic_ids = set(get_topics_distribution(predict_df)['Dominant_Topic'].values.tolist())

print([topic_id for topic_id in range(N_TOPICS) if topic_id not in dominant_topic_ids])

[13, 65, 135, 158]


In [51]:
get_all_topics(model).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Topic_Id,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Topic_Keywords,"political, study, social, politics, discourse,...","disease, inflammatory, inflammation, therapeut...","delivery, release, deliver, based, develop, oa...","emission, fuel, gas, co2, carbon, reduction, c...","liver, development, sport, aim, major, event, ...","movement, home, people, rehabilitation, life, ...","chain, supply, business, market, sector, deman...","receptor, signalling, signaling, role, signal,...","public, citizen, community, engagement, privat...","ocean, marine, sea, ice, coastal, atlantic, sh...",...,"approach, diversity, understanding, diverse, r...","skin, pain, regeneration, wound, healing, inju...","oil, friendly, eco, industry, innovative, envi...","strategy, needed, target, anti, effective, ide...","data, database, information, web, tool, open, ...","primary, mining, secondary, rare, critical, ex...","magnetic, field, spin, magnet, resonance, elec...","virus, vaccine, disease, infection, viral, vec...","light, led, shed, lighting, colour, display, o...","research, science, philosophy, practice, relat..."


In [52]:
# Keywords for every topic, left-joined with the number and share of
# documents for which that topic is dominant.
all_topics_df = get_all_topics(model)
dist_columns = ['Dominant_Topic', 'Num_Documents', 'Perc_Documents']
dist_df = get_topics_distribution(predict_df)[dist_columns]
topics_df = (
    all_topics_df
    .merge(dist_df, how='left', left_on='Topic_Id', right_on='Dominant_Topic')
    .drop(columns='Dominant_Topic')
)

In [53]:
topics_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Topic_Id,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Topic_Keywords,"political, study, social, politics, discourse,...","disease, inflammatory, inflammation, therapeut...","delivery, release, deliver, based, develop, oa...","emission, fuel, gas, co2, carbon, reduction, c...","liver, development, sport, aim, major, event, ...","movement, home, people, rehabilitation, life, ...","chain, supply, business, market, sector, deman...","receptor, signalling, signaling, role, signal,...","public, citizen, community, engagement, privat...","ocean, marine, sea, ice, coastal, atlantic, sh...",...,"approach, diversity, understanding, diverse, r...","skin, pain, regeneration, wound, healing, inju...","oil, friendly, eco, industry, innovative, envi...","strategy, needed, target, anti, effective, ide...","data, database, information, web, tool, open, ...","primary, mining, secondary, rare, critical, ex...","magnetic, field, spin, magnet, resonance, elec...","virus, vaccine, disease, infection, viral, vec...","light, led, shed, lighting, colour, display, o...","research, science, philosophy, practice, relat..."
Num_Documents,249,122,55,211,47,139,58,126,103,151,...,6,98,42,22,88,55,138,226,76,100
Perc_Documents,0.0083,0.0041,0.0018,0.007,0.0016,0.0046,0.0019,0.0042,0.0034,0.005,...,0.0002,0.0033,0.0014,0.0007,0.0029,0.0018,0.0046,0.0075,0.0025,0.0033


In [166]:
# topics_df.to_csv('../output/topics_300.csv', index=False)

## Search Terms

In [56]:
# Read only the sheet holding the search terms (it has no header row) instead
# of loading every sheet of the workbook, and keep the raw frame under its own
# name so `search_terms` is always the flat list of cell values.
search_terms_raw = pd.read_excel('../data/Text mining word list test 200823.xlsx', sheet_name='Sheet2', header=None)
search_terms = search_terms_raw.values.flatten().tolist()

In [58]:
len(search_terms)

632

In [59]:
print(sample(search_terms, 10))

['gene regulatory', 'microbial', 'newborns', 'endonuclease', 'lymphoblastic leukemia', 'hemorrhage', 'mast cell', 'cardiac', 'autoimmune', 'medical device']


In [60]:
search_docs = process_data(search_terms)
search_docs[:5]

[['3d'], ['3d', 'genome'], ['3d', 'bioprinting'], ['acne'], ['acne', 'skin']]

In [61]:
# use list of vocabs as input to a dictionary
search_dict = create_dictionary(search_docs, filter_extreme=False)

In [62]:
# Tokens of the search vocabulary that do not occur in the corpus dictionary.
# Removing the intersection of both vocabularies is the same as removing the
# corpus vocabulary itself, so a plain set difference is enough.
set(search_dict.values()).difference(dictionary.values())

{'adipogenesis',
 'amnesic',
 'analgesia',
 'autoinflammation',
 'colo',
 'exonuclease',
 'flagellin',
 'funtional',
 'gluconeogenesis',
 'jaundice',
 'mait',
 'micropeptides',
 'multiomics',
 'muscoloskeletal',
 'myeloablation',
 'neuromotor',
 'nutrigenomics',
 'postpartum',
 'psychomotor',
 'rheumatic'}

In [63]:
# Number of search tokens that do not occur in the corpus dictionary.
len(set(search_dict.values()).difference(dictionary.values()))

20

In [64]:
dictionary.doc2idx(['amnesic'])

[-1]

### Examining topics extracted from searching words in dictionary

In [66]:
# Look up the topics for every token of the search vocabulary and keep only
# the tokens for which the lookup returned something.
temp_model = ldamallet.malletmodel2ldamodel(model)
term_lookups = (
    (term, get_term_topics(temp_model, dictionary, term))
    for term in search_dict.values()
)
search_term_predictions = {term: topics for term, topics in term_lookups if topics}

In [67]:
len(search_term_predictions)

269

In [68]:
# Share of search tokens for which a topic was returned: 269 of the 569 tokens
# in search_dict. The denominator is the whole search vocabulary, i.e. it still
# includes the 20 tokens that are not in `dictionary`.
len(search_term_predictions) / len(search_dict)

0.4727592267135325

__Get topics and keywords__

In [69]:
search_terms_topic_df = format_term_search_results(model, search_term_predictions)
search_terms_topic_df.head(10)

Unnamed: 0,Search_Term,Topic_ID,Topic_Prob,Topic_Keywords
0,3d,44,0.257112,"3d, printing, technology, printed, based, ink,..."
1,genome,54,0.052382,"gene, genetic, genome, expression, mutation, m..."
2,genome,103,0.030095,"dna, epigenetic, chromatin, replication, genom..."
3,skin,291,0.063733,"skin, pain, regeneration, wound, healing, inju..."
4,acoustic,226,0.051384,"sound, music, acoustic, performance, creative,..."
5,biomaterials,111,0.015502,"bone, implant, joint, tissue, cartilage, bioma..."
6,active,101,0.200965,"active, based, aim, passive, develop, proposed..."
7,ageing,266,0.066623,"life, ageing, people, age, population, aging, ..."
8,adverse,86,0.010158,"effect, impact, side, study, affect, outcome, ..."
9,effect,86,0.305103,"effect, impact, side, study, affect, outcome, ..."


In [165]:
search_terms_topic_df.to_csv('../output/search_terms_to_topic.csv', index=False)

__More important keywords highlighted by Wenyu__

In [70]:
search_dict.token2id['crispr']

121

In [71]:
search_dict.token2id['rna']

125

In [72]:
search_terms_topic_df[search_terms_topic_df['Search_Term'] == 'crispr']

Unnamed: 0,Search_Term,Topic_ID,Topic_Prob,Topic_Keywords
129,crispr,54,0.017155,"gene, genetic, genome, expression, mutation, m..."


In [73]:
search_terms_topic_df[search_terms_topic_df['Search_Term'] == 'rna']

Unnamed: 0,Search_Term,Topic_ID,Topic_Prob,Topic_Keywords
133,rna,189,0.077636,"protein, rna, mrna, function, gene, expression..."


In [74]:
model.show_topic(189)

[('protein', 0.1646944316377235),
 ('rna', 0.07765995594487987),
 ('mrna', 0.02110547615388556),
 ('function', 0.021054249270016905),
 ('gene', 0.01946621587008862),
 ('expression', 0.019005173915270734),
 ('modification', 0.01849290507658419),
 ('cell', 0.01654628348957533),
 ('translation', 0.016034014650888787),
 ('cellular', 0.014343527483223196)]

In [76]:
model.show_topic(54)

[('gene', 0.12105997210599721),
 ('genetic', 0.06606893803546524),
 ('genome', 0.05240087666865909),
 ('expression', 0.036182506475393506),
 ('mutation', 0.02976688583383144),
 ('molecular', 0.02327156804144252),
 ('genomic', 0.020800956365809923),
 ('crispr', 0.017174736003187887),
 ('identify', 0.01701534170153417),
 ('sequencing', 0.016497310221159595)]

### Search terms

In [53]:
# NOTE(review): this repeats the loading steps of the "Search Terms" section
# earlier in the notebook; consider reusing those objects instead.
# Read only the sheet holding the search terms (it has no header row) instead
# of loading every sheet of the workbook.
search_terms_raw = pd.read_excel('../data/Text mining word list test 200823.xlsx', sheet_name='Sheet2', header=None)
search_terms = search_terms_raw.values.flatten().tolist()
search_docs = process_data(search_terms)

# use list of vocabs as input to a dictionary
search_dict = create_dictionary(search_docs, filter_extreme=False)

In [58]:
# examining topics extracted from searching words in dictionary
# NOTE(review): `filtered_dict` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — add the cell that builds it
# (and the model trained on it) before this point.
search_term_predictions = {}
temp_model = ldamallet.malletmodel2ldamodel(model)
for i in search_dict.values():
    results = get_term_topics(temp_model, filtered_dict, i)
    if results:    # also removes empty results
        search_term_predictions[i] = results

In [60]:
print(len(search_term_predictions))

# share of the search tokens in search_dict for which a topic was returned (326 of 569 in this run)
len(search_term_predictions) / len(search_dict)

326


0.5729349736379613

In [56]:
print(get_term_topics(temp_model, filtered_dict, 'rna'))
print(topics_df[topics_df['Topic_Id'] == 111]['Topic_Keywords'].values)

[(111, 0.1469877343272862)]
['rna, mrna, translation, coding, function, regulation, expression, small, mirnas, protein']


In [57]:
print(get_term_topics(temp_model, filtered_dict, 'crispr'))
print(topics_df[topics_df['Topic_Id'] == 40]['Topic_Keywords'].values)

[(40, 0.02971713213547371)]
['gene, genome, expression, genetic, mutation, crispr, editing, functional, identify, cas9']


In [63]:
print(get_term_topics(temp_model, filtered_dict, '3d'))
print(topics_df[topics_df['Topic_Id'] == 144]['Topic_Keywords'].values)
print(topics_df[topics_df['Topic_Id'] == 293]['Topic_Keywords'].values)

[(144, 0.2478588130163367), (293, 0.016329431336412268)]
['3d, image, object, camera, vision, computer, 2d, dimensional, reconstruction, capture']
['tissue, regeneration, cell, regenerative, scaffold, vitro, biomaterials, oa, engineering, repair']


In [None]:
# sample words not found
# Rebuilds search_term_predictions so the following cells can list the search
# tokens that got no topic.
# NOTE(review): this loop is a verbatim copy of the earlier `filtered_dict`
# loop and the cell has no execution count; `filtered_dict` is not defined in
# any visible cell — consider deleting this cell or factoring the loop into a
# helper.
search_term_predictions = {}
temp_model = ldamallet.malletmodel2ldamodel(model)
for i in search_dict.values():
    results = get_term_topics(temp_model, filtered_dict, i)
    if results:    # also removes empty results
        search_term_predictions[i] = results

In [68]:
# Number of search tokens for which no topic was returned.
sum(term not in search_term_predictions for term in search_dict.values())

243

In [73]:
# Show 20 randomly chosen search tokens for which no topic was returned.
unmatched_terms = [term for term in search_dict.values() if term not in search_term_predictions]
print(sample(unmatched_terms, 20))

['mesenchymal', 'neurotransmitter', 'myopia', 'noncoding', 'neuromotor', 'flu', 'dendritic', 'borne', 'endonuclease', 'biomanufacturing', 'amnesic', 'organoid', 'activated', 'micropeptides', 'cardiometabolic', 'neoplasm', 'bowel', 'hypoxic', 'reticulum', 'allogeneic']


In [77]:
# Reload the per-project topic assignments and the topic table from ../output
# for comparison in the cells below.
# NOTE(review): the `to_csv` cells that write these two files are commented
# out in this notebook, so the files must already exist from an earlier run.
# NOTE(review): `_` is also IPython's "last output" variable; a descriptive
# name would be safer, but the following cells read `_`, so it is kept here.
_ = pd.read_csv('../output/cordis-h2020projects_with_topics.csv')
old_topics_df = pd.read_csv('../output/topics_300.csv')

In [102]:
old_topics_df.loc[[54]]

Unnamed: 0,Topic_Id,Topic_Keywords,Num_Documents,Perc_Documents
54,54,"gene, genetic, genome, expression, mutation, m...",179.0,0.006


In [100]:
len(_.loc[['crispr' in i for i in _['Topic_Keywords']]])

179

In [112]:
set(_.loc[['crispr' in i for i in _['Topic_Keywords']]].index.values)

{15,
 162,
 290,
 444,
 457,
 869,
 946,
 994,
 995,
 1001,
 1005,
 1007,
 1008,
 1017,
 1093,
 1224,
 1225,
 1392,
 1625,
 1878,
 1879,
 1976,
 3626,
 3916,
 4247,
 4263,
 4380,
 4505,
 4514,
 4515,
 5137,
 5161,
 5189,
 5657,
 5832,
 6427,
 6432,
 6436,
 6562,
 6567,
 6570,
 6574,
 6576,
 6583,
 6588,
 6597,
 6598,
 6619,
 6689,
 6854,
 6870,
 6886,
 6891,
 7131,
 7316,
 7500,
 7595,
 7669,
 8493,
 8650,
 9060,
 9062,
 9071,
 9075,
 9076,
 9085,
 9088,
 9089,
 9091,
 9107,
 9363,
 9418,
 9666,
 9848,
 10004,
 10014,
 10015,
 10017,
 10023,
 10034,
 10374,
 10390,
 10507,
 10515,
 10601,
 10618,
 10669,
 10738,
 10743,
 10826,
 11066,
 11514,
 11682,
 11737,
 11804,
 11823,
 12179,
 13482,
 13563,
 15009,
 15040,
 15552,
 15627,
 15693,
 15818,
 16664,
 16706,
 17521,
 18168,
 18288,
 18370,
 18378,
 18600,
 18671,
 19143,
 19144,
 19159,
 19162,
 19382,
 19385,
 19940,
 20001,
 20179,
 20517,
 20627,
 20786,
 21316,
 21602,
 21694,
 22367,
 22368,
 22428,
 22454,
 22461,
 22462,
 224

In [113]:
_.head()

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects,Topic_Id,Topic_Prob,Topic_Keywords
0,207037,734211,INTERACT,SIGNED,H2020-EU.1.3.3.,MSCA-RISE-2016,H2020,The INTERnAtional network on Crisis Translation,2017-04-01,2020-03-31,...,H2020-MSCA-RISE-2016,MSCA-RISE,DUBLIN CITY UNIVERSITY,IE,THE COCHRANE COLLABORATION;UNIVERSITY COLLEGE ...,UK;PT,,280.0,0.1301,"language, speech, linguistic, word, natural, m..."
1,199028,686987,BrainHack,SIGNED,H2020-EU.1.2.1.,FETOPEN-CSA-FETEXCHANGE-2015,H2020,BrainHack: Bringing the arts and sciences of b...,2016-01-01,2017-12-31,...,H2020-FETOPEN-2015-CSA,CSA,TECHNISCHE UNIVERSITEIT DELFT,NL,"THE PROVOST, FELLOWS, FOUNDATION SCHOLARS & TH...",IE;PT;CZ;NL;IT;EE,,183.0,0.1108,"science, research, scientific, conference, sci..."
2,207221,733174,IMPACT TB,SIGNED,H2020-EU.3.1.6.,SC1-PM-21-2016,H2020,IMPACT TB: Implementing proven community-based...,2017-01-01,2019-12-31,...,H2020-SC1-2016-RTD,RIA,LIVERPOOL SCHOOL OF TROPICAL MEDICINE,UK,KONINKLIJKE NEDERLANDSE CENTRALE VERENIGING TO...,NL;NP;SE;DE,,63.0,0.1312,"health, intervention, care, mental, evidence, ..."
3,207786,700512,CortIMod,CLOSED,H2020-EU.1.3.2.,MSCA-IF-2015-EF,H2020,Implementation and Preliminary Validation of a...,2016-11-01,2018-11-27,...,H2020-MSCA-IF-2015,MSCA-IF-EF-ST,UNIVERSITY COLLEGE LONDON,UK,,,,150.0,0.0991,"brain, cognitive, disorder, neural, neuroscien..."
4,198320,676144,SyDAD,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2015-ETN,H2020,Synaptic Dysfunction in Alzheimer Disease,2015-11-01,2019-10-31,...,H2020-MSCA-ITN-2015,MSCA-ITN-ETN,KAROLINSKA INSTITUTET,SE,AXON NEUROSCIENCE SE;DEUTSCHES ZENTRUM FUR NEU...,SK;DE;FR;IT;BE,,15.0,0.1909,"training, research, academic, researcher, esr,..."


In [127]:
data[15]

'Revealing Allele-level Regulation and Dynamics using Single-cell Gene Expression Analyses As diploid organisms inherit one gene copy from each parent, a gene can be expressed from both alleles (biallelic) or from only one allele (monoallelic). Although transcription from both alleles is detected for most genes in cell population experiments, little is known about allele-specific expression in single cells and its phenotypic consequences. To answer fundamental questions about allelic transcription heterogeneity in single cells, this research program will focus on single-cell transcriptome analyses with allelic-origin resolution. To this end, we will investigate both clonally stable and dynamic random monoallelic expression across a large number of cell types, including cells from embryonic and adult stages. This research program will be accomplished with the novel single-cell RNA-seq method developed within my lab to obtain quantitative, genome-wide gene expression measurement. To dist

In [136]:
# Topics whose keyword string contains 'crispr'.
# The topic table is built once and reused for both the mask and the
# selection, instead of calling get_all_topics(model) twice.
all_topics_df = get_all_topics(model)
all_topics_df[['crispr' in keywords for keywords in all_topics_df['Topic_Keywords']]]

Unnamed: 0,Topic_Id,Topic_Keywords
40,40,"gene, genome, expression, genetic, mutation, c..."


In [114]:
predict_df.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Prob,Topic_Keywords,Text
0,0,154.0,0.1402,"web, knowledge, search, translation, informati...",The INTERnAtional network on Crisis Translatio...
1,1,297.0,0.3003,"intervention, country, implementation, effecti...",IMPACT TB: Implementing proven community-based...
2,2,219.0,0.1155,"motor, movement, muscle, rehabilitation, recov...",Implementation and Preliminary Validation of a...
3,3,215.0,0.1571,"training, research, researcher, esr, skill, pr...",Synaptic Dysfunction in Alzheimer Disease Give...
4,4,11.0,0.1028,"epigenetic, chromatin, methylation, modificati...",Reversing the epigenetic state of oligodendroc...


In [139]:
set(predict_df.loc[['crispr' in i for i in predict_df['Topic_Keywords']]]['Document_No'].values)

{15,
 290,
 910,
 988,
 995,
 1008,
 1017,
 1093,
 1224,
 1392,
 1879,
 3831,
 3916,
 4515,
 5657,
 5832,
 6432,
 6514,
 6566,
 6567,
 6570,
 6574,
 6597,
 6619,
 6854,
 6891,
 7316,
 7500,
 7595,
 8493,
 9062,
 9070,
 9071,
 9076,
 9091,
 9097,
 9109,
 9358,
 9363,
 10515,
 10601,
 10669,
 10738,
 11823,
 12166,
 13358,
 15552,
 15818,
 16316,
 16343,
 16664,
 16706,
 17521,
 18288,
 18381,
 19385,
 20371,
 20627,
 22367,
 22372,
 22447,
 22461,
 22462,
 22525,
 22613,
 23236,
 23257,
 23777,
 23828,
 25245,
 25246,
 25252,
 25262,
 25276,
 25293,
 25298,
 25299,
 25323,
 25916,
 26068,
 26598,
 26651,
 26760,
 27302,
 27323,
 27379,
 28320,
 28334,
 28374,
 28377,
 28446,
 29007,
 29300,
 29307,
 29712}

In [140]:
sorted(model[filtered_dict.doc2bow(docs[162])], key=lambda x: x[1], reverse=True)

[(7, 0.08553355335533581),
 (183, 0.07398239823982422),
 (190, 0.05583058305830603),
 (36, 0.044829482948294974),
 (122, 0.03382838283828393),
 (251, 0.02502750275027511),
 (15, 0.025027502750275106),
 (220, 0.0228272827282729),
 (109, 0.021727172717271797),
 (175, 0.021177117711771252),
 (267, 0.019526952695269592),
 (81, 0.017876787678767936),
 (297, 0.017326732673267384),
 (56, 0.016776677667766835),
 (75, 0.016226622662266283),
 (153, 0.016226622662266283),
 (129, 0.014576457645764627),
 (40, 0.01017601760176021),
 (171, 0.008525852585258552),
 (159, 0.007975797579758002),
 (166, 0.007975797579758002),
 (2, 0.007975797579758),
 (254, 0.00742574257425745),
 (52, 0.006875687568756898),
 (103, 0.006875687568756898),
 (147, 0.005775577557755794),
 (198, 0.005225522552255242),
 (48, 0.005225522552255241),
 (141, 0.005225522552255241),
 (258, 0.005225522552255241),
 (208, 0.004675467546754691),
 (13, 0.004675467546754689),
 (127, 0.004675467546754689),
 (257, 0.00412541254125414),
 (11, 

In [141]:
data[162]

'Molecular epidemiology of Bacillus anthracis: novel data and techniques for local surveillance in Tanzania Anthrax is described by the World Health Organization as a disease that “perpetuates poverty by attacking not only people’s health but also their livelihoods.” In the Serengeti region of Tanzania, where income is largely based on agriculture and tourism, regular outbreaks of anthrax in both livestock and wildlife have devastating impacts. Understanding and controlling the spread of Bacillus anthracis, the bacterial agent causing anthrax, in this environment is currently impeded by a lack of data on the genetic diversity and appropriate typing schemes to resolve fine-scale genetic differences. I propose to quantify the genomic diversity of B. anthracis in the Serengeti region of Tanzania and to use these data to develop molecular diagnostic and genotyping tools that can be implemented locally to facilitate surveillance. First, whole-genome sequencing will be performed on isolates 