# Topic Model For Abstracts

## Prepare data for LDA Analysis

In [1]:
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from pprint import pprint
#nltk.download('stopwords')



In [2]:
papers = pd.read_csv('./Dataset_00_02_10_cleaned.csv')
papers.head(5)

Unnamed: 0,Title,Year,Cited by,Affiliations,Abstract,Author Keywords,Index Keywords,paper_Title_processed,paper_Abstract_processed
0,Connectionism and cognitive architecture: A cr...,1988,1606.0,"CUNY Graduate Center, United States; Universit...",This paper explores differences between Connec...,,article; brain; cognition; human; linguistics;...,connectionism and cognitive architecture a cri...,this paper explores differences between connec...
1,Implicit Learning and Tacit Knowledge,1989,1213.0,"Brooklyn College, the Graduate Center, City Un...","I examine the phenomenon of implicit learning,...",,,implicit learning and tacit knowledge,i examine the phenomenon of implicit learning ...
2,Stable Adaptive Teleoperation,1991,1091.0,"Nonlinear Systems Laboratory, Massachusetts In...","Telerobotics, the body of science and technolo...",,"Control Systems, Cascade; Control Systems, Del...",stable adaptive teleoperation,telerobotics the body of science and technolog...
3,Hippocampal synaptic enhancement and informati...,1987,885.0,"Department of Psychology, University of Colora...",The hypothesis that the physical substrate of ...,,animal experiment; central nervous system; ele...,hippocampal synaptic enhancement and informati...,the hypothesis that the physical substrate of ...
4,Catastrophic Interference in Connectionist Net...,1989,803.0,,Connectionist networks in which information is...,,,catastrophic interference in connectionist net...,connectionist networks in which information is...


In [4]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

## TF Model

In [5]:
data = papers.paper_Abstract_processed.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# Create Dictionary
id2word_Abstract = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
bow_corpus_Abstract = [id2word_Abstract.doc2bow(text) for text in texts]
# View
print(bow_corpus_Abstract[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 4), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 2), (15, 5), (16, 1), (17, 2), (18, 1), (19, 1), (20, 2), (21, 3), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 2), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 2), (68, 2), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1)]]


## TF-IDF Model

In [8]:
tfidf_Abstract = gensim.models.TfidfModel(bow_corpus_Abstract)
corpus_tfidf_Abstract = tfidf_Abstract[bow_corpus_Abstract]

for doc in corpus_tfidf_Abstract:
    pprint(doc)
    break

[(0, 0.12660560821440567),
 (1, 0.05255603261545527),
 (2, 0.0518608998566745),
 (3, 0.08699505543653467),
 (4, 0.20551218888919873),
 (5, 0.09371985568182208),
 (6, 0.2930554951630226),
 (7, 0.09301276614304653),
 (8, 0.02942684865226227),
 (9, 0.047957380752038405),
 (10, 0.08627358168811329),
 (11, 0.05646937565718306),
 (12, 0.0664118650166388),
 (13, 0.1461990751010211),
 (14, 0.14752927319492035),
 (15, 0.01494864392819303),
 (16, 0.09259952136521116),
 (17, 0.2614614765505698),
 (18, 0.11768299383990025),
 (19, 0.07633466094396915),
 (20, 0.20498603493187753),
 (21, 0.2715649031735344),
 (22, 0.06876953132046756),
 (23, 0.096809537530249),
 (24, 0.06547626655936532),
 (25, 0.07784835306027992),
 (26, 0.3173119014356377),
 (27, 0.10042523323947776),
 (28, 0.07665468858914265),
 (29, 0.07107119992915416),
 (30, 0.10022908957118584),
 (31, 0.09818301002519965),
 (32, 0.05975825139416869),
 (33, 0.1335471957468254),
 (34, 0.08214166262855715),
 (35, 0.10143402101727657),
 (36, 0.067

## Running LDA using Bag of Words

In [9]:
num_topics = 10
# Build LDA model
lda_model_Abstract_bow = gensim.models.LdaMulticore(corpus=bow_corpus_Abstract,
                                       id2word=id2word_Abstract,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model_Abstract_bow.print_topics())
doc_lda_Abstract_bow = lda_model_Abstract_bow[bow_corpus_Abstract]

[(0,
  '0.027*"cognitive" + 0.017*"science" + 0.011*"language" + 0.008*"metaphor" + '
  '0.007*"human" + 0.006*"theory" + 0.005*"cognition" + 0.005*"paper" + '
  '0.005*"research" + 0.004*"processing"'),
 (1,
  '0.021*"cognitive" + 0.012*"science" + 0.007*"research" + 0.005*"sciences" + '
  '0.005*"human" + 0.005*"new" + 0.004*"brain" + 0.004*"cognition" + '
  '0.004*"social" + 0.004*"knowledge"'),
 (2,
  '0.023*"learning" + 0.011*"cognitive" + 0.007*"science" + 0.007*"model" + '
  '0.007*"based" + 0.006*"data" + 0.005*"results" + 0.004*"study" + '
  '0.004*"training" + 0.004*"research"'),
 (3,
  '0.010*"brain" + 0.010*"emotions" + 0.009*"cognitive" + 0.009*"emotion" + '
  '0.008*"memory" + 0.006*"emotional" + 0.005*"science" + 0.005*"based" + '
  '0.005*"recognition" + 0.004*"different"'),
 (4,
  '0.015*"cognitive" + 0.007*"study" + 0.007*"science" + 0.006*"research" + '
  '0.006*"children" + 0.005*"studies" + 0.005*"results" + 0.005*"language" + '
  '0.005*"self" + 0.003*"two"'),
 (5

## Running LDA using TF-IDF

In [10]:
num_topics = 10
# Build LDA model
lda_model_Abstract_tfidf = gensim.models.LdaMulticore(corpus=corpus_tfidf_Abstract,
                                       id2word=id2word_Abstract,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model_Abstract_tfidf.print_topics())
doc_lda_Abstract_tfidf = lda_model_Abstract_tfidf[corpus_tfidf_Abstract]

[(0,
  '0.002*"self" + 0.002*"disorders" + 0.002*"symptoms" + 0.001*"sciences" + '
  '0.001*"research" + 0.001*"article" + 0.001*"philosophy" + '
  '0.001*"psychiatry" + 0.001*"th" + 0.001*"mind"'),
 (1,
  '0.002*"universidad" + 0.002*"od" + 0.001*"elderly" + 0.001*"abduction" + '
  '0.001*"synaesthesia" + 0.001*"blind" + 0.001*"learning" + '
  '0.001*"identifications" + 0.001*"trees" + 0.001*"analytics"'),
 (2,
  '0.012*"religious" + 0.012*"religion" + 0.009*"performativity" + '
  '0.005*"ritual" + 0.004*"god" + 0.004*"religions" + 0.004*"supernatural" + '
  '0.003*"gods" + 0.003*"theology" + 0.003*"belief"'),
 (3,
  '0.139*"abstract" + 0.134*"available" + 0.002*"teleology" + 0.002*"poetry" + '
  '0.002*"anne" + 0.002*"citation" + 0.002*"fertile" + 0.001*"verse" + '
  '0.001*"revista" + 0.001*"er"'),
 (4,
  '0.002*"synaesthetic" + 0.002*"bci" + 0.002*"workshop" + 0.002*"criminal" + '
  '0.002*"legal" + 0.002*"ui" + 0.002*"supernatural" + 0.001*"governance" + '
  '0.001*"forensic" + 0.

## Analyzing LDA model results

### 1. Bag of Words Model

In [11]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [13]:
lda_viz_Abstract_bow = gensimvis.prepare(lda_model_Abstract_bow, bow_corpus_Abstract, id2word_Abstract)
pyLDAvis.save_html(lda_viz_Abstract_bow, './web/bow_Abstract_'+ str(num_topics) +'.html')
lda_viz_Abstract_bow

  and should_run_async(code)


### 2. TF-IDF Model

In [14]:
lda_viz_Abstract_tfidf = gensimvis.prepare(lda_model_Abstract_tfidf, corpus_tfidf_Abstract, id2word_Abstract)
pyLDAvis.save_html(lda_viz_Abstract_tfidf, './web/tfidf_Abstract_'+ str(num_topics) +'.html')
lda_viz_Abstract_tfidf

  and should_run_async(code)
