In [1]:
import os
import glob
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from gensim import corpora
from gensim import models
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pickle



In [3]:
file = os.getcwd() + r'\article_info.xlsx'

In [4]:
df = pd.read_excel(file, usecols=['year','title','abstract','country'])

In [5]:
df.head()

Unnamed: 0,year,title,abstract,country
0,1997,HBV genome integration and genetic instability...,The aim of this study is to clarify the existe...,Ireland
1,1997,Clinical manifestations of hepatitis C.,Hepatitis C is a common cause of viral hepatit...,United States
2,1998,Hepatobiliary malignancy.,Chronic cholestatic liver disease may be compl...,United States
3,1999,Experimental hepatic tumorigenicity by environ...,There is an evident need of low-cost vertebrat...,United States
4,1999,Development of test systems for the detection ...,"Over the past decades, strong efforts have bee...",United States


In [6]:
df = df[df['abstract'].isna()!=True]
df = df[df.year<=2020]
df = df.drop_duplicates(['abstract'])
df = df.reset_index(drop=True)

In [7]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [8]:
texts=[]
for abst in df["abstract"]:
    texts.append(abst.strip())

In [9]:
tokenizer = RegexpTokenizer(r'[A-Za-z0-9-]+')

In [10]:
tokens=[]
for text in tqdm(texts):
    token = tokenizer.tokenize(text)
    stem_token = [stemmer.stem(word) for word in token if len(word)>1]
    without_stopwords_token = [word for word in stem_token if word not in stop_words and not(word.isdigit())]
    tokens.append(without_stopwords_token)

  0%|          | 0/61891 [00:00<?, ?it/s]

In [11]:
dictionary = corpora.Dictionary(tokens)

In [12]:
corpus = [dictionary.doc2bow(doc) for doc in tokens]

In [13]:
num_topics = 50
chunksize = 2000
n_passes = 30
n_iteration = 1000

In [14]:
import time

In [15]:
start = time.time()
lda = LdaMulticore(corpus=corpus,iterations=n_iteration, id2word=dictionary, num_topics=num_topics, passes=n_passes,per_word_topics=True)
print(time.time()-start)

489.5866873264313


In [16]:
cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
coherence = cm.get_coherence()
print('Coherence', coherence)
print('Perplexity', lda.log_perplexity(corpus))

Coherence -3.6311974066912978
Perplexity -7.79262724796969


In [None]:
lda.save('lda.model')

In [None]:
with open('corpus', 'wb') as f:
    pickle.dump(corpus, f)

In [None]:
#load
lda =  models.LdaModel.load('lda.model')
with open('corpus', 'rb') as f:
    corpus = pickle.load(f)

In [17]:
#doc_topic matrix
gamma, _ = lda.inference(corpus)
doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
doc_topic_dists

array([[0.00010706, 0.00010706, 0.00010706, ..., 0.00010706, 0.00010706,
        0.00010706],
       [0.00039334, 0.00039334, 0.00039334, ..., 0.02299794, 0.00039334,
        0.00039334],
       [0.00032275, 0.00032275, 0.00032275, ..., 0.00032275, 0.00032275,
        0.0176259 ],
       ...,
       [0.0001334 , 0.0001334 , 0.0001334 , ..., 0.0001334 , 0.0001334 ,
        0.0001334 ],
       [0.00012669, 0.00012669, 0.00012669, ..., 0.00012669, 0.00012669,
        0.00012669],
       [0.00012998, 0.00012998, 0.00012998, ..., 0.00012998, 0.00012998,
        0.00012998]], dtype=float32)

In [18]:
import numpy as np

In [19]:
arr = np.array(doc_topic_dists[0])

In [20]:
arr.argsort()[-2:][::-1]

array([43, 19], dtype=int64)

In [21]:
#topic_num, word list
topic_info = []
for num, topic in lda.print_topics(num_topics=50, num_words=30):
    words = []
    for word in topic.split('+'):
        raw = word.strip()
        word = raw.split('*')[1].replace('"','')
        words.append(word)
    topic_words = ' '.join(words)
    #print(topic_words)
    topic_info.append((num, topic_words))

In [22]:
topic_df = pd.DataFrame(topic_info, columns=['topic_num', 'terms'])

In [23]:
doc_topic_matrix = np.array(doc_topic_dists)

In [24]:
main_topic=[]
sub_topic=[]
for topic_vec in doc_topic_matrix:
    top2 = topic_vec.argsort()[-2:][::-1]
    main_topic.append(top2[0])
    sub_topic.append(top2[1])

In [26]:
#num of docs per topic
num_of_docs_by_topic = []
for i in range(50):
    num_of_docs_by_topic.append(0)

for t_num in main_topic:
    num_of_docs_by_topic[t_num]+=1    

In [27]:
topic_df['num_of_docs'] = num_of_docs_by_topic

In [28]:
topic_df.head()

Unnamed: 0,topic_num,terms,num_of_docs
0,0,wa activ human cell ifn-alpha express copper c...,303
1,1,cell protein wa human bind thi hepatoma membra...,1039
2,2,gpc3 wa fa cell circ circrna hsa line show ck ...,64
3,3,bile hcc duct metastasi icc intrahepat bone in...,42
4,4,hcc treatment thi cancer therapi ha liver clin...,3701


In [29]:
#topic EDA summary
print(topic_df['num_of_docs'].max())
print(topic_df['num_of_docs'].mean())
print(topic_df['num_of_docs'].min())

6425
1237.82
8


In [30]:
topic_df.to_excel('topic_terms.xlsx', index=False)

In [31]:
out_df = pd.DataFrame(df, columns=['year','title','country'])

In [32]:
out_df['topic']=main_topic
out_df['sub_topic']=sub_topic
out_df = out_df[['topic', 'title', 'year', 'country','sub_topic']]

In [33]:
out_df.to_excel('topic_doc.xlsx', index=False)