In [1]:
from gensim import corpora
from os.path import join
from os import listdir
import src.constants as const
import pandas as pd
import seaborn as sb

# Activate seaborn's default plotting theme for the rest of the notebook.
sb.set()

# Paths of every entry found in the journals directory.
journals_dir = const.JOURNALS_DIR
files = [join(journals_dir, entry) for entry in listdir(journals_dir)]

# Load the preprocessed DataFrame produced by an earlier pipeline step.
# NOTE(review): unpickling can execute arbitrary code; only load artifacts
# generated by this project.
path = join(const.ARTIFACTS_DIR, "preprocessed.pkl")
df = pd.read_pickle(path)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ki/projects/scopus/scopus-
[nltk_data]     mining/src/../data/cache...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/ki/projects/scopus/scopus-
[nltk_data]     mining/src/../data/cache...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ki/projects/scopus/scopus-
[nltk_data]     mining/src/../data/cache...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Tokenized documents taken from the preprocessed frame.
# NOTE(review): presumably one list of string tokens per row; confirm against
# the preprocessing step that writes this column.
texts = df["dc:description:tokenized:gensim"]

# Number of rows whose tokenized description is not null.
n_docs = texts.notnull().sum()

print("Creating dict")
# Dictionary(texts) ingests every document: it assigns token ids and records
# document frequencies in a single pass.
dictionary = corpora.Dictionary(texts)

print("Creating corpus")
# Convert read-only. The dictionary has already seen all of `texts`, so
# passing allow_update=True here would register every document a second time
# and double dictionary.num_docs / dictionary.dfs.
corpus = [dictionary.doc2bow(doc) for doc in texts]


Creating dict
Creating corpus


In [3]:
# Build a fresh token <-> id mapping from `texts`; any `dictionary` object
# created earlier in the session is replaced by this one.
dictionary = corpora.Dictionary(documents=texts)

In [4]:
# Encode the documents with the very `dictionary` that the topic models
# receive as id2word, so bag-of-words ids and the id -> word mapping are
# guaranteed to come from one vocabulary. Previously a second Dictionary was
# grown on the fly here, which only matched `dictionary` because both were
# built from `texts` in the same order.
# `mydict` is kept as an alias for any code that still refers to that name.
mydict = dictionary
corpus = [dictionary.doc2bow(doc) for doc in texts]


In [5]:
from gensim import models
# Fit a TF-IDF weighting on the bag-of-words corpus using SMART notation
# "ntc": raw term frequency (n), idf document weighting (t), cosine
# normalisation (c).
tfidf = models.TfidfModel(corpus=corpus, smartirs="ntc")

In [7]:
from gensim.models import LdaModel, LdaMulticore

# Training configuration for the gensim LDA run, collected in one mapping so
# the individual knobs are easy to scan and tweak.
lda_params = dict(
    random_state=100,        # fixed seed for reproducible topics
    num_topics=7,
    passes=10,               # full sweeps over the corpus
    chunksize=1000,          # documents per training chunk
    batch=False,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True,    # model[bow] then also yields per-word topic info
)

# Train on the bag-of-words corpus; `dictionary` maps ids back to tokens.
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, **lda_params)

In [8]:
# Persist the trained gensim LDA model inside the project's models directory.
lda_gensim_path = join(const.MODELS_DIR, "lda-gensim")
lda_model.save(lda_gensim_path)

In [9]:
# Show the top words of every topic (a negative num_topics selects all topics).
lda_model.print_topics(num_topics=-1)

[(0,
  '0.061*"social" + 0.044*"work" + 0.019*"practice" + 0.017*"research" + 0.014*"article" + 0.012*"student" + 0.011*"worker" + 0.011*"education" + 0.008*"community" + 0.006*"study"'),
 (1,
  '0.029*"woman" + 0.017*"program" + 0.014*"study" + 0.010*"group" + 0.008*"social" + 0.008*"violence" + 0.007*"health" + 0.007*"hiv" + 0.007*"sexual" + 0.006*"community"'),
 (2,
  '0.028*"american" + 0.018*"scale" + 0.018*"african" + 0.016*"validity" + 0.015*"study" + 0.012*"result" + 0.011*"ebp" + 0.011*"analysis" + 0.010*"factor" + 0.010*"measure"'),
 (3,
  '0.075*"child" + 0.029*"family" + 0.023*"parent" + 0.013*"study" + 0.010*"mother" + 0.009*"welfare" + 0.009*"foster" + 0.009*"service" + 0.008*"support" + 0.007*"relationship"'),
 (4,
  '0.011*"eating" + 0.011*"et" + 0.010*"en" + 0.009*"mediation" + 0.004*"social" + 0.004*"og" + 0.004*"movie" + 0.004*"acculturative" + 0.004*"binge" + 0.003*"al"'),
 (5,
  '0.026*"social" + 0.016*"student" + 0.015*"person" + 0.013*"study" + 0.012*"work" + 0.0

In [10]:
# Inspect the first ten documents. Each result is a triple of
# (document topics, per-word topic lists, per-word phi values).
divider = "------------------------------------------------------\n"
for doc_topics, word_topics, phi_values in lda_model[corpus[:10]]:
    # Same per-word information, with word ids resolved to the actual tokens.
    named_word_topics = [(dictionary[word_id], topics) for word_id, topics in word_topics[:2]]
    named_phi_values = [(dictionary[word_id], phis) for word_id, phis in phi_values[:2]]

    print("Document Topics      : ", doc_topics)         # [(Topics, Perc Contrib)]
    print("Word id, Topics      : ", word_topics[:3])    # [(Word id, [Topics])]
    print("Phi Values (word id) : ", phi_values[:2])     # [(Word id, [(Topic, Phi Value)])]
    print("Word, Topics         : ", named_word_topics)  # [(Word, [Topics])]
    print("Phi Values (word)    : ", named_phi_values)   # [(Word, [(Topic, Phi Value)])]
    print(divider)

Document Topics      :  [(0, 0.59375644), (2, 0.13830768), (5, 0.2624757)]
Word id, Topics      :  [(0, [0, 5]), (1, [0, 5]), (2, [0, 5])]
Phi Values (word id) :  [(0, [(0, 0.9561347), (5, 0.04251078)]), (1, [(0, 0.9299628), (5, 0.062469766)])]
Word, Topics         :  [('abroad', [0, 5]), ('advocacy', [0, 5])]
Phi Values (word)    :  [('abroad', [(0, 0.9561347), (5, 0.04251078)]), ('advocacy', [(0, 0.9299628), (5, 0.062469766)])]
------------------------------------------------------

Document Topics      :  [(0, 0.98623514)]
Word id, Topics      :  [(13, [0]), (19, [0]), (20, [0])]
Phi Values (word id) :  [(13, [(0, 3.9997697)]), (19, [(0, 1.9999403)])]
Word, Topics         :  [('critical', [0]), ('education', [0])]
Phi Values (word)    :  [('critical', [(0, 3.9997697)]), ('education', [(0, 1.9999403)])]
------------------------------------------------------

Document Topics      :  [(0, 0.9887806)]
Word id, Topics      :  [(22, [0]), (36, [0]), (39, [0])]
Phi Values (word id) :  [(22

In [22]:
from gensim.models.wrappers import LdaMallet

import os

# Location of the MALLET launcher script. Set the MALLET_PATH environment
# variable to point at your own installation; the fallback is the path that
# was previously hard-coded here.
mallet_path = os.environ.get("MALLET_PATH", "/home/ki/crypt/git/Mallet/bin/mallet")

# Train a 10-topic LDA model through the external MALLET toolkit on the same
# corpus/dictionary pair used for the gensim model.
# NOTE(review): gensim.models.wrappers was removed in gensim 4.0, so this cell
# needs gensim < 4.
lda_mallet = LdaMallet(mallet_path,
                       corpus=corpus,
                       num_topics=10,
                       alpha=50,
                       id2word=dictionary,
                       workers=10,
                       prefix=None,
                       optimize_interval=0,
                       iterations=1000,
                       topic_threshold=0.0,
                       random_seed=0)

In [23]:
# Show the top words of every MALLET topic (a negative num_topics selects all).
lda_mallet.print_topics(num_topics=-1)

[(0,
  '0.056*"research" + 0.049*"practice" + 0.033*"article" + 0.025*"approach" + 0.018*"theory" + 0.017*"process" + 0.016*"literature" + 0.015*"paper" + 0.013*"framework" + 0.011*"context"'),
 (1,
  '0.131*"child" + 0.087*"family" + 0.036*"parent" + 0.020*"person" + 0.017*"paper" + 0.017*"young" + 0.017*"foster" + 0.015*"system" + 0.014*"home" + 0.011*"protection"'),
 (2,
  '0.078*"health" + 0.031*"mental" + 0.031*"intervention" + 0.031*"treatment" + 0.029*"problem" + 0.024*"patient" + 0.021*"group" + 0.014*"stress" + 0.012*"clinical" + 0.011*"depression"'),
 (3,
  '0.179*"social" + 0.115*"worker" + 0.037*"professional" + 0.034*"association" + 0.033*"client" + 0.029*"national" + 0.024*"author" + 0.021*"role" + 0.018*"practice" + 0.018*"case"'),
 (4,
  '0.046*"experience" + 0.039*"support" + 0.037*"study" + 0.034*"relationship" + 0.026*"life" + 0.024*"finding" + 0.022*"interview" + 0.020*"qualitative" + 0.018*"mother" + 0.016*"participant"'),
 (5,
  '0.046*"program" + 0.038*"policy" +

In [24]:
import gensim

# Convert the MALLET wrapper into a native gensim LDA model so it can be
# queried with the usual model[bow] interface.
lda_model2 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

# Save the converted MALLET model. This used to call lda_model.save(...),
# which wrote the gensim-trained model under the MALLET file name.
lda_model2.save(join(const.MODELS_DIR, "lda-gensim-mallet"))

In [25]:
# Topic distribution of the first ten documents under the converted MALLET
# model, one document per line.
first_docs = corpus[:10]
for doc_topics in lda_model2[first_docs]:
    print(doc_topics)

[(0, 0.13504989613553214), (1, 0.05273579086061982), (2, 0.04626499125563798), (3, 0.07002296223732334), (4, 0.11703616593060984), (5, 0.07963551879947041), (6, 0.2166234454237747), (7, 0.045369252747541354), (8, 0.057581127639727586), (9, 0.17968084896976277)]
[(0, 0.23491948383744485), (1, 0.05747868739211423), (2, 0.0755166374983547), (3, 0.07819270085450113), (4, 0.07164600496158845), (5, 0.05875479305581082), (6, 0.17635485497610195), (7, 0.07216523987637131), (8, 0.11313611617230078), (9, 0.061835481375411804)]
[(0, 0.13031788910531275), (1, 0.053377475822071634), (2, 0.05139474155163718), (3, 0.06180866977125226), (4, 0.05756610098754765), (5, 0.08972946883739112), (6, 0.2766096046651034), (7, 0.06251548519038391), (8, 0.15164258599707378), (9, 0.06503797807222636)]
[(0, 0.12385509242088258), (1, 0.06790830131325393), (2, 0.10084338634870542), (3, 0.10142709683398934), (4, 0.0800534049714545), (5, 0.04823404653877157), (6, 0.23108691856713204), (7, 0.11602963383844402), (8, 0.08