Sample LDA model in pyspark. Clusters data into topics and maps the words associated with each topic.

In [2]:
dbutils.library.installPyPI('nltk', '3.4.3')
dbutils.library.installPyPI('xlrd')

In [3]:
import nltk
import re
from operator import attrgetter
from string import punctuation
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.types import StringType, ArrayType, FloatType, IntegerType
from pyspark.sql.functions import array, concat, concat_ws, split, lit, col, array_max, array_position
from pyspark.ml.clustering import LDA

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [4]:
# Text Parser {

def strip_digits(strval):
    return re.sub(r'\d+','',strval) 

def strip_punctuation(strval):
    punc_free = ''.join(' ' if char in punctuation else char
                        for char in strval)
    return punc_free
  
def tokenize_text(text):
    '''Tokenizes strings, removes stopwords and stems.'''
    
    digit_free = strip_digits(text)
    punc_free = strip_punctuation(digit_free)
    tokens = re.split('\W+', punc_free)
    result = list(
        filter(
            None, 
            [ps.stem(word).lower() for word in tokens if word not in stopwords]
        )
    )
    return result

# } Topic Extractor {

def indices_to_terms(vocabulary):
    def indices_to_terms(index, count):
        return [vocabulary[int(x)] for x in index][:count]
    return udf(indices_to_terms, ArrayType(StringType()))

# Create udf functions
StringArrayType = ArrayType(StringType(), False)
tokenize_udf = udf(tokenize_text, StringArrayType)
to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
array_index = udf(lambda x,y: [i for i, e in enumerate(x) if e==y ][0])

In [5]:
df = pd.read_excel('http://help.incites.clarivate.com/incitesLiveESI/10678-TRS/version/default/part/AttachmentData/data/ESIMasterJournalList-122018.xlsx')

df = spark.createDataFrame(df)

In [6]:
display(df)

Full title,Title29,Title20,ISSN,EISSN,Category name
2D Materials,2D MATER,2D MATER,2053-1583,2053-1583,MATERIALS SCIENCE
3 Biotech,3 BIOTECH,3 BIOTECH,2190-572X,2190-5738,BIOLOGY & BIOCHEMISTRY
3D Printing and Additive Manufacturing,3D PRINT ADDIT MANUF,3D PRINT ADDIT MANUF,2329-7662,2329-7670,ENGINEERING
4OR-A Quarterly Journal of Operations Research,4OR Q J OPER RES,4OR-Q J OPER RES,1619-4500,1614-2411,ENGINEERING
AAPG BULLETIN,AAPG BULL,AAPG BULL,0149-1423,1558-9153,GEOSCIENCES
AAPS Journal,AAPS J,AAPS J,1550-7416,1550-7416,PHARMACOLOGY & TOXICOLOGY
AAPS PHARMSCITECH,AAPS PHARMSCITECH,AAPS PHARMSCITECH,1530-9932,1530-9932,PHARMACOLOGY & TOXICOLOGY
AATCC Journal of Research,AATCC J RES,AATCC J RES,2330-5517,2330-5517,MATERIALS SCIENCE
AATCC REVIEW,AATCC REV,AATCC REV,1532-8813,1532-8813,MATERIALS SCIENCE
Abacus-A Journal of Accounting Finance and Business Studies,ABACUS,ABACUS,0001-3072,1467-6281,ECONOMICS & BUSINESS


In [7]:
# Tokenize Text
df_token = df.select('Full title').withColumn("text_data", tokenize_udf('Full title'))

# Make tf-idf vectorizer
cv = CountVectorizer(
    inputCol="text_data",
    outputCol="count_vec", minDF=1
)
# transform the data
cvModel = cv.fit(df_token)
featurizedData = cvModel.transform(df_token)
idf = IDF(inputCol="count_vec", outputCol="features")

idfModel = idf.fit(featurizedData)
df_feature = idfModel.transform(featurizedData)

In [8]:
display(df_feature)

Full title,text_data,count_vec,features
2D Materials,"List(d, materi)","List(0, 5046, List(30, 237), List(1.0, 1.0))","List(0, 5046, List(30, 237), List(4.157566427474925, 6.072385989460208))"
3 Biotech,List(biotech),"List(0, 5046, List(3424), List(1.0))","List(0, 5046, List(3424), List(8.711443319075466))"
3D Printing and Additive Manufacturing,"List(d, print, addit, manufactur)","List(0, 5046, List(229, 237, 898, 4937), List(1.0, 1.0, 1.0, 1.0))","List(0, 5046, List(229, 237, 898, 4937), List(6.037294669648937, 6.072385989460208, 7.6128310304073565, 8.711443319075466))"
4OR-A Quarterly Journal of Operations Research,"List(or, a, quarterli, journal, oper, research)","List(0, 5046, List(0, 4, 48, 72, 189, 1847), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 5046, List(0, 4, 48, 72, 189, 1847), List(1.095159757495081, 2.7440153497957254, 4.642416564837655, 4.927253685157205, 5.8782299750192495, 8.305978210967302))"
AAPG BULLETIN,"List(aapg, bulletin)","List(0, 5046, List(71, 3353), List(1.0, 1.0))","List(0, 5046, List(71, 3353), List(4.915954129903271, 8.711443319075466))"
AAPS Journal,"List(aap, journal)","List(0, 5046, List(0, 1659), List(1.0, 1.0))","List(0, 5046, List(0, 1659), List(1.095159757495081, 8.305978210967302))"
AAPS PHARMSCITECH,"List(aap, pharmscitech)","List(0, 5046, List(1659, 2473), List(1.0, 1.0))","List(0, 5046, List(1659, 2473), List(8.305978210967302, 8.711443319075466))"
AATCC Journal of Research,"List(aatcc, journal, research)","List(0, 5046, List(0, 4, 1853), List(1.0, 1.0, 1.0))","List(0, 5046, List(0, 4, 1853), List(1.095159757495081, 2.7440153497957254, 8.305978210967302))"
AATCC REVIEW,"List(aatcc, review)","List(0, 5046, List(6, 1853), List(1.0, 1.0))","List(0, 5046, List(6, 1853), List(3.1111712207889295, 8.305978210967302))"
Abacus-A Journal of Accounting Finance and Business Studies,"List(abacu, a, journal, account, financ, busi, studi)","List(0, 5046, List(0, 20, 48, 128, 183, 203, 4097), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 5046, List(0, 20, 48, 128, 183, 203, 4097), List(1.095159757495081, 3.9323198259639365, 4.642416564837655, 5.472764866911086, 5.849242438145998, 5.970603295150265, 8.711443319075466))"


In [9]:
# Run the LDA model
lda = LDA(k=20, seed=1)
model = lda.fit(df_feature)

In [10]:
display(model.describeTopics(maxTermsPerTopic = 15))

topic,termIndices,termWeights
0,"List(13, 11, 2, 55, 36, 1, 0, 3, 74, 62, 103, 125, 132, 136, 4)","List(0.04199515518871151, 0.03587305980119948, 0.026596488205993682, 0.02264415666709514, 0.02249210246321813, 0.02193633448789811, 0.021143673840055853, 0.021030985349548056, 0.018816845307239663, 0.016591386727868022, 0.016042353567741814, 0.014178564901418427, 0.012888494542381682, 0.012479306324474347, 0.011765113294567807)"
1,"List(32, 192, 147, 160, 182, 241, 222, 271, 393, 392, 388, 489, 550, 610, 676)","List(0.019151421470002622, 0.01793097442136135, 0.01644002574953858, 0.014256205086896914, 0.012647504692184348, 0.011030645847428794, 0.010630435579199364, 0.009133408147185372, 0.00888783886712325, 0.008703753573913306, 0.007873110079985864, 0.007504881128669551, 0.0070561402167606604, 0.0065435061700046485, 0.006211264628168314)"
2,"List(89, 217, 283, 491, 419, 415, 479, 756, 495, 104, 1089, 334, 542, 0, 552)","List(0.029054654776641712, 0.01519663300967134, 0.009914790236659142, 0.00760470439826166, 0.0068783379459830965, 0.006493394545925606, 0.0064473187256148794, 0.0063642568068182906, 0.0060992635921508324, 0.005973132583118618, 0.0057645877559334745, 0.005369137693675509, 0.005335210357379694, 0.005157551247149432, 0.00515581142335343)"
3,"List(21, 60, 86, 107, 116, 122, 223, 1, 0, 138, 137, 157, 141, 129, 11)","List(0.03076579620847363, 0.023933443725850514, 0.020957899872071962, 0.01835686741469729, 0.01682401039038436, 0.01624385839490229, 0.015103819765574122, 0.014385292268656364, 0.01424548368417497, 0.014116985576675706, 0.014062617031470936, 0.01377737459160712, 0.012586359039586346, 0.0125105517625099, 0.012463767648609443)"
4,"List(45, 33, 79, 12, 180, 309, 205, 370, 242, 143, 239, 364, 252, 288, 44)","List(0.03475608067486217, 0.03254496729074966, 0.027852887478382268, 0.020322841872032173, 0.015254575706995226, 0.012919167926142948, 0.012028650244245006, 0.010819778441601648, 0.010742768646319092, 0.010727376141627217, 0.010436460514748022, 0.010229821739319104, 0.010217033845953705, 0.010137866362548686, 0.009392458811873368)"
5,"List(3, 6, 67, 65, 1, 2, 88, 0, 82, 39, 59, 121, 77, 90, 133)","List(0.05122744103490538, 0.04486432212146689, 0.01979097483154055, 0.018497047236765457, 0.01765961657065496, 0.017369798856529294, 0.017181491573584003, 0.01662401863401567, 0.016584162367381045, 0.014661390767049021, 0.014142149090156349, 0.013620401764160362, 0.013480535107805204, 0.013464974585353605, 0.012553105378119107)"
6,"List(24, 78, 148, 206, 276, 269, 369, 256, 232, 371, 312, 478, 810, 431, 416)","List(0.06356915401211553, 0.024584334454239208, 0.016597082628029974, 0.014685794512476582, 0.012378351017294611, 0.012374594507693789, 0.01231432956754058, 0.01180030869513492, 0.0102484324065431, 0.007977377031226588, 0.007736704165296041, 0.0076837005965182175, 0.007020638822929286, 0.006953712233083486, 0.006602606589440667)"
7,"List(19, 50, 145, 128, 152, 207, 228, 374, 23, 20, 285, 245, 439, 11, 490)","List(0.043280471691362406, 0.0326951321511889, 0.017478925582491048, 0.017197769862073754, 0.01483620944737719, 0.014413202934803856, 0.012594971131813988, 0.007816699658218452, 0.007568439357987127, 0.007520220677285692, 0.007496501015049976, 0.007385855763928701, 0.006750721892296024, 0.006443276775281885, 0.006369484217773779)"
8,"List(40, 51, 99, 110, 58, 96, 114, 209, 272, 265, 306, 202, 8, 0, 224)","List(0.03922442922625601, 0.02570451532644958, 0.02134661252143935, 0.02128289551044766, 0.0208048484986125, 0.016357387189123745, 0.014966233539403595, 0.013536389002676886, 0.013168124775741035, 0.012230243550369701, 0.011948781223482977, 0.011943035321514086, 0.011867469016837714, 0.010188220316566795, 0.008528173448444452)"
9,"List(46, 70, 118, 108, 186, 19, 310, 131, 200, 188, 456, 204, 353, 281, 339)","List(0.031026620958589938, 0.02958996713153813, 0.021807184293535016, 0.02050369316512996, 0.01668402317414741, 0.012093868914570873, 0.011126379473081746, 0.010553746748538586, 0.010530867554175153, 0.010401206481135492, 0.009978954794083898, 0.009277857260327203, 0.00905224577233498, 0.008711271917469229, 0.00732301145989523)"


In [11]:
# Map the term indices to the topic vocabulary (numbers to words).
df_terms = model.describeTopics(maxTermsPerTopic = 15).withColumn(
    "topic_words", indices_to_terms(cvModel.vocabulary)("termIndices", lit(5)))

In [12]:
display(df_terms)

topic,termIndices,termWeights,topic_words
0,"List(13, 11, 2, 55, 36, 1, 0, 3, 74, 62, 103, 125, 132, 136, 4)","List(0.04199515518871151, 0.03587305980119948, 0.026596488205993682, 0.02264415666709514, 0.02249210246321813, 0.02193633448789811, 0.021143673840055853, 0.021030985349548056, 0.018816845307239663, 0.016591386727868022, 0.016042353567741814, 0.014178564901418427, 0.012888494542381682, 0.012479306324474347, 0.011765113294567807)","List(technolog, clinic, and, cancer, environment)"
1,"List(32, 192, 147, 160, 182, 241, 222, 271, 393, 392, 388, 489, 550, 610, 676)","List(0.019151421470002622, 0.01793097442136135, 0.01644002574953858, 0.014256205086896914, 0.012647504692184348, 0.011030645847428794, 0.010630435579199364, 0.009133408147185372, 0.00888783886712325, 0.008703753573913306, 0.007873110079985864, 0.007504881128669551, 0.0070561402167606604, 0.0065435061700046485, 0.006211264628168314)","List(acta, pharmaceut, design, famili, e)"
2,"List(89, 217, 283, 491, 419, 415, 479, 756, 495, 104, 1089, 334, 542, 0, 552)","List(0.029054654776641712, 0.01519663300967134, 0.009914790236659142, 0.00760470439826166, 0.0068783379459830965, 0.006493394545925606, 0.0064473187256148794, 0.0063642568068182906, 0.0060992635921508324, 0.005973132583118618, 0.0057645877559334745, 0.005369137693675509, 0.005335210357379694, 0.005157551247149432, 0.00515581142335343)","List(anim, data, orthopaed, methodolog, decis)"
3,"List(21, 60, 86, 107, 116, 122, 223, 1, 0, 138, 137, 157, 141, 129, 11)","List(0.03076579620847363, 0.023933443725850514, 0.020957899872071962, 0.01835686741469729, 0.01682401039038436, 0.01624385839490229, 0.015103819765574122, 0.014385292268656364, 0.01424548368417497, 0.014116985576675706, 0.014062617031470936, 0.01377737459160712, 0.012586359039586346, 0.0125105517625099, 0.012463767648609443)","List(american, practic, psychiatri, fur, industri)"
4,"List(45, 33, 79, 12, 180, 309, 205, 370, 242, 143, 239, 364, 252, 288, 44)","List(0.03475608067486217, 0.03254496729074966, 0.027852887478382268, 0.020322841872032173, 0.015254575706995226, 0.012919167926142948, 0.012028650244245006, 0.010819778441601648, 0.010742768646319092, 0.010727376141627217, 0.010436460514748022, 0.010229821739319104, 0.010217033845953705, 0.010137866362548686, 0.009392458811873368)","List(annal, advanc, archiv, health, seminar)"
5,"List(3, 6, 67, 65, 1, 2, 88, 0, 82, 39, 59, 121, 77, 90, 133)","List(0.05122744103490538, 0.04486432212146689, 0.01979097483154055, 0.018497047236765457, 0.01765961657065496, 0.017369798856529294, 0.017181491573584003, 0.01662401863401567, 0.016584162367381045, 0.014661390767049021, 0.014142149090156349, 0.013620401764160362, 0.013480535107805204, 0.013464974585353605, 0.012553105378119107)","List(scienc, review, nurs, plant, of)"
6,"List(24, 78, 148, 206, 276, 269, 369, 256, 232, 371, 312, 478, 810, 431, 416)","List(0.06356915401211553, 0.024584334454239208, 0.016597082628029974, 0.014685794512476582, 0.012378351017294611, 0.012374594507693789, 0.01231432956754058, 0.01180030869513492, 0.0102484324065431, 0.007977377031226588, 0.007736704165296041, 0.0076837005965182175, 0.007020638822929286, 0.006953712233083486, 0.006602606589440667)","List(mathemat, oncolog, network, radiolog, intervent)"
7,"List(19, 50, 145, 128, 152, 207, 228, 374, 23, 20, 285, 245, 439, 11, 490)","List(0.043280471691362406, 0.0326951321511889, 0.017478925582491048, 0.017197769862073754, 0.01483620944737719, 0.014413202934803856, 0.012594971131813988, 0.007816699658218452, 0.007568439357987127, 0.007520220677285692, 0.007496501015049976, 0.007385855763928701, 0.006750721892296024, 0.006443276775281885, 0.006369484217773779)","List(de, revista, water, busi, therapeut)"
8,"List(40, 51, 99, 110, 58, 96, 114, 209, 272, 265, 306, 202, 8, 0, 224)","List(0.03922442922625601, 0.02570451532644958, 0.02134661252143935, 0.02128289551044766, 0.0208048484986125, 0.016357387189123745, 0.014966233539403595, 0.013536389002676886, 0.013168124775741035, 0.012230243550369701, 0.011948781223482977, 0.011943035321514086, 0.011867469016837714, 0.010188220316566795, 0.008528173448444452)","List(current, analysi, statist, opinion, theori)"
9,"List(46, 70, 118, 108, 186, 19, 310, 131, 200, 188, 456, 204, 353, 281, 339)","List(0.031026620958589938, 0.02958996713153813, 0.021807184293535016, 0.02050369316512996, 0.01668402317414741, 0.012093868914570873, 0.011126379473081746, 0.010553746748538586, 0.010530867554175153, 0.010401206481135492, 0.009978954794083898, 0.009277857260327203, 0.00905224577233498, 0.008711271917469229, 0.00732301145989523)","List(polici, natur, asian, control, age)"


In [13]:
# Map the topic number to the highest probability 
df_model = model.transform(df_feature).withColumn('test', to_array('topicDistribution')).withColumn('topic', array_index('test', array_max('test')))

In [14]:
display(df_model)

Full title,text_data,count_vec,features,topicDistribution,test,topic
2D Materials,"List(d, materi)","List(0, 5046, List(30, 237), List(1.0, 1.0))","List(0, 5046, List(30, 237), List(4.157566427474925, 6.072385989460208))","List(1, 20, List(), List(0.004421178495617607, 0.0041673156603260555, 0.004160108827052079, 0.004250557213933016, 0.004238986746909256, 0.004473874688615361, 0.004202424217623583, 0.0041983547246676155, 0.0042551825421195295, 0.004228454449771209, 0.00418218615297017, 0.00416188740018979, 0.004303297922962175, 0.3781104377125359, 0.004184632827579281, 0.004406536521742765, 0.004160932427570098, 0.004335598039646946, 0.0045506637803233285, 0.5450073896478441))","List(0.0044211787, 0.0041673156, 0.004160109, 0.004250557, 0.004238987, 0.004473875, 0.004202424, 0.0041983547, 0.0042551826, 0.0042284545, 0.004182186, 0.0041618873, 0.004303298, 0.37811044, 0.004184633, 0.0044065365, 0.0041609323, 0.004335598, 0.0045506638, 0.5450074)",19
3 Biotech,List(biotech),"List(0, 5046, List(3424), List(1.0))","List(0, 5046, List(3424), List(8.711443319075466))","List(1, 20, List(), List(0.005115224477905173, 0.00482150969925339, 0.004813171523513931, 0.0049178186840216406, 0.00490443186609584, 0.005176193041790304, 0.00486212968181345, 0.004857421345141524, 0.004923170121857612, 0.004892246195214701, 0.0048387146056932, 0.905396457856049, 0.0049788387616146795, 0.0052799868571558415, 0.004841545367725808, 0.00509828398931448, 0.00481412446135931, 0.005016209426022949, 0.005265036597022203, 0.00518748544143483))","List(0.0051152245, 0.0048215096, 0.0048131715, 0.0049178186, 0.0049044318, 0.005176193, 0.0048621297, 0.0048574214, 0.00492317, 0.0048922463, 0.0048387144, 0.90539646, 0.004978839, 0.0052799867, 0.0048415456, 0.0050982838, 0.0048141247, 0.0050162096, 0.0052650366, 0.0051874854)",11
3D Printing and Additive Manufacturing,"List(d, print, addit, manufactur)","List(0, 5046, List(229, 237, 898, 4937), List(1.0, 1.0, 1.0, 1.0))","List(0, 5046, List(229, 237, 898, 4937), List(6.037294669648937, 6.072385989460208, 7.6128310304073565, 8.711443319075466))","List(1, 20, List(), List(0.001683251099119389, 0.001586599294858081, 0.0015838554768184516, 0.0016182913963528126, 0.001613886235770176, 0.001703313837168443, 0.0015999659840316696, 0.001598416628193996, 0.001620052370429963, 0.0016098763319934094, 0.0015922608563428268, 0.0015845326231799272, 0.0016383710780745644, 0.0017374689389651002, 0.001593192365447714, 0.0016776765451843021, 0.001584169041794562, 0.0016506685250140722, 0.0017325493096761967, 0.9689916020615843))","List(0.0016832511, 0.0015865993, 0.0015838555, 0.0016182914, 0.0016138862, 0.0017033138, 0.001599966, 0.0015984166, 0.0016200524, 0.0016098763, 0.0015922609, 0.0015845327, 0.001638371, 0.001737469, 0.0015931923, 0.0016776766, 0.001584169, 0.0016506686, 0.0017325493, 0.9689916)",19
4OR-A Quarterly Journal of Operations Research,"List(or, a, quarterli, journal, oper, research)","List(0, 5046, List(0, 4, 48, 72, 189, 1847), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 5046, List(0, 4, 48, 72, 189, 1847), List(1.095159757495081, 2.7440153497957254, 4.642416564837655, 4.927253685157205, 5.8782299750192495, 8.305978210967302))","List(1, 20, List(), List(0.0017328207422928732, 0.2907921295011388, 0.0016304980370044062, 0.0016659480518534317, 0.0016614131650806345, 0.001753474301406401, 0.0016470829784116985, 0.0016454879961665147, 0.001667760885114486, 0.001657285173668436, 0.001639150944660376, 0.0016311951242873055, 0.0016866190545488803, 0.24792700933540154, 0.0016401098853493488, 0.0017270820216873245, 0.0016308208361358856, 0.0016992786488743575, 0.24758040591232847, 0.1869844274045889))","List(0.0017328208, 0.29079214, 0.001630498, 0.0016659481, 0.0016614131, 0.0017534744, 0.001647083, 0.001645488, 0.0016677609, 0.0016572851, 0.0016391509, 0.0016311951, 0.001686619, 0.24792701, 0.0016401099, 0.0017270821, 0.0016308208, 0.0016992786, 0.24758041, 0.18698443)",1
AAPG BULLETIN,"List(aapg, bulletin)","List(0, 5046, List(71, 3353), List(1.0, 1.0))","List(0, 5046, List(71, 3353), List(4.915954129903271, 8.711443319075466))","List(1, 20, List(), List(0.0033915932046175447, 0.00319684887692307, 0.003191320340926622, 0.003260705491419253, 0.0032518295058633056, 0.9375134968242073, 0.0032237815032379598, 0.0032206596961843557, 0.0032642536891834225, 0.003243749924823062, 0.0032082563921335054, 0.003192684727289814, 0.003301164168601891, 0.0035008370712338844, 0.003210133297215971, 0.003380361000177002, 0.003191952145169448, 0.003325942370327451, 0.0034909247283496014, 0.0034395050421156368))","List(0.0033915932, 0.0031968488, 0.0031913202, 0.0032607054, 0.0032518294, 0.9375135, 0.0032237815, 0.0032206597, 0.0032642537, 0.00324375, 0.0032082563, 0.0031926846, 0.0033011641, 0.0035008371, 0.0032101334, 0.003380361, 0.0031919521, 0.0033259424, 0.0034909248, 0.0034395051)",5
AAPS Journal,"List(aap, journal)","List(0, 5046, List(0, 1659), List(1.0, 1.0))","List(0, 5046, List(0, 1659), List(1.095159757495081, 8.305978210967302))","List(1, 20, List(), List(0.004774782401108591, 0.004500615709364067, 0.004492832477576242, 0.00459051489324932, 0.004578019018153833, 0.004831693225005389, 0.004538532235205883, 0.004534137265415102, 0.004595510145950791, 0.004566644352853636, 0.9117146877101703, 0.004494753299344537, 0.004647473778019098, 0.004928579092397708, 0.004519317897536702, 0.004758969382523761, 0.004493721949772583, 0.004682357247557414, 0.00491462384491474, 0.0048422340738801944))","List(0.0047747823, 0.004500616, 0.0044928323, 0.004590515, 0.004578019, 0.004831693, 0.0045385323, 0.0045341374, 0.00459551, 0.0045666443, 0.9117147, 0.004494753, 0.004647474, 0.004928579, 0.0045193178, 0.0047589694, 0.0044937218, 0.0046823574, 0.0049146237, 0.004842234)",10
AAPS PHARMSCITECH,"List(aap, pharmscitech)","List(0, 5046, List(1659, 2473), List(1.0, 1.0))","List(0, 5046, List(1659, 2473), List(8.305978210967302, 8.711443319075466))","List(1, 20, List(), List(0.0027520965492298945, 0.002594071937647191, 0.0025895858814866586, 0.002645888170919189, 0.002638685785255934, 0.002784898916119158, 0.0026159263293452756, 0.002613393144002779, 0.0026487673438309743, 0.002632129637534306, 0.9491139733031501, 0.0025906929548655954, 0.0026787182255363895, 0.0028407421271809718, 0.0026048515367134216, 0.0027429822270641597, 0.0025900985022853634, 0.0026988244152643384, 0.002832698567588615, 0.0027909744449797383))","List(0.0027520966, 0.002594072, 0.002589586, 0.0026458881, 0.0026386857, 0.002784899, 0.0026159263, 0.0026133931, 0.0026487673, 0.0026321297, 0.94911397, 0.002590693, 0.0026787182, 0.0028407422, 0.0026048515, 0.0027429822, 0.0025900984, 0.0026988245, 0.0028326986, 0.0027909745)",10
AATCC Journal of Research,"List(aatcc, journal, research)","List(0, 5046, List(0, 4, 1853), List(1.0, 1.0, 1.0))","List(0, 5046, List(0, 4, 1853), List(1.095159757495081, 2.7440153497957254, 8.305978210967302))","List(1, 20, List(), List(0.003775146217217353, 0.0035583783445870265, 0.0035522245893525243, 0.0036294564649501644, 0.003619576704053768, 0.0038201423222796508, 0.003588356761417358, 0.0035848819116176545, 0.0036334059278407445, 0.003610583398953163, 0.0035710759279927876, 0.003553743272047973, 0.003674490580011762, 0.0038967452822348037, 0.0035731650883441394, 0.9303894412425391, 0.0035529278450220554, 0.00370207094569577, 0.0038857108246550497, 0.003828476349187125))","List(0.0037751463, 0.0035583784, 0.0035522245, 0.0036294565, 0.0036195768, 0.0038201422, 0.0035883568, 0.003584882, 0.003633406, 0.0036105835, 0.0035710759, 0.0035537432, 0.0036744906, 0.0038967454, 0.003573165, 0.93038946, 0.0035529279, 0.003702071, 0.0038857108, 0.0038284764)",15
AATCC REVIEW,"List(aatcc, review)","List(0, 5046, List(6, 1853), List(1.0, 1.0))","List(0, 5046, List(6, 1853), List(3.1111712207889295, 8.305978210967302))","List(1, 20, List(), List(0.003997163822228351, 0.003767647801168517, 0.003761132142111864, 0.003842906044470136, 0.003832445248120639, 0.00404480629946974, 0.0037993892604773387, 0.0037957100535705654, 0.0038470877751292888, 0.0038229230543441223, 0.0037810921348243767, 0.003762740140070007, 0.003890588638031995, 0.004125913233519571, 0.003783304159808825, 0.9262956213527066, 0.00376187675502603, 0.003919791001126379, 0.004114230738503625, 0.004053630345291869))","List(0.003997164, 0.0037676478, 0.0037611322, 0.003842906, 0.0038324452, 0.004044806, 0.0037993893, 0.0037957102, 0.0038470877, 0.003822923, 0.0037810921, 0.0037627402, 0.0038905886, 0.004125913, 0.0037833042, 0.92629564, 0.0037618768, 0.003919791, 0.0041142306, 0.0040536304)",15
Abacus-A Journal of Accounting Finance and Business Studies,"List(abacu, a, journal, account, financ, busi, studi)","List(0, 5046, List(0, 20, 48, 128, 183, 203, 4097), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 5046, List(0, 20, 48, 128, 183, 203, 4097), List(1.095159757495081, 3.9323198259639365, 4.642416564837655, 5.472764866911086, 5.849242438145998, 5.970603295150265, 8.711443319075466))","List(1, 20, List(), List(0.001350604278841536, 0.001273052959361782, 0.0012708513790235913, 0.0012984820165455994, 0.0012949474104523282, 0.0013667021857646832, 0.0012837781015896453, 0.6858276835553105, 0.0012998949838067548, 0.0012917299503601336, 0.0012775956739391019, 0.0012713947082608108, 0.0013145935185896905, 0.290598807183597, 0.001278343096525014, 0.0013461313913691595, 0.0012711029767077378, 0.0013244607240794663, 0.0013901601175429939, 0.001369683788332579))","List(0.0013506042, 0.001273053, 0.0012708514, 0.0012984821, 0.0012949475, 0.0013667021, 0.0012837781, 0.6858277, 0.001299895, 0.00129173, 0.0012775956, 0.0012713947, 0.0013145935, 0.2905988, 0.0012783431, 0.0013461314, 0.0012711029, 0.0013244607, 0.0013901601, 0.0013696838)",7


In [15]:
display(df_model.join(df_terms, ['topic']).select('topic', 'Full title', 'topic_words'))

topic,Full title,topic_words
19,2D Materials,"List(appli, econom, journal, inform, of)"
11,3 Biotech,"List(environ, work, associ, disord, ac)"
19,3D Printing and Additive Manufacturing,"List(appli, econom, journal, inform, of)"
1,4OR-A Quarterly Journal of Operations Research,"List(acta, pharmaceut, design, famili, e)"
5,AAPG BULLETIN,"List(scienc, review, nurs, plant, of)"
10,AAPS Journal,"List(method, pharmacolog, conserv, china, univers)"
10,AAPS PHARMSCITECH,"List(method, pharmacolog, conserv, china, univers)"
15,AATCC Journal of Research,"List(intern, studi, law, languag, polit)"
15,AATCC REVIEW,"List(intern, studi, law, languag, polit)"
7,Abacus-A Journal of Accounting Finance and Business Studies,"List(de, revista, water, busi, therapeut)"
