In [2]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data: every input line is split on single spaces and
# each token is converted to float, giving one dense vector per line.
data = sc.textFile("../spark/data/mllib/sample_lda_data.txt")
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs: zipWithIndex yields (vector, index), which
# is flipped to [index, vector] before being cached for training.
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into topics using LDA. The topic count is defined
# once so that training and the reporting loop below cannot drift apart.
num_topics = 3
ldaModel = LDA.train(corpus, k=num_topics)

# Output topics. Each is a distribution over words (matching word count vectors)
# The vocabulary size is fetched once instead of on every loop iteration.
vocab_size = ldaModel.vocabSize()
print("Learned topics (as distributions over vocab of " + str(vocab_size) + " words):")
# The matrix is indexed as topics[word][topic]: rows are words, columns topics.
topics = ldaModel.topicsMatrix()
for topic in range(num_topics):
    print("Topic " + str(topic) + ":")
    for word in range(vocab_size):
        print(" " + str(topics[word][topic]))

# Save and load model
# NOTE(review): re-running this cell may fail if "myModelPath" already
# exists from a previous run -- confirm and clean up the path if needed.
ldaModel.save(sc, "myModelPath")
sameModel = LDAModel.load(sc, "myModelPath")

Learned topics (as distributions over vocab of 11 words):
Topic 0:
 4.24836273916
 10.9940049657
 2.93073243479
 17.5312296012
 4.9656599241
 2.28202279826
 12.3791330609
 1.70762915807
 4.53354242286
 14.0388408945
 12.2795276872
Topic 1:
 9.48618202868
 9.00112275964
 5.42156337187
 9.79756009919
 10.5584896311
 14.8385325843
 11.7591616338
 4.08371530515
 1.56831183324
 3.81156608184
 5.97742047833
Topic 2:
 12.2654552322
 9.00487227465
 3.64770419334
 12.6712102996
 9.47585044478
 4.87944461741
 6.86170530522
 4.20865553678
 1.8981457439
 6.14959302369
 14.7430518345
