In [1]:
!pip3 install nltk

[33mYou are using pip version 8.1.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords

In [4]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.clustering import LDA

In [5]:
N_TOPICS = 20
MAX_TERMS = 5

In [6]:
stopwords = set(stopwords.words('english')).union({
    'introduction', 'edition', 'series', 'application',
    'approach', 'card', 'access', 'package', 'plus', 'etext',
    'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',
    'third', 'second', 'fourth'})

sc = SparkContext('local', 'nlp')
lines = sc.textFile('all_book_titles.txt')

In [7]:
lines = lines \
    .map(lambda line: line.strip().lower()) \
    .map(lambda line: line.split()) \
    .map(lambda words: [w for w in words if w.isalpha()]) \
    .map(lambda words: [w for w in words if len(w) > 3]) \
    .map(lambda words: [w for w in words if w not in stopwords]) \
    .zipWithIndex()

In [9]:
sess = SparkSession.builder.appName('nlp').getOrCreate()
df = sess.createDataFrame(lines, ['words', 'idx'])

In [11]:
df.head(5)

[Row(words=['philosophy', 'love', 'reader'], idx=0),
 Row(words=['readings', 'islam'], idx=1),
 Row(words=['microprocessors', 'principles', 'applications'], idx=2),
 Row(words=['bernhard', 'edouard', 'story', 'north', 'american', 'forestry'], idx=3),
 Row(words=['encyclopedia', 'buddhism'], idx=4)]

In [12]:
cv = CountVectorizer(inputCol='words',
                     outputCol='tf')
cv = cv.fit(df)
df = cv.transform(df)
df = IDF(inputCol='tf',
         outputCol='tfidf').fit(df).transform(df)

lda = LDA(k=N_TOPICS,
          featuresCol='tfidf',
          optimizer='em').fit(df)

In [13]:
for i, indices in enumerate(lda.describeTopics(MAX_TERMS).toPandas().termIndices):
    print('Topic %d:'%(i+1), ' '.join([cv.vocabulary[idx] for idx in indices]))

Topic 1: probability statistics physics engineers game
Topic 2: accounting ethics practice studies advanced
Topic 3: language natural processing care information
Topic 4: perspective theater text essentials states
Topic 5: computer political organization architecture society
Topic 6: physical philosophy pharmacology readings nursing
Topic 7: understanding study pathophysiology literature judaism
Topic 8: anthropology geology version programming using
Topic 9: basic history evolution feminism concise
Topic 10: life science handbook practical medicine
Topic 11: history foundations sociology ecology earth
Topic 12: methods global learning marketing research
Topic 13: design real laboratory analysis finance
Topic 14: financial business security managerial relativity
Topic 15: mechanics engineering international structures quantum
Topic 16: manual student actuarial american solutions
Topic 17: buddhism writing networks microprocessor microbiology
Topic 18: asian east early molecular social
