In [1]:
!pip3 install nltk

[33mYou are using pip version 8.1.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords

In [4]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.feature import Word2Vec
from operator import add

In [5]:
stopwords = set(stopwords.words('english')).union({
    'introduction', 'edition', 'series', 'application',
    'approach', 'card', 'access', 'package', 'plus', 'etext',
    'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',
    'third', 'second', 'fourth'})

sc = SparkContext('local', 'nlp')
lines = sc.textFile('all_book_titles.txt')

In [6]:
lines = lines \
    .map(lambda line: line.strip().lower()) \
    .map(lambda line: line.split()) \
    .map(lambda words: [w for w in words if w.isalpha()]) \
    .map(lambda words: [w for w in words if len(w) > 3]) \
    .map(lambda words: [w for w in words if w not in stopwords]).reduce(add)

In [7]:
sess = SparkSession.builder.appName('nlp').getOrCreate()

In [8]:
df = sess.createDataFrame([(lines,), (lines,)], ["sentence"])

In [9]:
df.show()

+--------------------+
|            sentence|
+--------------------+
|[philosophy, love...|
|[philosophy, love...|
+--------------------+



In [10]:
word2Vec = Word2Vec(vectorSize=64, inputCol="sentence", outputCol="model")

In [11]:
model = word2Vec.fit(df)

In [12]:
model.getVectors().show()

+------------+--------------------+
|        word|              vector|
+------------+--------------------+
|    embedded|[0.02133640460669...|
|    feminism|[0.03890470042824...|
|       unity|[-0.0120630841702...|
|  conceptual|[-0.0345687717199...|
|   reference|[-0.0031733114738...|
|    workbook|[0.01257053948938...|
|     writing|[-0.0387185476720...|
|    elements|[-0.0082433093339...|
|    discrete|[0.01831728033721...|
|    semester|[-0.0221984051167...|
|     measure|[0.02612338773906...|
|      health|[0.09340767562389...|
|     statics|[-0.0189717188477...|
|  perceptive|[-0.0406614504754...|
|organization|[0.19586075842380...|
|    database|[0.05999059230089...|
|       moral|[9.99550218693912...|
|     nursing|[0.11989702284336...|
|    politics|[-0.0118664680048...|
|    eleventh|[-0.0475022606551...|
+------------+--------------------+
only showing top 20 rows



In [13]:
from pyspark.sql.functions import format_number as fmt

In [14]:
model.findSynonyms("politics", 20).select("word", fmt("similarity", 5).alias("similarity")).show()

+---------+----------+
|     word|similarity|
+---------+----------+
| american|   0.75948|
|   survey|   0.73565|
|    latin|   0.70667|
|  society|   0.69553|
|democracy|   0.69158|
| diaspora|   0.68637|
| ultimate|   0.68329|
|  america|   0.68231|
|paperback|   0.68097|
|     west|   0.68061|
|  history|   0.67355|
|political|   0.66569|
| critical|   0.66167|
|sociology|   0.64909|
|  matters|   0.63970|
|   source|   0.62940|
|documents|   0.62534|
|  concise|   0.61480|
|    blood|   0.61417|
|     exam|   0.61149|
+---------+----------+

