In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to cloud.r-project.org (13.227.219.25)] [W                                                                               Get:2 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
                                                                               Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
                                                                               Hit:5 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
0% [2 InRelease 47.5 kB/88.7 kB 54%] [Waiting for headers] [Connecting to ppa.l0% [1 InRelease gpgv 242 kB] [2 InRelease 47.5 kB/88.7 kB 54%] [Waiting for hea                                                

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Start a SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("word2vec").getOrCreate()

In [4]:
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml.feature import Word2Vec
import string


In [5]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="/content/gdrive/MyDrive/Project3_BeerQuality/Resources/reviews_beer_brewery.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("reviews_beer_brewery.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+-------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+----+----------+--------------------+---------------+-------------+-----------------+
|beer_id|        username|      date|                text|                look|               smell|               taste|                feel|             overall|               score|           beer_name|review_state|               style|        availability| abv|brewery_id|        brewery_name|   brewery_city|brewery_state|    brewery_types|
+-------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+----+----------+--------------------+-----------

In [6]:
#df.show(truncate=False)
text_df = df.select('text')
text_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
def remove_punctuation(txt):
  txt = txt.strip('\xa0\xa0 ')
  return "".join(l if l not in string.punctuation else "" for l in txt)

In [8]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, udf

remove_punctuation_udf = udf(remove_punctuation, StringType())
remove_punctuation_udf

<function __main__.remove_punctuation>

In [9]:
text_df = text_df.withColumn("clean_text", remove_punctuation_udf(col("text")))
text_df.show()

+--------------------+--------------------+
|                text|          clean_text|
+--------------------+--------------------+
|   750 ml bottle,...|750 ml bottle 201...|
|   Clear gold in ...|Clear gold in col...|
|   Reminds me of ...|Reminds me of pre...|
|   Pale yellow bo...|Pale yellow body ...|
|   The CANQuest (...|The CANQuest tm i...|
|   I got a Paybac...|I got a Payback P...|
|"   Can with ""11...|   Can with 11111...|
|   Enjoyed from t...|Enjoyed from the ...|
|   Pours a golden...|Pours a golden ye...|
|   This thing is ...|This thing is gor...|
|                  0%|                   0|
|"   16oz draft A:...|   16oz draft A D...|
|   The chipotle p...|The chipotle pepp...|
|   I want this po...|I want this porte...|
|   12 FL. OZ. bot...|12 FL OZ bottle S...|
|   pretty good fo...|pretty good for i...|
|   I bought a 6pc...|I bought a 6pck a...|
|   Pours a medium...|Pours a medium br...|
|   12 ounce can i...|12 ounce can into...|
|   Porter first a...|Porter fir

In [10]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml import Pipeline

# "Creating pipeline..."
tokenizer = Tokenizer(inputCol="clean_text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='features')

pipeline = Pipeline(stages=[tokenizer, stopremove])

# "Training model..."
pipeline_stg = pipeline.fit(text_df)
final_df = pipeline_stg.transform(text_df)
# final_df.show()

In [11]:
final_df.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|          clean_text|          token_text|            features|
+--------------------+--------------------+--------------------+--------------------+
|   750 ml bottle,...|750 ml bottle 201...|[750, ml, bottle,...|[750, ml, bottle,...|
|   Clear gold in ...|Clear gold in col...|[clear, gold, in,...|[clear, gold, col...|
|   Reminds me of ...|Reminds me of pre...|[reminds, me, of,...|[reminds, pretty,...|
|   Pale yellow bo...|Pale yellow body ...|[pale, yellow, bo...|[pale, yellow, bo...|
|   The CANQuest (...|The CANQuest tm i...|[the, canquest, t...|[canquest, tm, ho...|
|   I got a Paybac...|I got a Payback P...|[i, got, a, payba...|[got, payback, pi...|
|"   Can with ""11...|   Can with 11111...|[  , can, with, 1...|[  , 1111161211, ...|
|   Enjoyed from t...|Enjoyed from the ...|[enjoyed, from, t...|[enjoyed, 12, oz,...|
|   Pours a golden...|Pours a golden ye...|[pours, a, 

In [12]:
word2vec = Word2Vec(
    vectorSize=100,
    seed=42,
    inputCol="features",
    outputCol="model"
).setMaxIter(2)

In [13]:
model = word2vec.fit(final_df)

In [14]:
model.getVectors().show()

+----------------+--------------------+
|            word|              vector|
+----------------+--------------------+
|           mells|[0.11464476585388...|
|         tasties|[-0.0207345690578...|
|sidebysidebyside|[-0.1547902226448...|
|          profle|[-0.1289321035146...|
|       professed|[0.04754043370485...|
|           31211|[-0.0228820666670...|
|      machfive55|[-0.0529651381075...|
|          300day|[-0.1796194314956...|
|      iteresting|[0.01549587678164...|
|           chary|[-0.1552485525608...|
|      nonfruited|[-0.0256271287798...|
|   eyewateringly|[0.05206718668341...|
|           motts|[0.01467642001807...|
| appearancesmell|[0.01505822129547...|
|        northend|[0.04490714147686...|
|     pepperlemon|[0.01769251562654...|
|        likejust|[-0.0436878018081...|
|        quotient|[-0.1344485729932...|
|        fiziness|[0.04230001941323...|
|     herballemon|[-0.0435456186532...|
+----------------+--------------------+
only showing top 20 rows



In [18]:
model.findSynonymsArray("malty", 5)

[('maltly', 0.7657453417778015),
 ('maltiness', 0.740445077419281),
 ('matly', 0.7303434014320374),
 ('nutty', 0.6969457864761353),
 ('maltyness', 0.6947509050369263)]