In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:13 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:14 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:15 http://ppa.launchpad.net/graphic

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Start a SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("word2vec").getOrCreate()

In [4]:
# Import needed libs
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml.feature import Word2Vec, Word2VecModel
import string


In [5]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="/content/gdrive/MyDrive/Project3_BeerQuality/Resources/reviews_beer_brewery.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("reviews_beer_brewery.csv"), header=True)

# Show DataFrame
df.show()

+-------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+----+----------+--------------------+---------------+-------------+-----------------+
|beer_id|        username|      date|                text|                look|               smell|               taste|                feel|             overall|               score|           beer_name|review_state|               style|        availability| abv|brewery_id|        brewery_name|   brewery_city|brewery_state|    brewery_types|
+-------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+----+----------+--------------------+-----------

In [6]:
# Check the schema for numeric conversion
df.printSchema

<bound method DataFrame.printSchema of DataFrame[beer_id: string, username: string, date: string, text: string, look: string, smell: string, taste: string, feel: string, overall: string, score: string, beer_name: string, review_state: string, style: string, availability: string, abv: string, brewery_id: string, brewery_name: string, brewery_city: string, brewery_state: string, brewery_types: string]>

In [7]:
df.count()

2127677

In [8]:
# remove alpha characters from data set on review numerics
from pyspark.sql.functions import regexp_extract, col
pattern = r'^\d+.\d+$'
df = df.filter(regexp_extract(col('look'), pattern, 0) != '')

In [9]:
# confirm look, smell, taste, overall, and score
df.show()

+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+------------+-------------+-------------+
|beer_id|        username|      date|                text|look|smell|taste|feel|overall|score|           beer_name|review_state|               style|        availability| abv|brewery_id|        brewery_name|brewery_city|brewery_state|brewery_types|
+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+------------+-------------+-------------+
| 271781|    bluejacket74|2017-03-17|   750 ml bottle,...| 4.0|  4.0|  4.0|4.25|    4.0| 4.03|Motorbreath Imper...|          OH|American Imperial...|Limited (brewed o...|10.8|     28094|Four String Brewi...|    Columbus|           OH| Brewery, Bar|
| 18

In [10]:
df.count()

1973277

In [11]:
# Next 6 lines casts to floats all the review ratings
df = df.withColumn('look_numeric', df['look'].cast('float'))

In [12]:
df = df.withColumn('smell_numeric', df['smell'].cast('float'))

In [13]:
df = df.withColumn('taste_numeric', df['taste'].cast('float'))

In [14]:
df = df.withColumn('feel_numeric', df['feel'].cast('float'))

In [15]:
df = df.withColumn('overall_numeric', df['overall'].cast('float'))

In [16]:
df = df.withColumn('score_numeric', df['score'].cast('float'))

In [17]:
# Confirm data types
df.printSchema

<bound method DataFrame.printSchema of DataFrame[beer_id: string, username: string, date: string, text: string, look: string, smell: string, taste: string, feel: string, overall: string, score: string, beer_name: string, review_state: string, style: string, availability: string, abv: string, brewery_id: string, brewery_name: string, brewery_city: string, brewery_state: string, brewery_types: string, look_numeric: float, smell_numeric: float, taste_numeric: float, feel_numeric: float, overall_numeric: float, score_numeric: float]>

In [18]:
# functions for manipulating review text into a format we can tokenize
def remove_punctuation(txt):
  txt = txt.strip('\xa0\xa0 ')
  return "".join(l if l not in string.punctuation else "" for l in txt)

def remove_space (text):
  if text:
    return text.replace(' ', '')
  else:
    return 'Unknown'
  

In [19]:
# define the udfs for the data
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, udf

remove_punctuation_udf = udf(remove_punctuation, StringType())
remove_punctuation_udf

remove_space_udf = udf(remove_space, StringType())
remove_space_udf

<function __main__.remove_space>

In [20]:
# clean the beer name by removing spaces (will be combined with text)
beer_df = df.withColumn('beer_clean', remove_space_udf(col('beer_name')))
beer_df.show()

+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+------------+-------------+-------------+------------+-------------+-------------+------------+---------------+-------------+--------------------+
|beer_id|        username|      date|                text|look|smell|taste|feel|overall|score|           beer_name|review_state|               style|        availability| abv|brewery_id|        brewery_name|brewery_city|brewery_state|brewery_types|look_numeric|smell_numeric|taste_numeric|feel_numeric|overall_numeric|score_numeric|          beer_clean|
+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+------------+-------------+-------------+------------+-------------+-------------+---

In [21]:
beer_df.count()

1973277

In [22]:
# clean the style name by removing spaces (will be combined with text)
style_df = beer_df.withColumn('style_clean', remove_space_udf(col('style')))
style_df.show()

+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+------------+-------------+-------------+------------+-------------+-------------+------------+---------------+-------------+--------------------+--------------------+
|beer_id|        username|      date|                text|look|smell|taste|feel|overall|score|           beer_name|review_state|               style|        availability| abv|brewery_id|        brewery_name|brewery_city|brewery_state|brewery_types|look_numeric|smell_numeric|taste_numeric|feel_numeric|overall_numeric|score_numeric|          beer_clean|         style_clean|
+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+------------+-------------+-------------+--

In [23]:
style_df.count()

1973277

In [24]:
# clean the text by removing punctuation
text_df = style_df.withColumn("clean_text", remove_punctuation_udf(col("text")))
text_df.show()

+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+------------+-------------+-------------+------------+-------------+-------------+------------+---------------+-------------+--------------------+--------------------+--------------------+
|beer_id|        username|      date|                text|look|smell|taste|feel|overall|score|           beer_name|review_state|               style|        availability| abv|brewery_id|        brewery_name|brewery_city|brewery_state|brewery_types|look_numeric|smell_numeric|taste_numeric|feel_numeric|overall_numeric|score_numeric|          beer_clean|         style_clean|          clean_text|
+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+-

In [25]:
text_df.count()

1973277

In [26]:
# Combine beer_name, style, and review text into a Final_text column
from pyspark.sql.functions import concat_ws

combined_df = text_df.withColumn('Final_text', concat_ws(' ',text_df.beer_clean, text_df.style_clean, text_df.clean_text))
combined_df.select('text', 'Final_text').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [27]:
combined_df.count()

1973277

In [28]:
# Setup and execute Tokenizer and Stop words pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml import Pipeline

# "Creating pipeline..."
tokenizer = Tokenizer(inputCol="Final_text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='features')

pipeline = Pipeline(stages=[tokenizer, stopremove])

# "Training model..."
pipeline_stg = pipeline.fit(combined_df)
final_df = pipeline_stg.transform(combined_df)
# final_df.show()

In [29]:
final_df.show()

+-------+----------------+----------+--------------------+----+-----+-----+----+-------+-----+--------------------+------------+--------------------+--------------------+----+----------+--------------------+------------+-------------+-------------+------------+-------------+-------------+------------+---------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|beer_id|        username|      date|                text|look|smell|taste|feel|overall|score|           beer_name|review_state|               style|        availability| abv|brewery_id|        brewery_name|brewery_city|brewery_state|brewery_types|look_numeric|smell_numeric|taste_numeric|feel_numeric|overall_numeric|score_numeric|          beer_clean|         style_clean|          clean_text|          Final_text|          token_text|            features|
+-------+----------------+----------+--------------------+----+-----+-----+----+--

In [30]:
final_df.count()

1973277

In [57]:
final_df = final_df.select('score_numeric','taste_numeric','features')

In [58]:
final_df.show()

+-------------+-------------+--------------------+
|score_numeric|taste_numeric|            features|
+-------------+-------------+--------------------+
|         4.03|          4.0|[motorbreathimper...|
|          3.7|          3.5|[paybackpilsner, ...|
|         4.08|         4.25|[paybackpilsner, ...|
|         3.43|          3.5|[paybackpilsner, ...|
|         3.58|          3.5|[paybackpilsner, ...|
|         3.52|          3.5|[paybackpilsner, ...|
|         3.36|         3.25|[paybackpilsner, ...|
|         3.88|          4.0|[paybackpilsner, ...|
|         3.61|          3.5|[paybackpilsner, ...|
|         3.53|          3.5|[paybackpilsner, ...|
|         3.38|          3.5|[hellhammer, ryeb...|
|         4.24|          4.5|[hellhammer, ryeb...|
|          3.0|          3.0|[vanillaporter, a...|
|         3.85|         3.75|[vanillaporter, a...|
|          3.5|         3.75|[vanillaporter, a...|
|         3.53|          3.5|[vanillaporter, a...|
|         2.52|          2.5|[v

In [59]:
# Create w2v model hyperparameters
#word2vec = Word2Vec(
#    vectorSize=65,
#    seed=42,
#    inputCol="features",
#    outputCol="model"
#).setMaxIter(2)

In [60]:
# Fit the model, this takes time
#w2v_review_model = word2vec.fit(final_df)

In [61]:
# show some model vectors
#w2v_review_model.getVectors().show()

In [62]:
# check the model accuracy by checking synonyms 
#w2v_review_model.findSynonymsArray("light", 5)

In [63]:
# write the model to disk
#w2v_review_model.write().overwrite().save("/content/gdrive/MyDrive/Project3_BeerQuality/review.md")

In [64]:
# Read the model from disk to confirm
w2v_loaded_model = Word2VecModel.load('/content/gdrive/MyDrive/Project3_BeerQuality/review.md')

In [65]:
# check to see if synonym is consistent with loaded model
w2v_loaded_model.findSynonymsArray("light", 5)

[('medium', 0.6914238929748535),
 ('mostly', 0.6894241571426392),
 ('ligh', 0.6799790859222412),
 ('slight', 0.6678677201271057),
 ('mild', 0.6617092490196228)]

In [66]:
# Transform the mode
w2v_model_df = w2v_loaded_model.transform(final_df)

In [67]:
w2v_model_df.show()

+-------------+-------------+--------------------+--------------------+
|score_numeric|taste_numeric|            features|               model|
+-------------+-------------+--------------------+--------------------+
|         4.03|          4.0|[motorbreathimper...|[0.01448507781083...|
|          3.7|          3.5|[paybackpilsner, ...|[-0.0253503741696...|
|         4.08|         4.25|[paybackpilsner, ...|[0.18410651786969...|
|         3.43|          3.5|[paybackpilsner, ...|[0.06293463234656...|
|         3.58|          3.5|[paybackpilsner, ...|[0.05097741415932...|
|         3.52|          3.5|[paybackpilsner, ...|[-0.0508729857910...|
|         3.36|         3.25|[paybackpilsner, ...|[0.00700588056351...|
|         3.88|          4.0|[paybackpilsner, ...|[0.05808807828245...|
|         3.61|          3.5|[paybackpilsner, ...|[0.00684954653171...|
|         3.53|          3.5|[paybackpilsner, ...|[0.10990913212299...|
|         3.38|          3.5|[hellhammer, ryeb...|[0.03865336287

In [68]:
w2v_model_df = w2v_model_df.select('score_numeric', 'taste_numeric', 'model')

In [69]:
# Prepare the hyperparameters
#from pyspark.ml.regression import LinearRegression, LinearRegressionModel
#lin_model = LinearRegression(
#    maxIter=5,
#    regParam=0.3,
#    featuresCol="model",
#    labelCol="look_numeric"
#)

In [70]:
w2v_model_df.cache()
w2v_model_df.show()

+-------------+-------------+--------------------+
|score_numeric|taste_numeric|               model|
+-------------+-------------+--------------------+
|         4.03|          4.0|[0.01448507781083...|
|          3.7|          3.5|[-0.0253503741696...|
|         4.08|         4.25|[0.18410651786969...|
|         3.43|          3.5|[0.06293463234656...|
|         3.58|          3.5|[0.05097741415932...|
|         3.52|          3.5|[-0.0508729857910...|
|         3.36|         3.25|[0.00700588056351...|
|         3.88|          4.0|[0.05808807828245...|
|         3.61|          3.5|[0.00684954653171...|
|         3.53|          3.5|[0.10990913212299...|
|         3.38|          3.5|[0.03865336287325...|
|         4.24|          4.5|[-0.0377690620720...|
|          3.0|          3.0|[0.07743978367320...|
|         3.85|         3.75|[0.03384895505777...|
|          3.5|         3.75|[0.02083584428247...|
|         3.53|          3.5|[0.13457064807415...|
|         2.52|          2.5|[0

In [71]:
# split training and testing for the model (for temporary make the training set small as processing takes time)
training, testing = w2v_model_df.randomSplit([0.20, 0.80])

In [72]:
training.show()

+-------------+-------------+--------------------+
|score_numeric|taste_numeric|               model|
+-------------+-------------+--------------------+
|          1.0|          1.0|[-0.2017881472905...|
|          1.0|          1.0|[-0.1238862046040...|
|          1.0|          1.0|[-0.0939474090312...|
|          1.0|          1.0|[-0.0446819979697...|
|          1.0|          1.0|[-0.0388479160144...|
|          1.0|          1.0|[-0.0239243041723...|
|          1.0|          1.0|[-0.0021479175026...|
|          1.0|          1.0|[0.00528172007010...|
|          1.0|          1.0|[0.02388491268573...|
|          1.0|          1.0|[0.02613499033818...|
|          1.0|          1.0|[0.02968549927075...|
|          1.0|          1.0|[0.05302989255223...|
|         1.02|          1.0|[-0.0301269678748...|
|         1.03|          1.0|[-0.0169512278927...|
|         1.06|          1.0|[0.03426218031717...|
|         1.06|          1.0|[0.11149088378685...|
|         1.09|          1.0|[-

In [73]:
training_pandas_df = training.toPandas()

In [74]:
training_pandas_df.head()

Unnamed: 0,score_numeric,taste_numeric,model
0,1.0,1.0,"[-0.20178814729054767, 0.28267745922009146, -0..."
1,1.0,1.0,"[-0.12388620460405946, 0.22306803464889527, 0...."
2,1.0,1.0,"[-0.0939474090312918, 0.008056570310145617, -0..."
3,1.0,1.0,"[-0.04468199796974659, -0.013651053421199322, ..."
4,1.0,1.0,"[-0.038847916014492515, 0.15747563429176809, -..."


In [75]:
training_pandas_df.dtypes

score_numeric    float32
taste_numeric    float32
model             object
dtype: object

In [76]:
from sklearn.linear_model import LinearRegression
pandas_lin_model = LinearRegression()

In [77]:
lin_score_train_model = pandas_lin_model.fit(training_pandas_df['model'].to_list(),training_pandas_df['score_numeric'])

In [78]:
lin_score_train_model.score(training_pandas_df['model'].to_list(), training_pandas_df['score_numeric'])

0.4251067079418296

In [79]:
lin_score_train_model.coef_

array([ 0.42277002, -0.7939244 , -0.04139255, -1.6107777 ,  0.28145222,
       -0.17996243,  0.64224478, -0.13176626, -0.45666324, -0.56155694,
       -1.01648556,  0.41704179,  0.31826246,  0.25544191,  0.43563811,
        0.23355511, -0.86236149,  0.11423134, -0.98381288, -0.38529516,
        0.83746001, -0.86949696, -0.08163584,  0.78871464,  0.73032589,
        0.08029279,  0.09271434, -2.0914658 , -0.00628869,  0.23315139,
       -0.0242725 , -0.83563644,  1.11810759, -0.56981031, -0.72468188,
       -1.2551918 ,  0.10996226, -0.19029544, -0.78600287, -0.43412342,
        0.70019408,  0.02108205, -0.15271623, -0.39443487, -0.31112294,
       -0.92826364, -0.24953627,  0.23623739, -0.548317  ,  0.23460257,
        0.18414753, -0.27214456,  1.31250641, -0.11598821,  0.22573192,
        0.14551689, -0.07501608,  0.90412005, -0.85160382, -1.35754163,
       -1.57395239,  0.04388782,  0.17926557,  0.06271269,  0.15688974])

In [80]:
lin_taste_train_model = pandas_lin_model.fit(training_pandas_df['model'].to_list(),training_pandas_df['taste_numeric'])

In [81]:
lin_taste_train_model.score(training_pandas_df['model'].to_list(), training_pandas_df['taste_numeric'])

0.3758025132084842

In [82]:
lin_taste_train_model.coef_

array([ 6.08758290e-01, -9.52480871e-01, -9.11983179e-04, -1.91178273e+00,
        2.81028436e-01, -2.24600564e-01,  6.89343654e-01, -1.95331033e-01,
       -4.85949639e-01, -6.61254259e-01, -9.67178206e-01,  4.00373465e-01,
        3.61746696e-01,  2.64063408e-01,  4.92971327e-01,  1.45859690e-01,
       -1.00609659e+00,  2.00891209e-01, -1.01424635e+00, -3.66510346e-01,
        9.76006487e-01, -9.27140887e-01, -8.34261488e-02,  7.78720676e-01,
        7.78528038e-01,  1.05194880e-01,  6.70997604e-02, -2.31911618e+00,
        6.23384153e-02,  3.14882382e-01, -5.85864346e-02, -8.76174433e-01,
        1.24779938e+00, -6.14775071e-01, -8.32065594e-01, -1.29017051e+00,
        7.33183720e-02, -1.84402719e-01, -8.85419888e-01, -4.58230963e-01,
        8.21313833e-01, -1.71611098e-02, -1.00566910e-01, -4.80925938e-01,
       -3.49240327e-01, -9.80390086e-01, -4.01994630e-01,  2.48504792e-01,
       -5.57412977e-01,  3.01468299e-01,  2.41856242e-01, -2.99042668e-01,
        1.42747647e+00, -

In [83]:
# Fit the look review to a model (takes time)
#lin_score_train_model = lin_model.fit(training)

In [84]:
# Save the model for later use
#lin_look_train_model.write().overwrite().save("/content/gdrive/MyDrive/Project3_BeerQuality/lin_score_train.md")

In [85]:
# Read the model
loaded_lin_score_train_model = LinearRegressionModel.load("/content/gdrive/MyDrive/Project3_BeerQuality/lin_score_train.md")

NameError: ignored

In [None]:
# Transform the training data to peek at the predictions
lin_score_df = loaded_lin_score_train_model.transform(training)

In [None]:
# show a sample of the predictions
lin_score_df.select('look_numeric', 'prediction', 'Final_text').show(truncate=False)

In [None]:
loaded_lin_score_train_model = LinearRegressionModel.load("/content/gdrive/MyDrive/Project3_BeerQuality/lin_score_train.md")

In [None]:
lin_score_df = loaded_lin_score_train_model.transform(training)

In [None]:
lin_score_df = loaded_lin_score_train_model.transform(training)

In [None]:
#Experiment
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#evaluator = MulticlassClassificationEvaluator(
#    labelCol="look_numeric", 
#    predictionCol="prediction", 
#    metricName="accuracy"
#)

#accuracy = evaluator.evaluate(lin_score_df)

#print("Accuracy of model at predicting reviews was: %f" % accuracy)

In [None]:
print(training.rdd.getNumPartitions())

In [None]:
partitioned_df = training.repartition(2)

In [None]:
print(partitioned_df.rdd.getNumPartitions())

In [None]:
print(partitioned_df.count())

In [None]:
print(spark.sparkContext.defaultParallelism)

In [None]:
s = spark.sparkContext._jsc.sc().getExecutorMemoryStatus().keys()
print(s)
l = str(s).replace("Set(","").replace(")","").split(", ")
print(l)

d = set()
for i in l:
    d.add(i.split(":")[0])
print(len(d))

print(spark.sparkContext._jsc.sc().getExecutorMemoryStatus().size())

In [None]:
spark.sparkContext._conf.getAll() 

In [None]:
spark.sparkContext._conf.get('spark.driver.maxResultSize')

In [None]:
!cat /proc/cpuinfo

In [None]:
!cat /proc/meminfo

In [None]:
!ps -ef | grep java