In [1]:
## Load context
# Stretch the classic-notebook container to the full browser width so wide
# Spark tables are not wrapped.
# `IPython.display` is the public import location; importing these names from
# `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import re
import os
import pandas as pd
import numpy as np
# Node name of the machine running the driver; embedded in the Spark app name
# below so the application can be told apart in the cluster UI.
hostname = os.uname().nodename

from pyspark.sql import SparkSession

# BEN'S MASTER: 192.168.2.87
# OUR MASTER:   192.168.2.203

# New API
# Build (or reuse) the session against the standalone master. The builder chain
# is wrapped in parentheses so no line-continuation backslashes are needed.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.87:7077")
    .appName(f"load_local_comments; hostname: {hostname}")
    # Dynamic allocation together with the external shuffle service.
    .config("spark.dynamicAllocation.enabled", True)
    # Fixed ports for block managers and the driver
    # (presumably so they can be opened in a firewall -- TODO confirm).
    .config("spark.blockManager.port", "10025")
    .config("spark.driver.blockManager.port", "10026")
    .config("spark.driver.port", "10027")
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD)
spark_context = spark_session.sparkContext
# Last expression of the cell: shows the address of the Spark web UI.
spark_context.uiWebUrl

'http://host-192-168-2-247-ldsa:4040'

In [2]:
import pyspark.sql.functions as f

# Parse the line-delimited JSON sample with pandas on the driver, then hand the
# resulting frame to Spark and mark it for caching since later cells reuse it.
df_pd = pd.read_json(path_or_buf="sample_reddit_comments.json", lines=True)
df = spark_session.createDataFrame(data=df_pd)
df = df.cache()

In [3]:
from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.types import ArrayType, FloatType, StringType

In [4]:
# Remove the columns that are not needed for the frequent-pattern analysis,
# then print the remaining schema and a preview of the rows.
df_fp = df.drop(
    'permalink',
    'gilded',
    'author_flair_css_class',
    'can_gild',
    'author_flair_text',
    'author_cakeday',
)
df_fp.printSchema()
df_fp.show()

root
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: long (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)

+------------------+--------------------+----------------+-----------+-------------+------+-------+------------+---------+----------+------------+-----+--------+------------------+------------+
|            author|                body|controversiality|created_utc|distinguished|edited|     id|is_submitter|  link_id| parent_id|retrieved_on|score|stickied|         subreddit|subreddit_id|
+-

In [None]:
#import nltk
#from nltk.corpus import stopwords
#print(stopwords.words('english'))

In [52]:
# Filter functions
def to_basket_unique(comment):
    """Turn a comment into a list of its distinct lower-cased word tokens.

    Every run of non-word characters is treated as a separator. The order of
    the returned tokens is arbitrary because duplicates are removed via a set.

    Args:
        comment: The comment text, or None for a missing body.

    Returns:
        A list of unique tokens. Empty when the comment is None, empty, or
        contains no word characters.
    """
    if comment is None:
        # A null body would make re.sub raise; treat it as an empty basket.
        return []
    # split() without an argument discards empty strings, whereas split(' ')
    # on an empty string returns [''] and would add a bogus '' item.
    return list(set(re.sub(r'\W+', ' ', comment).lower().split()))
# Spark UDF wrapper around to_basket_unique; yields an array<string> column.
udf_to_basket_unique = F.udf(f=to_basket_unique, returnType=ArrayType(StringType()))


# Words removed from every basket. Built once as a frozenset so that each
# membership test is a hash lookup instead of a scan over a list that was
# previously rebuilt on every call.
# The first group appears to mirror NLTK's English stopword list (see the
# commented-out NLTK cell) -- unverified. In this notebook the baskets come from
# to_basket_unique, which strips apostrophes, so the contracted forms are kept
# only for completeness.
_STOPWORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
    'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
    'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
    'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
    'between', 'into', 'through', 'during', 'before', 'after', 'above',
    'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
    'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
    'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't",
    'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
    'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
    "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn',
    "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
    "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
    'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
    # Additional tokens excluded for this analysis (URL fragments, fillers).
    'https', 'www', 'one', 'would', 'come', 'really', 'also', 'com', 'gt', 'r',
    '737yli',
    'get', 'even', 'make', 'go', 'still', 'could', 'got', 'goes', '2', 'first',
    'going', 'right', 'sure', 'something',
])


def filter_words(basket):
    """Return the words of ``basket`` that are not stopwords.

    Args:
        basket: Iterable of word tokens, or None for a missing basket.

    Returns:
        A new list with the non-stopword tokens in their original order.
        Empty when ``basket`` is None or empty.
    """
    if basket is None:
        # Tolerate a null array coming from an upstream column.
        return []
    return [word for word in basket if word not in _STOPWORDS]
    
# Spark UDF wrapper around filter_words; yields an array<string> column.
udf_filter_words = F.udf(f=filter_words, returnType=ArrayType(StringType()))


In [53]:
# Replace each comment body with its basket: unique tokens first, stopwords
# removed second. Only that column is kept for FP-Growth.
df_fp1 = (
    df_fp
    .withColumn('body', udf_filter_words(udf_to_basket_unique('body')))
    .select('body')
)
df_fp1.take(10)

[Row(body=['quarry']),
 Row(body=['salutations', 'imgur', 'http', '9ttainh', 'said']),
 Row(body=['majors', 'crazy', 'cain', 'baseball', 'playing', 'teared', 'matt', 'bit', 'started', 'see', 'time']),
 Row(body=['tory', 'fucking']),
 Row(body=['see', 'water', 'dragon']),
 Row(body=['michigan', 'wait', 'club', 'like', 'state', 'u']),
 Row(body=['ye', 'fam']),
 Row(body=['2004', 'united', '2000', 'hillary', '143417804', 'buchanan', 'states', 'oldfag', '2012', 'dole', '1996', 'liakfevh', '2016', 'op', 'bush', '1984', 'obama', '1992', 'reagan', 'kerry', 'id', '143412250', 'anonymous', '2008', '1988']),
 Row(body=['chicken', 'salad', 'running', 'shit', 'outta']),
 Row(body=['know', 'rules'])]

In [54]:
# Mine frequent itemsets from the word baskets and fit the model.
fpGrowth = FPGrowth(
    itemsCol="body",
    minSupport=0.01,
    minConfidence=0.05,
)
model = fpGrowth.fit(df_fp1)

In [None]:
# Flatten the items of the 30 most frequent itemsets into a single list of
# words (a word appears once for every top itemset it belongs to).
top_itemset_rows = (
    model.freqItemsets
    .sort("freq", ascending=False)
    .select('items')
    .take(30)
)
extract_top = []
for itemset_row in top_itemset_rows:
    extract_top.extend(itemset_row.items)
print(extract_top)

In [56]:
# Display frequent itemsets, highest frequency first.
(
    model.freqItemsets
    .orderBy(F.col("freq").desc())
    .show()
)

# Display the generated association rules whose antecedent contains 'removed'.
(
    model.associationRules
    .where(F.array_contains('antecedent', 'removed'))
    .show()
)

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
#model.transform(df_fp1).show()

['like', 'see', 'people', 'deleted', 'think', 'good', 'time', 'know', 'reddit', 'way', 'much', 'well', 'want', 'game', 'deep', 'removed', 'lets', 'hole', 'comments', 'need', 'deep', 'see', 'use', 'hole', 'see', 'lets', 'see', 'though', 'rabbit', 'rabbit', 'lets', 'rabbit', 'lets', 'deep', 'rabbit', 'lets', 'deep', 'see', 'rabbit', 'lets', 'see']
+----------+----+
|     items|freq|
+----------+----+
|    [like]|1017|
|     [see]| 639|
|  [people]| 553|
| [deleted]| 529|
|   [think]| 510|
|    [good]| 504|
|    [time]| 469|
|    [know]| 424|
|  [reddit]| 367|
|     [way]| 367|
|    [much]| 346|
|    [well]| 336|
|    [want]| 327|
|    [game]| 288|
|    [deep]| 282|
| [removed]| 278|
|    [lets]| 273|
|    [hole]| 271|
|[comments]| 270|
|    [need]| 268|
+----------+----+
only showing top 20 rows

+----------+----------+----------+----+
|antecedent|consequent|confidence|lift|
+----------+----------+----------+----+
+----------+----------+----------+----+

