In [2]:
## Load context
# Import display/HTML from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
import re
import os
import pandas as pd
import numpy as np
import pyspark.sql.functions as f

from pyspark.sql import SparkSession
from pyspark import SparkConf

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Node name of the machine running this driver; it is embedded in the Spark
# application name below.
hostname = os.uname()[1]

# BEN'S MASTER: 192.168.2.87
# OUR MASTER:   192.168.2.203

# New API
# Configuration values are passed as strings, which is what SparkConf stores.
conf = (
    SparkConf()
    .setMaster("spark://192.168.2.203:7077")
    .setAppName(f"load_local_comments; hostname: {hostname}")
    .set("spark.executor.cores", "2")
    .set("spark.dynamicAllocation.enabled", "false")
    .set("spark.shuffle.service.enabled", "false")
)

# Reuses an already-running session if there is one, otherwise creates it.
spark_session = SparkSession.builder.config(conf=conf).getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

# A bare `spark_context.uiWebUrl` expression in the middle of a cell is
# evaluated and thrown away, so print the URL explicitly.
print(f'Spark UI: {spark_context.uiWebUrl}')
print(f'hostname for this machine: {hostname}')

hostname for this machine: host-192-168-2-247-ldsa


In [3]:
# Load the RC_2005-12 comment file from HDFS as JSON and cache it, because
# `df` is read again by later cells.
# (RDD alternative: spark_context.textFile("hdfs://192.168.2.203:9000/RC_2005-12"))
# NOTE(review): "header" looks like a CSV reader option; presumably it has no
# effect on JSON input -- confirm before removing.
df = (
    spark_session.read
    .option("header", "true")
    .json('hdfs://192.168.2.203:9000/RC_2005-12')
    .cache()
)

df.show()

+----------+----------------------+-----------------+--------------------+----------------+-----------+-------------+------+------+---+--------+---------+------------+-----+--------+----------+------------+---+
|    author|author_flair_css_class|author_flair_text|                body|controversiality|created_utc|distinguished|edited|gilded| id| link_id|parent_id|retrieved_on|score|stickied| subreddit|subreddit_id|ups|
+----------+----------------------+-----------------+--------------------+----------------+-----------+-------------+------+------+---+--------+---------+------------+-----+--------+----------+------------+---+
|      frjo|                  null|             null|A look at Vietnam...|               0| 1134365188|         null| false|     0|c13|t3_17863| t3_17863|  1473738411|    2|   false|reddit.com|        t5_6|  2|
|   zse7zse|                  null|             null|The site states "...|               0| 1134365725|         null| false|     0|c14|t3_17866| t3_17866|  

In [3]:
# Alternative for local runs only: load a sample file with pandas instead of reading from HDFS.

#df_pd = pd.read_json("sample_reddit_comments.json", lines=True)
#df = spark_session.createDataFrame(df_pd).cache()

In [4]:
from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.types import ArrayType, FloatType, StringType

In [5]:
# Drop the columns that are not wanted downstream. Several of these names do
# not appear in the schema printed for this file; the drop still succeeds.
df_fp = df.drop(
    'permalink',
    'gilded',
    'author_flair_css_class',
    'can_gild',
    'author_flair_text',
    'author_cakeday',
)
df_fp.printSchema()
df_fp.show()

root
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: boolean (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)

+----------+--------------------+----------------+-----------+-------------+------+---+--------+---------+------------+-----+--------+----------+------------+---+
|    author|                body|controversiality|created_utc|distinguished|edited| id| link_id|parent_id|retrieved_on|score|stickied| subreddit|subreddit_id|ups|
+----------+--------------------+----------------+-----------+-----------

In [6]:
#import nltk
#from nltk.corpus import stopwords
#print(stopwords.words('english'))

In [7]:
# Filter functions
def to_basket_unique(comment):
    """Turn a raw comment string into a list of unique lower-case tokens.

    Runs of non-word characters are treated as separators. The result is
    de-duplicated through a set, so its order is not defined.

    Args:
        comment: Comment text, or None (the `body` column is nullable).

    Returns:
        A list of distinct tokens. Empty for None, empty, or
        punctuation-only input.
    """
    # A null body would make re.sub raise TypeError inside the UDF.
    if not comment:
        return []
    # split() without an argument never yields empty strings, whereas
    # ''.split(' ') returns [''] and would add an empty-string item to the basket.
    tokens = re.sub(r'\W+', ' ', comment).lower().split()
    return list(set(tokens))
# Spark UDF wrapper around to_basket_unique; the result column is array<string>.
udf_to_basket_unique = F.udf(f=to_basket_unique, returnType=ArrayType(StringType()))


# Stop words removed from every basket. Built once as a frozenset so that
# filter_words does not rebuild the list, or scan it linearly, for every row.
_STOPWORDS = frozenset(
    # General English stop words.
    ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
     'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
     'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
     'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
     'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
     'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
     'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
     'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
     'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
     'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't",
     'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
     "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
     'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
     'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    # URL fragments, markup leftovers and filler words added on top.
    + ['https', 'www', 'one', 'would', 'come', 'really', 'also', 'com', 'gt', 'r', '737yli']
    + ['get', 'even', 'make', 'go', 'still', 'could', 'got', 'goes', '2', 'first', 'going', 'right', 'sure',
       'something']
)


def filter_words(basket):
    """Remove stop words from a basket of tokens.

    Matching is exact and case-sensitive, and the order of the surviving
    tokens is preserved.

    Args:
        basket: Iterable of token strings, or None.

    Returns:
        A list of the tokens that are not stop words. Empty for None or
        empty input.
    """
    # Guard against a null array reaching the UDF.
    if not basket:
        return []
    return [word for word in basket if word not in _STOPWORDS]
    
# Spark UDF wrapper around filter_words; the result column is array<string>.
udf_filter_words = F.udf(f=filter_words, returnType=ArrayType(StringType()))


In [8]:
# Replace each comment body with its filtered token basket and keep only
# that column, which is what FPGrowth is fitted on in a later cell.
df_fp1 = (
    df_fp
    .withColumn('body', udf_filter_words(udf_to_basket_unique('body')))
    .select('body')
)
df_fp1.take(10)

[Row(body=['look', 'myth', 'liberalisation', 'market', 'vietnam', 'mexico', 'exposes']),
 Row(body=['sign', 'competitive', 'new', 'pretty', 'meeting', 'levels', 'fom', 'want', 'proposals', 'store', 'states', 'honest', 'web', 'everything', 'prevent', 'specs', 'guarantee', 'sheets', 'data', 'multiple', 'reports', 'operators', 'stealing', 'site', 'gains', 'sites', 'accessing', 'encryption', 'like', 'use', 'breeed', 'prevents', 'good', 'useful', 'call', 'notes', 'much', 'setting', 'paranoid', 'etc', 'security', 'personal', 'technical', 'us']),
 Row(body=['frank', 'topics', 'wierzbicki', 'related', 'jython']),
 Row(body=['deleted']),
 Row(body=['far', 'tak', 'safari', 'extension', 'saft', 'best', 'onto']),
 Row(body=['deleted']),
 Row(body=['shots', 'simply', 'panoramic', 'take']),
 Row(body=['stuff', 'market', 'concept', 'beginning', 'phrases', 'copying', 'pasting', 'web', 'save', 'thatâ', 'surprise', 'client', 'came', 'cool', 'know', 'time', 'search', 'qube', 'donâ', 'taken', 'blossom', '

In [9]:
# Configure FP-growth over the token baskets in the `body` column.
fpGrowth = FPGrowth(
    itemsCol="body",
    minSupport=0.01,
    minConfidence=0.05,
)
# Mine the frequent itemsets and association rules used by the cells below.
model = fpGrowth.fit(df_fp1)

In [10]:
# Fetch the 30 most frequent itemsets and flatten their items into a single
# list. An item can appear more than once if it belongs to several itemsets.
top_itemset_rows = (
    model.freqItemsets
    .sort("freq", ascending = False)
    .select('items')
    .take(30)
)
extract_top = []
for itemset_row in top_itemset_rows:
    extract_top.extend(itemset_row.items)
print(extract_top)

['think', 'like', 'people', 'deleted', 'good', 'http', 'article', 'know', 'time', 'see', 'reddit', 'much', 'use', 'many', 'way', 'interesting', 'work', 'great', 'better', 'well', 'want', 'point', 'things', 'read', 'language', 'thing', 'paul', 'problem', 'lot', 'actually']


In [11]:
# Number of comments in the loaded dataset.
c = df.count()
print(f'Total size of dataset: {c} comments')

# Frequent itemsets, most frequent first.
(
    model.freqItemsets
    .sort("freq", ascending = False)
    .show()
)

# Association rules whose antecedent contains the chosen word, ordered by
# descending confidence.
antecedent = 'people'

(
    model.associationRules
    .filter(F.array_contains('antecedent', antecedent))
    .sort('confidence', ascending=False)
    .show()
)

# transform examines the input items against all the association rules and
# summarizes the consequents as the prediction.
#model.transform(df_fp1).show()

Total size of dataset: 1075 comments
+-------------+----+
|        items|freq|
+-------------+----+
|       [like]| 143|
|      [think]| 143|
|     [people]| 125|
|    [deleted]| 108|
|       [good]| 100|
|       [http]|  99|
|    [article]|  94|
|       [know]|  89|
|       [time]|  86|
|        [see]|  79|
|     [reddit]|  77|
|       [much]|  75|
|        [use]|  67|
|       [many]|  67|
|        [way]|  62|
|[interesting]|  62|
|       [work]|  61|
|      [great]|  59|
|     [better]|  58|
|       [well]|  57|
+-------------+----+
only showing top 20 rows

+---------------+----------+------------------+------------------+
|     antecedent|consequent|        confidence|              lift|
+---------------+----------+------------------+------------------+
| [work, people]|    [like]|               0.6|  4.51048951048951|
| [good, people]|   [think]|0.5666666666666667|  4.25990675990676|
| [people, like]|   [think]|0.4146341463414634|3.1170049462732385|
| [good, people]|    [like]|   