In [1]:
## Load context
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import re
import os
import pandas as pd
import numpy as np
from time import time

from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.types import ArrayType, FloatType, StringType

hostname = os.uname()[1]

from pyspark.sql import SparkSession
from pyspark import SparkConf

# BEN'S MASTER: 192.168.2.87
# OUR MASTER:   192.168.2.203

# New API
# Assemble the Spark configuration one setting at a time; every setter updates
# `conf` itself, so the chained and the sequential form are interchangeable.
conf = SparkConf()
conf.setMaster("spark://192.168.2.203:7077")
conf.setAppName(f"load_local_comments; hostname: {hostname}")
conf.set("spark.executor.cores", 2)
conf.set("spark.dynamicAllocation.enabled", False)
conf.set("spark.shuffle.service.enabled", False)

#spark_context = SparkContext(conf = conf)  
# Create the session for `conf`, or reuse one that already exists in this kernel.
spark_session = SparkSession.builder.config(conf=conf).getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext
# A bare `spark_context.uiWebUrl` expression in the middle of a cell is evaluated
# and thrown away, so the URL is printed explicitly.
print(f'Spark UI: {spark_context.uiWebUrl}')
print(f'hostname for this machine: {hostname}')

hostname for this machine: host-192-168-2-247-ldsa


In [2]:
#df = spark_context.textFile("hdfs://192.168.2.203:9000/RC_2005-12")
# Fraction of rows to keep when sampling the input.
load_fraction = 0.01

# Read the RC_2010-01 JSON file from HDFS, keep a random sample drawn without
# replacement (fixed seed 1234), and cache the result for the cells below.
# No `header` option is set: it is a CSV reader option and has no effect on
# JSON input.
df = (
    spark_session.read
    .json('hdfs://192.168.2.203:9000/RC_2010-01')
    .sample(False, load_fraction, 1234)
    .cache()
)


# Number of rows that ended up in the sample.
sampled_count = df.count()
#df.show()

In [3]:
# Only local file

#df_pd = pd.read_json("sample_reddit_comments.json", lines=True)
#df = spark_session.createDataFrame(df_pd).cache()

In [4]:
### START TIMER

print(
    'Starting timer. '
    f'Fraction of dataset is {load_fraction} and sampled size is {sampled_count}'
)
# Reference timestamp; the END TIMER cell subtracts it from its own reading.
start_clock = time()

Starting timer. Fraction of dataset is 0.01 and sampled size is 28813


In [5]:
# Remove the listed columns from the sampled frame, then show the remaining
# schema and a preview of the rows.
df_fp = df.drop(
    'permalink', 'gilded', 'author_flair_css_class',
    'can_gild', 'author_flair_text', 'author_cakeday',
)
df_fp.printSchema()
df_fp.show()

root
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- removal_reason: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)

+--------+-------------------+--------------------+----------------+-----------+-------------+-----+------+-------+--------+----------+----------+--------------+------------+-----+------------+-------------+------------+---+
|archived| 

In [6]:
#import nltk
#from nltk.corpus import stopwords
#print(stopwords.words('english'))

In [7]:
# Filter functions
def to_basket_unique(comment):
    """Split a comment into a list of unique, lower-cased word tokens.

    Every run of non-word characters is treated as a separator, the text is
    lower-cased, and duplicate tokens are removed.

    Args:
        comment: The comment text, or None for a missing body.

    Returns:
        A list of distinct tokens in unspecified order (it comes from a set).
        An empty list is returned for None, for an empty string, and for text
        that contains no word characters.
    """
    if comment is None:
        # `body` is a nullable column; a missing comment yields an empty basket
        # instead of a TypeError inside the UDF.
        return []
    # split() without arguments never produces empty strings, unlike split(' '),
    # which returns [''] for empty input and would add '' to the basket.
    return list(set(re.sub(r'\W+', ' ', comment).lower().split()))
# Wrap the tokenizer as a Spark UDF whose declared return type is array<string>.
udf_to_basket_unique = F.udf(to_basket_unique, ArrayType(StringType()))


# Words removed from every basket. Built once when the cell runs instead of on
# every call, and stored as a frozenset so membership tests are constant-time.
_STOPWORDS = frozenset(
    # Common English function words.
    ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    # Additional tokens excluded for this analysis (URL fragments, filler words).
    + ['https', 'www', 'one', 'would', 'come', 'really', 'also', 'com', 'gt', 'r', '737yli']
    + ['get', 'even', 'make', 'go', 'still', 'could', 'got', 'goes', '2', 'first', 'going', 'right', 'sure', 'something', 'http', 'well', 'back', 'though']
)


def filter_words(basket):
    """Return the words of `basket` that are not stopwords.

    Args:
        basket: An iterable of word strings, or None.

    Returns:
        A new list with the non-stopword entries of `basket` in their original
        order. The comparison is exact (case-sensitive). None yields an empty
        list.
    """
    if basket is None:
        # A null array coming from Spark yields an empty basket instead of a
        # TypeError inside the UDF.
        return []
    return [word for word in basket if word not in _STOPWORDS]
    
# Wrap the stopword filter as a Spark UDF whose declared return type is array<string>.
udf_filter_words = F.udf(filter_words, ArrayType(StringType()))


In [8]:
# Replace each comment body with its filtered word basket, keep only that
# column, and drop baskets that hold 5 words or fewer.
df_fp1 = (
    df_fp
    .withColumn('body', udf_filter_words(udf_to_basket_unique('body')))
    .select('body')
    .filter(F.size(F.col('body')) > 5)
)
df_fp1.take(10)

[Row(body=['cat', 'dog', 'consider', 'instead', 'realizes', 'truth']),
 Row(body=['person', 'least', 'things', 'involving', 'particular', 'cliche', 'dozens', 'specific', 'picking', 'genres', 'event', 'forced', 'oranges', 'sex', 'like', 'man', 'love', 'happen', 'comparing', 'interesting', 'much', 'surely', 'apples', 'need', 'pick', 'different', 'people', 'select', 'absolute']),
 Row(body=['assume', 'thing', 'want', 'plane', 'however', 'contact', 'al', 'captured', 'qaeda', 'terrorists', 'think', 'lost']),
 Row(body=['interesting', 'create', 'grep', 'folder', 'searching', 'index', 'solution']),
 Row(body=['fine', 'protecting', 'seem', 'based', 'given', 'protect', 'matter', 'morals', 'create', 'expect', 'wrong', 'ground', 'sort', 'awareness', 'know', 'murder', 'never', 'self', 'learn', 'long']),
 Row(body=['someone', 'downvotes', 'act', 'clueless', 'start', 'banning', 'write', 'fight', 'previous', 'distribute', 'script', 'greasemonkey', 'mass', 'like', 'fascists', 'op', 'soze', 'mods', 'po

In [9]:
# Configure FP-Growth over the word baskets in `body`, with both the support
# and the confidence threshold set to 0.01, then fit it on the filtered baskets.
fpGrowth = FPGrowth(
    itemsCol="body",
    minSupport=0.01,
    minConfidence=0.01,
)
model = fpGrowth.fit(df_fp1)

In [10]:
# Take the 30 itemsets with the highest frequency and flatten their items into
# one list of words (itemsets with several items contribute several words).
extract_top = [
    word
    for row in model.freqItemsets.sort("freq", ascending=False).select('items').take(30)
    for word in row["items"]
]
print(extract_top)

['like', 'people', 'think', 'time', 'know', 'good', 'much', 'see', 'way', 'want', 'say', 'thing', 'never', 'actually', 'use', 'work', 'things', 'pretty', 'need', 'take', 'years', 'us', 'lot', 'someone', 'better', 'said', 'many', 'point', 'look', 'around']


In [11]:
# Count
# `df` is the sampled, cached frame created in the load cell, so this number is
# the size of the sample, not of the full input file.
c = df.count()
print(f'Size of sampled dataset: {c} comments')

# Display frequent itemsets.
model.freqItemsets.sort("freq", ascending = False).show()


# transform examines the input items against all the association rules and summarize the
# consequents as prediction
#model.transform(df_fp1).show()

Total size of dataset: 28813 comments
+----------+----+
|     items|freq|
+----------+----+
|    [like]|2885|
|  [people]|2123|
|   [think]|1894|
|    [time]|1539|
|    [know]|1461|
|    [good]|1331|
|    [much]|1243|
|     [see]|1128|
|     [way]|1116|
|    [want]|1005|
|     [say]| 941|
|   [thing]| 871|
|   [never]| 867|
|[actually]| 850|
|     [use]| 813|
|    [work]| 803|
|  [things]| 763|
|  [pretty]| 739|
|    [need]| 726|
|    [take]| 722|
+----------+----+
only showing top 20 rows



In [12]:
# Display generated association rules.
antecedent = 'think'

# Keep the rules whose antecedent array contains the chosen word and list them
# from highest to lowest confidence.
(
    model.associationRules
    .filter(F.array_contains('antecedent', antecedent))
    .sort('confidence', ascending=False)
    .show()
)

+---------------+----------+-------------------+------------------+
|     antecedent|consequent|         confidence|              lift|
+---------------+----------+-------------------+------------------+
|[think, people]|    [like]| 0.3843058350100604| 2.272000111588073|
|  [think, like]|  [people]| 0.3715953307392996|2.9853650311302373|
|        [think]|    [like]| 0.2713833157338965|1.6044068745779339|
|        [think]|  [people]| 0.2624076029567054|2.1081601865424244|
|        [think]|    [time]| 0.1525871172122492|1.6910499487798067|
|        [think]|    [know]| 0.1494192185850053|1.7443492075194047|
|        [think]|    [good]|0.13463569165786696| 1.725279005947843|
|        [think]|    [much]|0.13357972544878563|1.8329330629561447|
|        [think]|     [way]|  0.133051742344245| 2.033450284429608|
|        [think]|     [see]|0.11562829989440337|1.7483654991125388|
|        [think]|    [want]|0.11351636747624076| 1.926502650422649|
|        [think]|  [things]|0.10929250263991552|

In [13]:
### END TIMER

# Seconds elapsed between the START TIMER cell and this point.
end_clock = time()
runtime = end_clock - start_clock

print(
    f'Run finished. Experiment run on {sampled_count} comments. '
    f'Runtime resulted in {runtime} seconds.'
)

Run finished. Experiment run on 28813 comments. Runtime resulted in 35.32005286216736 seconds.
