Strategy: Represent each post as a basket of hand-picked vocabulary terms, then mine frequent term sets and association rules from those baskets (market-basket analysis).

In [1]:
import json

import findspark
import pandas as pd

# findspark.init() has to run before anything from pyspark is imported: it
# locates the Spark installation and puts its Python libraries on sys.path.
# Importing pyspark first only works when pyspark is already importable, in
# which case findspark would not be needed at all.
findspark.init()

import pyspark
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession
from pyspark.sql.functions import split

# Show long cell values (e.g. multi-term itemsets) without truncation.
pd.options.display.max_colwidth = 500

# Single local Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("JD")
    .getOrCreate()
)

In [None]:
# Every line of the input file is one post; its comma-separated terms are
# split into the array column "items" (the basket for that post).
posts_file = "Posts_as_baskets_of_very_important_vocabulary.txt"
raw_lines = spark.read.text(posts_file)
data = raw_lines.select(split("value", ",").alias("items"))

# Total number of baskets; used later to turn frequencies into shares.
post_count = data.count()
print(post_count, "count of posts in our 5-year snapshot")

-----
Mining for frequent patterns in vocabulary usage

In [49]:
# Support threshold chosen empirically: with the ~100K-record data file,
# minSupport=0.01 (1/100) runs on a laptop, while 0.001 (1/1000) does not
# complete.
fp = FPGrowth(
    minSupport=0.01,
    minConfidence=0.8,
)
fpm = fp.fit(data)

In [50]:
# Frequent itemsets, sorted by descending frequency.
top_items = fpm.freqItemsets.orderBy("freq", ascending=False)

In [51]:
# Action step: toPandas() collects the sorted itemsets from Spark into a
# local pandas DataFrame (see the DAG for the work this triggers).
post_patterns = top_items.toPandas()

In [52]:
# The itemsets are sets of vocabulary terms, so expose the column as "words".
post_patterns = post_patterns.rename(columns={"items": "words"})

In [53]:
# (rows, columns) of the frequent-itemset table: one row per itemset.
post_patterns.shape

(12151, 2)

In [37]:
# Number of terms in each frequent itemset. Mapping len over the single
# "words" column gives the same values as the row-wise apply(axis=1), without
# building a Series object for every row.
post_patterns['numWords'] = post_patterns['words'].map(len)

In [38]:
# Share of all posts that contain the itemset, computed as one vectorised
# division instead of a row-wise apply. Despite the column name, the value is
# a fraction (freq / post_count), not a percentage.
post_patterns['freq_pct'] = post_patterns['freq'] / post_count

In [35]:
# Rows labelled 1 through 5 (.loc label slices include both endpoints).
# NOTE(review): row 0 -- the most frequent itemset after the descending sort --
# is skipped here; confirm that is intentional and not an off-by-one.
post_patterns.loc[1:5, ['words','freq','freq_pct']]

Unnamed: 0,words,freq,freq_pct
1,[good luck],4646,0.063994
2,[don know],3383,0.046597
3,[years ago],2938,0.040468
4,[feel like],2254,0.031046
5,[http www],2249,0.030978


In [39]:
# How many frequent itemsets exist for each itemset size.
post_patterns['numWords'].value_counts()

3    5371
2    4659
4    1658
1     799
5     135
6       1
Name: numWords, dtype: int64

In [None]:
# Association rules produced by the fitted model (fit with minConfidence=0.8),
# collected into pandas so the first few rules can be inspected.
assoc_rules = fpm.associationRules
rules = assoc_rules.toPandas()
rules.head()