Strategy: identify any frequent sequential patterns in where authors posted (where people post first, where they post next, and so on).

In [1]:
# Necessary imports
import findspark
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth",200)
import pyspark
from pyspark.sql import  Row, SparkSession
from pyspark.ml.fpm import PrefixSpan
from pyspark.sql.functions import *
import random

In [2]:
# Source files
# Text file of board sequences that is read back just before the Spark mining step
board_sequences = "BoardSequenceResults.txt"
# CSV of forum posts loaded into pandas in the next cell
raw_forum_posts = "FullForumHistory.csv"

In [None]:
# Read in data
data = pd.read_csv(raw_forum_posts)
# One row per post; distinct user names are counted with missing names excluded
post_count = data.shape[0]
unique_user_count = data['user_name'].nunique()
print("We have {} individual posts by {} unique users".format(post_count, unique_user_count))
data.head()

In [None]:
# Parse the post timestamps and put every post in chronological order
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values(by=['date'])
first_post = data['date'].iloc[0]
last_post = data['date'].iloc[-1]
print("Our posts span {} until {}".format(str(first_post), str(last_post)))

In [5]:
def truncate_boardname(boardname):
    """Collapse a board name into a single whitespace-free token.

    The token is made of the first three whitespace-separated words of
    ``boardname`` joined by underscores. Names with fewer than three words
    are padded as if the missing words were empty strings, so
    ``"General"`` becomes ``"General__"`` and ``"Off Topic"`` becomes
    ``"Off_Topic_"``.

    Parameters
    ----------
    boardname : str
        Human-readable board name. Any non-string value (for example the
        NaN pandas uses for a missing cell) is treated as an empty name.

    Returns
    -------
    str
        The shortened token, or ``'got_zero_len_boardname'`` when the name
        is missing, empty, or contains only whitespace.
    """
    if not isinstance(boardname, str):
        return 'got_zero_len_boardname'

    # split() with no argument discards leading/trailing/repeated whitespace,
    # so no "word" is ever empty and the empty-name case is actually reachable
    # (split(' ') always returns at least one element, even for '').
    board_words = boardname.split()
    if not board_words:
        return 'got_zero_len_boardname'

    # Pad to three slots, then keep the first three words.
    first_three = (board_words + ['', ''])[:3]
    return '_'.join(first_three)


In [6]:
# Shorten every post's board name into a single token (one call per row)
data['board_short'] = data['board_name'].map(truncate_boardname)

In [7]:
# Filter down dataset and groupby user
# Keep only the columns later cells read. board_name is retained because the
# per-user post count is taken from the length of its aggregated list.
board_postings = data.filter(['user_name','board_name','board_short','date'])
# Aggregate the filtered frame (not the full raw frame) into one list per column per user.
# pandas keeps the rows of each group in their existing order, so each list
# follows the current row order of `data`.
board_post_seqs_by_user = board_postings.groupby('user_name').agg(list)

In [8]:
# Total posts per user = length of that user's aggregated board_name list
board_post_seqs_by_user['cumu_posts'] = board_post_seqs_by_user['board_name'].map(len)

In [27]:
# What are the quantiles of users?
quantile_levels = [.01,.25,.5,.8,.9,.99]
board_post_seqs_by_user['cumu_posts'].quantile(quantile_levels)

0.01      1.0
0.25      1.0
0.50      3.0
0.80     11.0
0.90     30.0
0.99    349.0
Name: cumu_posts, dtype: float64

In [9]:
# For the 9%, we filter to ONLY those with 30 <= cumu_posts < 350
has_enough_posts = board_post_seqs_by_user['cumu_posts'] >= 30
below_upper_cutoff = board_post_seqs_by_user['cumu_posts'] < 350
board_post_seqs_by_user = board_post_seqs_by_user[has_enough_posts & below_upper_cutoff]

In [None]:
# (rows, columns) of the per-user frame; each row is one user
board_post_seqs_by_user.shape

In [None]:
# Spot check we're in date order
# Pick one user at random and print that user's (date, board) pairs in stored order.
audit = random.randrange(board_post_seqs_by_user.shape[0])
auditee = board_post_seqs_by_user.index[audit]
# `audit` is a row position, not an index label (the index holds user names),
# so select the row with .iloc rather than relying on Series[int] falling back
# to positional access, which pandas has deprecated.
audited_row = board_post_seqs_by_user.iloc[audit]
for pair in zip(audited_row['date'], audited_row['board_short']):
    print(auditee,pair)

In [33]:
# # Investigating head-of-thread posts versus replies
# def prepend_type_truncate_boardname(boardname,post_type):
#     if post_type == 'thread':
#         prepend = 'Th____'
#     else:
#         prepend = 'R____'
#     board_words = boardname.split(' ')
#     if len(board_words) == 0:
#         return 'got_zero_len_boardname'
#     elif len(board_words) == 1:
#         return prepend + '*' + board_words[0] + '_' + '_'
#     elif len(board_words) == 2:
#         return prepend + '*' + '_'.join([board_words[0],board_words[1]]) + '_'
#     else:
#         return prepend + '*' + '_'.join([board_words[0],board_words[1],board_words[2]])

In [13]:
# data['type_board_short'] = data.apply(lambda x: prepend_type_truncate_boardname(x.board_name,x.doc_type),axis=1)

In [14]:
# # Filter down dataset and groupby user
# board_postings = data.filter(['user_name','type_board_short','date'])
# board_post_seqs_by_user = data.groupby('user_name').agg(lambda x: list(x))

In [50]:
# # Spot check we're in date order 
# audit = random.choice(range(0,board_post_seqs_by_user.shape[0]))
# auditee = board_post_seqs_by_user.index[audit]
# for pair in zip(board_post_seqs_by_user.date[audit],board_post_seqs_by_user.type_board_short[audit]):
#     print(auditee,pair)

In [None]:
# Does above match raw file lookup, in order?
is_auditee_post = data['user_name'] == auditee
data.loc[is_auditee_post]

In [20]:
# # Peel off board sequences and write them out 
# board_seqs = np.array(board_post_seqs_by_user.type_board_short)
# with open("board_sequencesAndTypes.txt","w") as f:
#     f.write("\n".join(" ".join(map(str, x)) for x in board_seqs))

In [11]:
# Peel off board sequences and write them out
# One line per user: that user's board tokens separated by single spaces.
board_seqs = np.array(board_post_seqs_by_user.board_short)
sequence_lines = [" ".join(str(token) for token in user_seq) for user_seq in board_seqs]
with open("the9_board_sequences.txt","w") as f:
    f.write("\n".join(sequence_lines))

Begin mining for frequent sequential patterns in Spark

In [12]:
# Now start Spark
findspark.init()
# Reuse an existing local session if there is one, otherwise create it
spark = (
    SparkSession.builder
    .master("local")
    .appName("board_sequencer")
    .getOrCreate()
)
sc = spark.sparkContext

In [13]:
# Load the board sequences: one sequence per line, board tokens separated by single spaces.
# NOTE(review): this reads `board_sequences` ("BoardSequenceResults.txt" in the config cell),
# whereas the export cell writes "the9_board_sequences.txt" -- confirm which file is the intended input.
with open(board_sequences) as f:
    # Skip blank lines so they do not become a bogus one-item sequence [['']].
    lines = [line.strip() for line in f if line.strip()]

# Each line becomes a list of single-item lists (one inner list per board token),
# keyed by its position in `lines`. Empty tokens from doubled spaces are dropped.
seqs = {
    i: [[board] for board in line.split(' ') if board]
    for i, line in enumerate(lines)
}

In [14]:
# Wrap each stored sequence in a Row with a single `sequence` field
for j, itemsets in list(seqs.items()):
    seqs[j] = Row(sequence=itemsets)

In [15]:
# Flat tuple of the Row objects, in the dict's key order.
# Built in one pass; growing a tuple by repeated concatenation copies it on every step.
makey_framey = tuple(seqs.values())

In [16]:
# Hand the Row objects to Spark, then convert the resulting RDD into a DataFrame
rows_rdd = sc.parallelize(makey_framey)
df = rows_rdd.toDF()

In [17]:
# Each DataFrame row is one user's sequence, so the row count is the user count
user_count = df.count()
ready_message = "Ready to analyze {} users' board-posting sequences"
print(ready_message.format(user_count))

Ready to analyze 1062 users' board-posting sequences


In [None]:
# Peek at the start of the Spark DataFrame to confirm the `sequence` column looks right
df.head()

In [20]:
# Use PrefixSpan to look for frequent sequential patterns (which are like author itineraries in their forum journey)
maxPatternLength = 6 # The maximal length of sequential pattern we are seeking
minSupport = 0.05 # Pattern must occur in this proportion of user sequences, or gets ignored
# In other words, a sequential pattern appearing more than minSupport * numUsers will be output
prefixSpan = PrefixSpan(minSupport=minSupport, maxPatternLength=maxPatternLength)
# this is the action step
frequent_patterns = prefixSpan.findFrequentSequentialPatterns(df)
# Most frequent patterns first; cache because the result is reused below
output = frequent_patterns.sort("freq",ascending=False).cache()


In [21]:
# Bring the sorted pattern table back from Spark as a local pandas DataFrame
pdf = output.toPandas()

In [22]:
# Pattern length = number of elements in each row's `sequence`
pdf['pattern_length'] = pdf['sequence'].map(len)

In [23]:
# Write one CSV per pattern length (1 .. longest pattern found), holding each pattern and its frequency.
# With no patterns at all, max() is NaN and range() would raise, so treat that case as "nothing to write".
longest_pattern = int(pdf.pattern_length.max()) if len(pdf) else 0
for i in range(1, longest_pattern + 1):
    patterns_of_length_i = pdf.loc[pdf.pattern_length == i, ['sequence','freq']]
    patterns_of_length_i.to_csv(
        "Sequential_patterns_of_length_{}.csv".format(i),
        sep=",",
        columns=['sequence','freq'],
        index=False,
    )