# Strategy: find frequent patterns in concepts discussed in posts

In [113]:
# Necessary imports
import findspark
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
pd.set_option('max_colwidth',300)

In [67]:
# Input file with the concept baskets (one basket of concepts per post).
# It is generated by the notebook Represent_posts_as_frequent_concepts_by_cluster.ipynb.
# The loading cell further down reads it line by line and splits each line on commas.
concept_baskets_for_analysis = "posts_as_baskets_of_concepts.txt"

In [92]:
# Lookup table that translates a cluster id into a cluster name
# (the name is roughly the list of the cluster's main concepts).
lookup = "Cluster_lookup_42.csv"
cluster_lookup = pd.read_csv(
    lookup,
    header=None,             # the file carries no header row
    index_col=0,             # first column is used as the index
    names=['cluster_name'],  # the remaining column holds the readable name
)

## Begin mining for frequent patterns and association rules in Spark

In [133]:
# Spark start-up: let findspark set up the environment, then get (or create)
# a local session named "ConceptPatterns".
findspark.init()

spark = (
    SparkSession.builder
    .master("local")
    .appName("ConceptPatterns")
    .getOrCreate()
)

In [134]:
# Read the basket file as plain text (one row per line, in a column named "value"),
# then split every line on commas into an array column called "items".
baskets = spark.read.text(concept_baskets_for_analysis).select(
    split("value", ",").alias("items")
)

In [135]:
# Sanity-check the loaded baskets: schema, number of posts, and one sample row.
baskets.printSchema()
# Number of baskets (= posts); used later as the denominator for freq_pct.
total_posts = baskets.count()
# Keep the sample row as the cell's LAST expression: in the middle of a cell its
# result was computed by Spark and then silently discarded instead of displayed.
baskets.head()

root
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [339]:
# Configure and fit the FP-Growth miner on the concept baskets.
# NOTE(review): the original note on this cell says the operation graph is only
# defined here, NOT executed; confirm how much Spark work fit() itself triggers.
min_support = 0.005      # minSupport passed to FPGrowth
min_confidence = 0.66    # minConfidence passed to FPGrowth (also reported with the rules)

fp = FPGrowth(minSupport=min_support, minConfidence=min_confidence)
fpm = fp.fit(baskets)

# Frequent itemsets, most frequent first.
top_items = fpm.freqItemsets.orderBy("freq", ascending=False)

In [340]:
# BEWARE: This is the Spark action step, where the tree is grown
# toPandas() pulls the ordered frequent itemsets out of Spark into a local
# pandas DataFrame, so this cell is where the waiting happens.
concept_patterns = top_items.toPandas()

In [341]:
# Quick analysis of patterns and counts
# Adds three derived columns to concept_patterns:
#   numConcepts   - how many concepts the pattern contains
#   concept_names - readable cluster names for the concept ids
#   freq_pct      - share of all posts (in percent) that contain the pattern
concept_patterns = concept_patterns.rename(columns={"items": "concepts"})
print("Detected {} patterns".format(concept_patterns.shape[0]))

# Column-wise operations instead of DataFrame.apply(axis=1): they are cheaper and
# still yield a Series (not a DataFrame) when no pattern was found at all.
concept_patterns['numConcepts'] = concept_patterns['concepts'].apply(len)
# NOTE(review): .iloc is a POSITIONAL lookup; it is only correct while row i of the
# lookup file describes cluster id i - confirm, or switch to a label lookup.
concept_patterns['concept_names'] = concept_patterns['concepts'].apply(
    lambda ids: [cluster_lookup.iloc[int(cluster_id)].cluster_name for cluster_id in ids]
)
concept_patterns['freq_pct'] = 100 * concept_patterns['freq'] / total_posts

# Distribution of pattern sizes. sort_index() must NOT be called with inplace=True
# here: that sorts a temporary and returns None, so nothing was ever displayed.
concept_patterns.numConcepts.value_counts().sort_index()

Detected 4580 patterns


In [342]:
# Peek at the first rows; the itemsets were ordered by freq (descending) before
# being collected, so these are the most frequent patterns.
concept_patterns.head()

Unnamed: 0,concepts,freq,numConcepts,concept_names,freq_pct
0,[16],156618,1,[ Would_could_may],65.203707
1,[7],133733,1,[ Know_think_express_recall],55.676151
2,"[7, 16]",97941,2,"[ Know_think_express_recall, Would_could_may]",40.775111
3,[38],82259,1,[ #_digit_s],34.24633
4,[33],65777,1,[ Pain_fatigue_bodypart],27.384491


In [None]:
# Finally, these are the association rules that were derived
# The rules are collected into pandas and enriched with:
#   numAntecedents      - number of concepts on the left-hand side of the rule
#   antecedent_concepts - readable cluster names of the antecedent ids (list)
#   consequent_concepts - readable cluster names of the consequent ids, stored as the
#                         str() of the list so the column can be grouped and searched
assoc_rules = fpm.associationRules
rules = assoc_rules.toPandas()

# Column-wise apply instead of DataFrame.apply(axis=1): on an empty rule set the
# row-wise form returns a DataFrame and the column assignment fails.
# NOTE(review): .iloc is a POSITIONAL lookup; it is only correct while row i of the
# lookup file describes cluster id i - confirm.
rules['numAntecedents'] = rules['antecedent'].apply(len)
rules['antecedent_concepts'] = rules['antecedent'].apply(
    lambda ids: [cluster_lookup.iloc[int(cluster_id)].cluster_name for cluster_id in ids]
)
rules['consequent_concepts'] = rules['consequent'].apply(
    lambda ids: str([cluster_lookup.iloc[int(cluster_id)].cluster_name for cluster_id in ids])
)

num_rules = len(rules)
print("{:d} association rules were generated below for your inspection.".format(num_rules))
print("Given presence of the antecedant, the confidence level listed says we'll see the consequent as well.")
print("Minimum acceptable confidence was set to {:.1%}.".format(min_confidence))
print("ASSOCIATION RULES ******************************")

# Longest antecedents first, and within equal length the most confident rules first.
rules = rules.sort_values(by=['numAntecedents','confidence'], ascending=False)
rules[['antecedent_concepts','consequent_concepts','confidence','lift']].head()


In [344]:
# Number of rules ("scenarios") per consequent and antecedent count.
(
    rules
    .groupby(['consequent_concepts','numAntecedents'])['lift']
    .count()
    .to_frame('scenarios')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,scenarios
consequent_concepts,numAntecedents,Unnamed: 2_level_1
[' #_digit_s'],1,1
[' #_digit_s'],2,67
[' #_digit_s'],3,346
[' #_digit_s'],4,537
[' #_digit_s'],5,310
[' #_digit_s'],6,70
[' #_digit_s'],7,2
[' Know_think_express_recall'],1,15
[' Know_think_express_recall'],2,257
[' Know_think_express_recall'],3,645


In [345]:
# Distinct consequents, from the one reached by the fewest rules to the one reached
# by the most (value_counts() is descending, hence the reversal).
# Each label is the str() of a one-element list such as "[' Name']"; dropping the
# first three and the last two characters leaves the bare cluster name.
association_consequents = [
    label[3:-2]
    for label in rules.consequent_concepts.value_counts().index[::-1]
]

In [346]:
# Display the bare consequent names extracted in the previous cell.
association_consequents

['Singular_doc_or_HCP',
 'Pain_fatigue_bodypart',
 'Plural_time_units',
 '#_digit_s',
 'Know_think_express_recall',
 'Would_could_may']

In [None]:
# Rules that lead to docs
rules[rules.consequent_concepts.str.contains('HCP')]\
    [['antecedent_concepts','confidence','lift']]

In [None]:
# Rules that lead to pain
rules[rules.consequent_concepts.str.contains('Pain')]\
    [['antecedent_concepts','confidence','lift']]

In [351]:
# For every consequent concept, report how many antecedent sets lead to it and
# export those rules to Concept_mining_<concept>.csv in the working directory.
for concept_cluster in association_consequents:
    # Filter once per concept and reuse the result for both the count and the export.
    # regex=False matches the cluster name literally, so a name containing regex
    # metacharacters (e.g. '+', '(' or '.') can neither mis-match nor raise.
    matching_rules = rules.loc[
        rules.consequent_concepts.str.contains(concept_cluster, regex=False),
        ['antecedent_concepts','confidence','lift'],
    ]
    print(f"{matching_rules.shape[0]} sets of antecedents associate with *{concept_cluster}*")
    matching_rules.to_csv(f"Concept_mining_{concept_cluster}.csv",header=True,index=False)

204 sets of antecedents associate with *Singular_doc_or_HCP*
306 sets of antecedents associate with *Pain_fatigue_bodypart*
589 sets of antecedents associate with *Plural_time_units*
1333 sets of antecedents associate with *#_digit_s*
2027 sets of antecedents associate with *Know_think_express_recall*
2199 sets of antecedents associate with *Would_could_may*
