In [118]:
import pandas as pd

In [119]:
# TASK 1: Warm Up

df = pd.read_csv('www.csv', sep='\t')
df.loc[:, df.isnull().any()]
print('Number of rows before removing duplicates:', len(df)) 
df = df.drop_duplicates('Query', keep='first') 
df.reset_index(drop=True, inplace=True) 
print('Number of rows after removing duplicates:', len(df))
print("Number of attributes without the the 'UserId' and the 'Query' attributes:", len(df.columns) - 2)

Number of rows before removing duplicates: 9999
Number of rows after removing duplicates: 9816
Number of attributes without the the 'UserId' and the 'Query' attributes: 500


In [120]:
# Task 2: Generation of Frequent Itemsets

import pyfpgrowth

associations = df['Query'].apply(lambda x: x.split()).tolist()
num_records = len(associations)
print(f'Number of records: {num_records}')

# 2.1

# 2.2
sigma = 100.0
min_support = sigma / num_records
patterns = pyfpgrowth.find_frequent_patterns(associations, sigma) 

patterns = dict(sorted(patterns.items(), key=lambda item: item[1], reverse=True))
print(patterns)

num_frequent_itemsets = len(patterns) 
max_itemset_size = max(len(itemset) for itemset in patterns) 
 
print(f'Number of frequent itemsets: {num_frequent_itemsets}') 
print(f'Maximum size of frequent itemsets: {max_itemset_size}')

support = {key: value / len(df) for key, value in patterns.items()} 

items_with_min_support = [itemset for itemset, sup in support.items() if sup >= min_support]
print(f'Number of itemsets with support >= {min_support:.4f}: {len(items_with_min_support)}')

Number of records: 9816
{('of',): 955, ('in',): 844, ('for',): 553, ('and',): 546, ('to',): 426, ('a',): 340, ('county',): 237, ('on',): 232, ('free',): 225, ('http',): 224, ('new',): 222, ('of', 'the'): 207, ('how',): 178, ('lyrics',): 175, ('school',): 154, ('florida',): 136, ('city',): 132, ('how', 'to'): 130, ('what',): 128, ('state',): 123, ('is',): 118, ('you',): 117, ('i',): 117, ('in', 'the'): 117, ('home',): 113, ('my',): 110, ('with',): 104, ('high',): 101}
Number of frequent itemsets: 28
Maximum size of frequent itemsets: 2
Number of itemsets with support >= 0.0102: 28


In [121]:
# Task 3: Effect of Support

min_support_values = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 
0.009, 0.01] 

for min_support in min_support_values:  
    support_count = min_support * num_records  
    patterns = pyfpgrowth.find_frequent_patterns(associations, support_count) 
 
    support = {key: value / len(df) for key, value in patterns.items()} 
    print(f"Minimum support: {min_support}, Support count: {support_count:.0f}") 
    print(f'Found {len(patterns)} patterns') 
    print('Patterns', patterns) 
    print('Support', support)     
    print()




Minimum support: 0.001, Support count: 10
Found 2281 patterns
Patterns {('value',): 10, ('fair',): 10, ('plan',): 10, ('poker',): 10, ('flower',): 10, ('case',): 10, ('background',): 10, ('block',): 10, ('fast',): 10, ('buffalo',): 10, ('chicken',): 10, ('but',): 10, ('ocean',): 10, ('places',): 10, ('camps',): 10, ('atlantic',): 10, ('charles',): 10, ('brooklyn',): 10, ('here',): 10, ('mother',): 10, ('sign',): 10, ('create',): 10, ('maps',): 10, ('arkansas',): 10, ('smith',): 10, ('fucking',): 10, ('treatment',): 10, ('b',): 10, ('catholic',): 10, ('francisco',): 10, ('course',): 10, ('open',): 10, ('signs',): 10, ('coast',): 10, ('into',): 10, ('into', 'the'): 11, ('boat',): 10, ('learn',): 10, ('legal',): 10, ('zip',): 10, ('taylor',): 10, ('drive',): 10, ('maine',): 10, ('review',): 10, ('poems',): 10, ('models',): 10, ('ball',): 10, ('dr',): 10, ('own',): 10, ('support',): 10, ('kiss',): 10, ('stone',): 10, ('flash',): 10, ('7',): 10, ('regional',): 10, ('vacation',): 10, ('army'

In [122]:
# TASK 4: Generating the Rules

min_support = 0.04
support_count = 39
confidence = 0.8 
 
patterns = pyfpgrowth.find_frequent_patterns(associations, support_count) 
print("Patterns", patterns)
rules = pyfpgrowth.generate_association_rules(patterns, confidence) 
print("Rules", rules)
print(f'Number of rules generated: {len(rules)}')



Patterns {('inn',): 39, ('union',): 39, ('st.',): 39, ('washington',): 40, ('make',): 40, ('code',): 40, ('nj',): 40, ('song',): 40, ('myspace',): 41, ('west',): 41, ('auto',): 41, ('up',): 42, ('vegas',): 42, ('line',): 42, ('la',): 42, ('public',): 42, ('jersey',): 42, ('community',): 42, ('schools',): 42, ('picture',): 42, ('are',): 42, ('company',): 42, ('nc',): 42, ('wedding',): 43, ('carolina',): 43, ('com',): 44, ('air',): 44, ('or',): 44, ('book',): 44, ('de',): 44, ('blue',): 45, ('south',): 45, ('national',): 45, ('las',): 45, ('map',): 45, ('best',): 46, ('tax',): 46, ('store',): 46, ('white',): 46, ('one',): 46, ('church',): 46, ('old',): 47, ('hotels',): 47, ('island',): 47, ('me',): 48, ('movie',): 48, ('jobs',): 48, ('lake',): 48, ('bank',): 49, ('michigan',): 49, ('chicago',): 49, ('st',): 49, ('phone',): 50, ('pa',): 50, ('it',): 50, ('department',): 51, ('games',): 52, ('hospital',): 52, ('homes',): 53, ('girls',): 53, ('north',): 54, ('california',): 54, ('hotel',): 

In [123]:
# TASK 5: Impact of Confidence) 

min_confidence_range = [0.1, 0.3, 0.5, 0.7, 0.9]  
 
for min_conf in min_confidence_range: 
    rules = pyfpgrowth.generate_association_rules(patterns, min_conf) 
    num_rules = len(rules) 
    print(f"Minimum confidence: {min_conf}") 
    print(f"Number of generated rules: {num_rules}") 
    print()

Minimum confidence: 0.1
Number of generated rules: 32

Minimum confidence: 0.3
Number of generated rules: 27

Minimum confidence: 0.5
Number of generated rules: 19

Minimum confidence: 0.7
Number of generated rules: 16

Minimum confidence: 0.9
Number of generated rules: 12



In [124]:
# TASK 6: rule interpretation

min_support = 0.001
min_conf = 0.01

support_count = min_support * num_records  
patterns = pyfpgrowth.find_frequent_patterns(associations, support_count) 
patterns = dict(sorted(patterns.items(), key=lambda item: item[1], reverse=True))
rules = pyfpgrowth.generate_association_rules(patterns, min_conf) 
rules = dict(sorted(rules.items(), key=lambda item: item[1][1], reverse=True))

support = {key: value / len(df) for key, value in patterns.items()} 
print(f"Minimum support: {min_support}, Support count: {support_count:.0f}") 
print(f'Found {len(patterns)} patterns') 
print(f'Number of rules generated: {len(rules)}')
print("Rules", rules)
print('Patterns', patterns) 
print('Support', support)     
print()

# Assuming 'patterns' and 'rules' have been generated as in your previous code

# Create a new list to store the rules with all their metrics
rules_with_confidence = []

for antecedent, (consequent, lift) in rules.items():
    # The count of the antecedent (A)
    antecedent_count = patterns.get(antecedent)
    
    # Combine the antecedent and consequent to form the full itemset (A U B)
    # IMPORTANT: The keys in the 'patterns' dictionary are sorted tuples.
    # You must sort the combined itemset before looking it up.
    combined_itemset = tuple(sorted(antecedent + consequent))
    
    # The count of the antecedent and consequent together (A U B)
    combined_count = patterns.get(combined_itemset)
    
    if antecedent_count and combined_count:
        # Calculate confidence
        confidence = combined_count / antecedent_count
        
        rules_with_confidence.append({
            'antecedent': antecedent,
            'consequent': consequent,
            'confidence': confidence,
            'lift': lift,
            'antecedent_support_count': antecedent_count,
            'combined_support_count': combined_count
        })

rules_with_confidence = sorted(rules_with_confidence, key=lambda x: x['confidence'], reverse=True)

for rule in rules_with_confidence:
    print(f"Rule: {rule['antecedent']} -> {rule['consequent']}")
    print(f"  Confidence: {rule['confidence']:.4f}")
    print(f"  Lift: {rule['lift']:.2f}\n")


Minimum support: 0.001, Support count: 10
Found 2281 patterns
Number of rules generated: 1494
Rules {('and', 'i', 'the', 'to', 'to', 'you'): (('your',), 12.0), ('i', 'i', 'to', 'to', 'to', 'you'): (('the',), 4.5), ('and', 'i', 'in', 'of', 'the', 'the'): (('your',), 4.0), ('and', 'i', 'the', 'the', 'you'): (('your',), 4.0), ('and', 'in', 'the', 'the', 'you'): (('your',), 4.0), ('and', 'i', 'in', 'of', 'the', 'to', 'to'): (('your',), 4.0), ('and', 'in', 'the', 'to', 'to', 'you'): (('your',), 4.0), ('i', 'in', 'the', 'to', 'to', 'you'): (('your',), 4.0), ('a', 'a', 'the', 'was'): (('of', 'who'), 4.0), ('and', 'and', 'i', 'in', 'of'): (('the', 'to'), 4.0), ('a', 'and', 'and', 'i', 'in'): (('the', 'to'), 4.0), ('is', 'the', 'the', 'the'): (('your',), 4.0), ('i', 'of', 'to', 'to', 'to', 'you'): (('the',), 3.857142857142857), ('i', 'i', 'the', 'the', 'you'): (('to',), 3.6666666666666665), ('and', 'i', 'i', 'that'): (('the', 'to'), 3.6), ('i', 'of', 'the', 'the', 'you'): (('to',), 3.2941176470