# ASSOCIATION RULES: Snippets and Skeleton

In [None]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Variables
# NOTE(review): the string values below look like template placeholders; they
# presumably have to be replaced with the real paths/separator before running.
file_name= 'File_Name.csv'  # path of the transactions file opened by the loading cells
file_name_2 = 'File_Name_2.csv'  # second path; not referenced by the cells visible here
separator = 'Separator'  # string used to split each line of the file into items
random_state = 42  # seed given to np.random.seed below
target = 'Class_Target'  # not referenced by the cells visible here

# Directives
# IPython magic: render matplotlib figures inside the notebook
%matplotlib inline
# Seed NumPy's global random generator with random_state
np.random.seed(random_state)

In [None]:
# The file holds one transaction per line; the items of a transaction are
# separated by `separator`.
# Open the file in a context manager so that the handle is always closed,
# even when reading or splitting fails part-way through.
with open(file_name, mode = 'r') as file:
    # Strip the '\n' at the end of each line, then split the line into its items
    transactions = [line.strip('\n').split(separator) for line in file]
# Preview the first two transactions (the file must contain at least two lines)
print(f"First transaction:\t{transactions[0]}\n\nSecond transaction:\t{transactions[1]}")

In [None]:
# Use this when the CSV file has an index column that is not needed:
# reset_index(drop=True) installs the default integer index and discards the
# previous index instead of keeping it as a column.
basket = df.reset_index(drop=True)

In [None]:
# Use this when the CSV is rectangular (n_rows x m_columns, one transaction per
# row) and some of its cells are empty.

# Convert the data frame into a list of lists of strings. An empty cell (NaN)
# becomes the string 'nan'; that placeholder item is removed after the encoding.
# basket.values is materialised once instead of once per cell, and the row
# width comes from `basket` itself rather than from `df`, whose shape may differ.
records = [[str(value) for value in row] for row in basket.values]

# Generate a boolean dataframe with one row per transaction and one column per
# distinct item of the database
from mlxtend.preprocessing import TransactionEncoder
# Encode the transactions
encoder = TransactionEncoder()
encoded_transactions = encoder.fit_transform(records)
# Put the data in a dataframe (boolean values)
basket = pd.DataFrame(encoded_transactions.astype(bool), columns = encoder.columns_)
# Drop the placeholder column created by the empty cells. errors='ignore' keeps
# the cell working when the data had no empty cell and so no 'nan' column exists.
basket = basket.drop(columns = 'nan', errors = 'ignore')
basket.head()

In [None]:
# Turn the list of transactions into a 0/1 table with one row per transaction
# and one column per distinct item, then preview its first rows.
# For instance, the transactions
#   Item1;Item2
#   Item3
#   Item4;Item1;Item3
# are encoded as
#   TransactionNumber Item1   Item2   Item3   Item4
#   0                 1       1       0       0
#   1                 0       0       1       0
#   2                 1       0       1       1
from mlxtend.preprocessing import TransactionEncoder
encoder = TransactionEncoder()
# Learn the item vocabulary, then encode the very same transactions
encoded_transactions = encoder.fit(transactions).transform(transactions)
# Cast the boolean matrix to integers and label every column with its item
df = pd.DataFrame(data=encoded_transactions.astype(int), columns=encoder.columns_)
df.head()

In [None]:
from mlxtend.preprocessing import TransactionEncoder
def get_dataframe(file_name,separator):
    """Read a transactions file and one-hot encode it.

    Args:
        file_name: Path of a text file holding one transaction per line.
        separator: String that separates the items inside a line.

    Returns:
        A tuple ``(transactions, df)``: ``transactions`` is the list of item
        lists read from the file, ``df`` is a DataFrame with one row per
        transaction and one integer 0/1 column per distinct item.
    """
    # The context manager closes the file even if reading fails
    with open(file_name, mode = 'r') as file:
        # Strip the '\n' at the end of each line before splitting it into items
        transactions = [line.strip('\n').split(separator) for line in file]
    # Encode the transactions
    encoder = TransactionEncoder()
    encoded_transactions = encoder.fit_transform(transactions)
    # Put the data in a dataframe
    df = pd.DataFrame(encoded_transactions.astype(int), columns = encoder.columns_)
    return transactions,df

# Load and encode the file named by the notebook-level file_name / separator
transactions,df = get_dataframe(file_name,separator)

# Show the first two raw transactions (needs at least two lines in the file)
print(f"First transaction:\t{transactions[0]}\n\nSecond transaction:\t{transactions[1]}")
# Preview the first rows of the encoded dataframe
df.head()

In [None]:
# Delete the rows (transactions) containing fewer than n_items items
n_items = 2
# Number of items of each transaction = number of True cells in its row
items_per_transaction = basket.eq(True).sum(axis = 1)
# Indexes of the transactions to remove. The comparison is strict: with
# n_items = 2 only the single-item transactions are selected, as the name of
# the list says (the previous `<=` also removed the transactions holding
# exactly n_items items).
single_item_transactions = items_per_transaction[items_per_transaction < n_items].index.tolist()
# We can now drop those transactions from the dataframe
basket.drop(index = single_item_transactions, inplace = True)
single_item_transactions

In [None]:
# Find a value of min_support such that the apriori algorithm generates at least
# min_itemsets frequent itemsets with at least min_item_in_itemset items.
# Output the result with the message below

# Requirements
min_itemsets = 8
min_item_in_itemset = 2
# "Reasonable" range, scanned from the highest support value to the lowest
support_range = np.arange(0.1, 0.01, -0.01)

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# 0 doubles as the "nothing found" marker checked after the loop
min_support = 0
for s_value in support_range:
    print(f"Trying support value {s_value:.2f}")
    frequent_itemsets = apriori(df, min_support = s_value, use_colnames = True)
    # Count the frequent itemsets that contain at least min_item_in_itemset items
    itemsets_above_threshold = sum(len(itemset) >= min_item_in_itemset for itemset in frequent_itemsets.itemsets)
    if itemsets_above_threshold >= min_itemsets:
        # First support value of the scan that satisfies the requirement
        min_support = s_value
        break
if min_support == 0:
    print("No itemset found! Try again with a bigger range!")
else:
    # The message no longer carries the stray export artefact, and it says
    # "at least" to match the `>=` test used for the count
    print(f"I've selected min_support = {min_support:.2f}, which produced {len(frequent_itemsets)} itemsets, {itemsets_above_threshold} of which had at least {min_item_in_itemset} items")

In [None]:
# Same as above, but with the search wrapped in a function

# Requirements
min_itemsets = 8  # minimum number of qualifying frequent itemsets
min_item_in_itemset = 2  # minimum number of items in a qualifying itemset
# "Reasonable" range: candidate min_support values generated with a negative
# step, i.e. in decreasing order
support_range = np.arange(0.1, 0.01, -0.01)

def get_apriori_info(min_itemsets,min_item_in_itemset,support_range):
    """Scan ``support_range`` for the first min_support yielding enough itemsets.

    For every candidate value, in the order given, ``apriori`` is run on the
    notebook-level dataframe ``df`` and the frequent itemsets holding at least
    ``min_item_in_itemset`` items are counted. The scan stops at the first
    value for which that count reaches ``min_itemsets``.

    Args:
        min_itemsets: Minimum number of qualifying itemsets required.
        min_item_in_itemset: Minimum number of items of a qualifying itemset.
        support_range: Iterable of candidate min_support values.

    Returns:
        A tuple ``(min_support, itemsets_above_threshold)``. ``min_support`` is
        0 when no candidate satisfied the requirement; the count is the one of
        the last candidate tried, or 0 when ``support_range`` is empty.
    """
    min_support = 0
    # Defined up front so that an empty support_range returns (0, 0) instead of
    # raising UnboundLocalError at the return statement
    itemsets_above_threshold = 0
    for s_value in support_range:
        print(f"Trying support value {s_value:.2f}")
        frequent_itemsets = apriori(df, min_support = s_value, use_colnames = True)
        # Count the itemsets that contain at least <min_item_in_itemset> items
        itemsets_above_threshold = sum(len(itemset) >= min_item_in_itemset for itemset in frequent_itemsets.itemsets)
        if itemsets_above_threshold >= min_itemsets:
            min_support = s_value
            break

    return min_support,itemsets_above_threshold

In [None]:
# Scan the metric thresholds in the order of the range below and stop at the
# first one for which at least min_association_rule association rules are
# extracted from the frequent itemsets found

# Requirements
current_metric = "lift"
min_association_rule = 10
metric_threshold_range = np.arange(20, 0.01, -0.01)
# Values kept when no threshold of the range satisfies the requirement
association_rule_found = 0
min_metric_threshold = 0

for metric_value in metric_threshold_range:
    rules = association_rules(frequent_itemsets, metric=current_metric, min_threshold=metric_value)
    if len(rules) < min_association_rule:
        # Not enough rules at this threshold: try the next one
        continue
    min_metric_threshold = metric_value
    association_rule_found = len(rules)
    break

if association_rule_found != 0:
    print(f"I've selected metric {current_metric}  with metric_value = {metric_value:.2f}, which produced , {association_rule_found} association_rules")
else:
    print("No association rule! Try again with a bigger range!")

In [None]:
# Scan the thresholds in the given order and report the first one for which at
# least <min_association_rule> association rules are extracted from the
# frequent itemsets
def get_metric_info(metric_threshold_range,frequent_itemsets,metric,min_association_rule):
    """Return ``(threshold, rule_count)`` for the first threshold that qualifies.

    Each value of ``metric_threshold_range`` is passed, in order, to
    ``association_rules`` as ``min_threshold`` for ``metric``. The first value
    producing at least ``min_association_rule`` rules is returned together
    with the number of rules it produced; ``(0, 0)`` is returned when no value
    qualifies or the range is empty.
    """
    for threshold in metric_threshold_range:
        candidate_rules = association_rules(frequent_itemsets, metric=metric, min_threshold=threshold)
        rule_count = len(candidate_rules)
        if rule_count >= min_association_rule:
            return threshold, rule_count
    return 0, 0

# Candidate thresholds, generated with a negative step (decreasing order)
metric_threshold_range = np.arange(20, 0.01, -0.01)
min_association_rule = 10
min_metric_threshold,association_rule_found = get_metric_info(metric_threshold_range,frequent_itemsets,'lift',min_association_rule)
if association_rule_found == 0:
    print("No association rule! Try again with a bigger range!")
else:
    # Report the threshold returned by get_metric_info. `metric_value` is a
    # local of that function, so reading it here raised NameError or showed a
    # stale value left behind by another cell.
    print(f"I've selected metric_value = {min_metric_threshold:.2f}, which produced , {association_rule_found} association_rules")

In [None]:
# Show the first n_rule rules, ordered by descending confidence and, for equal
# confidence, by descending support
n_rule = 10
sorted_rules = (
    rules
    .sort_values(by=['confidence', 'support'], ascending=[False, False])
    .reset_index(drop=True)
)
sorted_rules.head(n=n_rule)

In [None]:
# Plot confidence and support for all the sorted rules found: DataFrame.plot
# draws one line per selected column against the row index.
# The trailing ';' suppresses the textual output of the cell in the notebook.
sorted_rules[['confidence','support']].plot(title='Association Rules');

In [None]:
# Scatter plot of the rules in the support/confidence plane, where the size of
# each marker grows with the lift of its rule.
# Base of the marker scaling, chosen empirically to obtain the best graphical
# effect
size_point = 1.4
# Marker size of each rule: size_point raised to the lift of the rule
s = [pow(size_point, lift_value) for lift_value in rules['lift']]

rules.plot.scatter(
    x='support',
    y='confidence',
    s=s,
    title='Association Rules (dot proportional to Lift)',
);

In [None]:
# Scatter plot of the sorted rules (confidence on x, support on y) in which
# every point is labelled with the index of its rule
fig = sorted_rules.plot.scatter(x='confidence', y='support', title='Association Rules')

# Walk the two columns in lockstep; the running counter equals the rule index
# because sorted_rules carries the default 0..n-1 index
for rule_number, (confidence, support) in enumerate(zip(sorted_rules['confidence'], sorted_rules['support'])):
    fig.annotate(text = rule_number, xy = (confidence, support))