### Functions for Cleaning, Feature Selection, and Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

def load_transaction_data(trans_file_path):
    """Read the headerless transaction CSV and keep only usable purchase rows.

    Args:
        trans_file_path: Location of the transaction file (anything
            ``pandas.read_csv`` accepts). The file is read without a header
            row; the thirteen column names are assigned here.

    Returns:
        DataFrame holding the rows whose ``stype`` is ``'P'`` and whose
        ``amt`` and ``orgprice`` are both strictly greater than 1, with the
        ``interID``, ``stype``, ``mic`` and ``unknown`` columns removed.
    """
    column_names = [
        'sku', 'storeid', 'register', 'trannum', 'interID', 'saledate', 'stype',
        'quantity', 'orgprice', 'amt', 'seq', 'mic', 'unknown',
    ]
    raw = pd.read_csv(trans_file_path, names=column_names)

    is_purchase = raw['stype'] == 'P'
    # NOTE(review): the cut-off is "> 1", which also discards amounts in
    # (0, 1] -- confirm this is intended rather than "> 0".
    amount_ok = raw['amt'] > 1
    price_ok = raw['orgprice'] > 1
    purchases = raw[is_purchase & amount_ok & price_ok]

    # These columns are not used by any later step of the pipeline.
    return purchases.drop(columns=['interID', 'stype', 'mic', 'unknown'])

def load_sku_data(skst_file_path):
    """Read the headerless SKU/store price file and derive a per-SKU mean retail.

    Args:
        skst_file_path: Location of the SKU/store file (anything
            ``pandas.read_csv`` accepts). The file is read without a header
            row; the five column names are assigned here.

    Returns:
        A ``(skst, mean_retail)`` tuple:

        * ``skst`` -- the file contents without the ``unknown`` column; the
          ``retail`` values are left exactly as read.
        * ``mean_retail`` -- Series indexed by ``sku`` with the mean of the
          SKU's non-zero ``retail`` values, or 0 when the SKU has no
          non-zero retail value at all.
    """
    skst = pd.read_csv(skst_file_path, names=['sku', 'storeid', 'cost', 'retail', 'unknown'])

    # Drop irrelevant columns
    skst = skst.drop(columns=['unknown'])

    # Mask zeros *before* averaging so they are skipped by mean(). Replacing
    # zeros after the mean (as was done previously) only turned a zero mean
    # into NaN and straight back into 0, leaving zero prices inside the mean.
    nonzero_retail = skst['retail'].replace(0, np.nan)
    mean_retail = nonzero_retail.groupby(skst['sku']).mean().fillna(0)

    return skst, mean_retail

def merge_dataframes(trans, skst, mean_retail):
    """Attach SKU/store prices to the transactions and patch missing prices.

    Args:
        trans: Transaction rows; must provide ``sku``, ``storeid`` and
            ``orgprice``.
        skst: SKU/store rows joined on ``sku`` and ``storeid``; its ``retail``
            column is the one patched below.
        mean_retail: Mapping (e.g. a Series indexed by SKU) from SKU to its
            mean retail price.

    Returns:
        The left-joined DataFrame. A missing ``retail`` is replaced by the
        SKU's mean retail and, failing that, by the row's original
        ``orgprice``. A missing ``orgprice`` is replaced by the SKU's mean
        retail.
    """
    merged = trans.merge(skst, how='left', on=['sku', 'storeid'])

    # Per-row SKU-level fallback price, shared by both fills.
    sku_level_price = merged['sku'].map(mean_retail)

    # 'retail' falls back on 'orgprice' as it was before that column is patched.
    patched_retail = merged['retail'].fillna(sku_level_price)
    merged['retail'] = patched_retail.fillna(merged['orgprice'])
    merged['orgprice'] = merged['orgprice'].fillna(sku_level_price)

    return merged

def feature_engineering(trans):
    """Add calendar and discount features to the transaction frame, in place.

    The frame passed in is modified and also returned. It must provide
    ``saledate``, ``amt``, ``retail`` and ``orgprice``.

    Columns written:
        * ``saledate`` -- parsed with ``pandas.to_datetime``.
        * ``day_of_week`` / ``month`` -- taken from ``saledate``.
        * ``weekend`` -- 1 when ``day_of_week`` is 4 or more, else 0.
        * ``amt`` -- a zero amount is replaced by ``retail`` and, if still
          zero, by ``orgprice``.
        * ``percent_discount`` -- ``(orgprice - amt) / orgprice`` floored at
          0, and forced to 0 when ``amt >= orgprice`` or ``orgprice <= 0``.
        * ``final_sale`` -- 1 when ``percent_discount`` exceeds 0.5, else 0.

    Returns:
        The same DataFrame object with the columns above added or updated.
    """
    sale_dates = pd.to_datetime(trans['saledate'])
    trans['saledate'] = sale_dates
    trans['day_of_week'] = sale_dates.dt.dayofweek
    trans['month'] = sale_dates.dt.month
    # NOTE(review): pandas numbers Monday as 0, so ">= 4" counts Friday as
    # part of the weekend -- confirm that this is intended.
    trans['weekend'] = trans['day_of_week'].apply(lambda dow: int(dow >= 4))

    # Zero amounts are back-filled from 'retail' first, then from 'orgprice'.
    for fallback in ('retail', 'orgprice'):
        trans['amt'] = np.where(trans['amt'] == 0, trans[fallback], trans['amt'])

    list_price = trans['orgprice']
    paid = trans['amt']
    trans['percent_discount'] = np.maximum(0, (list_price - paid) / list_price)

    # Rows sold at or above list price, rows with a non-positive list price
    # (where the ratio above is meaningless) and any negative ratio are all
    # treated as "no discount".
    no_discount = (
        (paid >= list_price)
        | (list_price <= 0)
        | (trans['percent_discount'] < 0)
    )
    trans.loc[no_discount, 'percent_discount'] = 0

    deep_discount = trans['percent_discount'] > 0.5
    trans['final_sale'] = np.where(deep_discount, 1, 0)

    return trans

def join_dataframes(input_df, csv_file_path, columns, join_key='sku', how='inner'):
    """Join selected SKU attribute columns from a CSV file onto ``input_df``.

    Args:
        input_df: DataFrame to enrich; must contain ``join_key``.
        csv_file_path: Location of the SKU attribute file. It is read with
            ``pandas.read_csv`` defaults and its ten columns are then renamed
            to ``sku, deptid, classid, upc, style, color, size, packsize,
            vendor, brand``.
            NOTE(review): with the default settings the file's first line is
            consumed as a header -- confirm the file really has one.
        columns: Names of the attribute columns to bring over. The list is
            not modified.
        join_key: Column to join on (default ``'sku'``). It is added to the
            selection automatically when ``columns`` does not list it.
        how: Join type passed to ``pandas.merge`` (default ``'inner'``).

    Returns:
        ``input_df`` merged with the selected attribute columns.
    """
    # Work on a copy: appending to the caller's list in place made it grow by
    # one entry per call and produced a duplicated key column on reuse.
    selected = list(columns)
    if join_key not in selected:
        selected.append(join_key)

    csv_df = pd.read_csv(csv_file_path)
    csv_df.columns = ['sku', 'deptid', 'classid', 'upc', 'style', 'color', 'size', 'packsize', 'vendor', 'brand']

    joined_df = pd.merge(input_df, csv_df[selected], on=join_key, how=how)

    return joined_df

def get_high_value_df(joined_df, n=50):
    """Keep the rows of the highest-revenue SKUs up to ``n`` percent of revenue.

    SKUs are ranked by their summed ``amt`` (highest first) and a running
    total is built over that ranking. A SKU is kept while the running total,
    including that SKU, is still strictly below ``n`` percent of the overall
    total. Consequently a single SKU that alone reaches the threshold is not
    kept, and ``n=100`` excludes the SKU that completes the total.

    Args:
        joined_df: DataFrame with at least ``sku`` and ``amt`` columns.
        n: Percentage of total revenue to cover (default 50). ``n=0`` yields
            an empty selection instead of raising ``ZeroDivisionError``.

    Returns:
        The rows of ``joined_df`` belonging to the selected SKUs.
    """
    sku_sum_revenue = joined_df.groupby('sku')['amt'].sum()
    sorted_skus = sku_sum_revenue.sort_values(ascending=False).reset_index()
    sorted_skus['cumulative_sum'] = sorted_skus['amt'].cumsum()

    # Multiply by the fraction instead of dividing by (100 / n): same value,
    # but defined for n == 0.
    max_revenue = sorted_skus['cumulative_sum'].max() * (n / 100)
    high_value_skus = sorted_skus[sorted_skus['cumulative_sum'] < max_revenue]
    high_value_df = joined_df[joined_df['sku'].isin(high_value_skus['sku'])]

    return high_value_df

def filter_min_average_discount(high_value_df, percentage_column='percent_discount', min_average_discount=0.03):
    """Keep only SKUs whose mean discount reaches a minimum level.

    Args:
        high_value_df: DataFrame with a ``sku`` column and the discount column.
        percentage_column: Name of the column averaged per SKU
            (default ``'percent_discount'``).
        min_average_discount: Smallest per-SKU mean that is still accepted
            (inclusive, default 0.03).

    Returns:
        The rows of ``high_value_df`` for the qualifying SKUs, with the SKU's
        mean attached as a new ``avg_discount`` column.
    """
    per_sku_mean = high_value_df.groupby('sku')[percentage_column].mean()
    per_sku = per_sku_mean.reset_index(name='avg_discount')

    qualifying = per_sku[per_sku['avg_discount'] >= min_average_discount]

    # The inner join both drops the rejected SKUs and adds 'avg_discount'.
    return high_value_df.merge(qualifying, how='inner', on='sku')

### Pipeline to execute all Functions

In [2]:
def process_data(trans_file_path, skst_file_path, clean_sku_file_path):
    """Run the whole cleaning and feature pipeline on the three input files.

    Steps, in order: load and filter the transactions, load the SKU/store
    prices, merge the two, engineer the calendar/discount features, join the
    ``brand`` and ``classid`` SKU attributes, keep the high-revenue SKUs
    (default threshold) and finally keep the SKUs with a sufficient average
    discount (default threshold).

    Args:
        trans_file_path: Transaction CSV.
        skst_file_path: SKU/store price CSV.
        clean_sku_file_path: SKU attribute CSV.

    Returns:
        The filtered transaction DataFrame produced by the last step.
    """
    transactions = load_transaction_data(trans_file_path)
    sku_store, sku_mean_retail = load_sku_data(skst_file_path)

    priced = merge_dataframes(transactions, sku_store, sku_mean_retail)
    enriched = feature_engineering(priced)

    with_sku_attributes = join_dataframes(enriched, clean_sku_file_path, ['brand', 'classid'])

    return filter_min_average_discount(get_high_value_df(with_sku_attributes))

In [3]:
def process_basket_data(filtered_df, num_baskets, filter_single_baskets = False, random_state=None):
    """Group transaction lines into baskets and draw a random selection of them.

    A basket is one combination of ``saledate``, ``storeid``, ``register``
    and ``trannum``.

    Args:
        filtered_df: Transaction lines with the four basket key columns and
            ``sku``. The frame itself is not modified.
        num_baskets: Number of baskets to keep after shuffling.
        filter_single_baskets: When true, keep only baskets containing more
            than one distinct SKU.
        random_state: Optional seed (or random state) forwarded to
            ``DataFrame.sample`` so the selection can be reproduced. The
            default ``None`` keeps the draw unseeded.

    Returns:
        A ``(baskets, total_num_baskets, filtered_num_baskets)`` tuple:

        * ``baskets`` -- the sampled baskets with the key columns plus
          ``TotalItems``, ``UniqueItems`` and ``Items`` (list of SKU strings).
        * ``total_num_baskets`` -- number of baskets before filtering.
        * ``filtered_num_baskets`` -- number of baskets after filtering and
          before sampling.
    """
    basket_keys = ['saledate', 'storeid', 'register', 'trannum']

    # Only the key columns and the SKU are needed, so build a narrow frame
    # rather than copying every column of the (large) input.
    lines = filtered_df[basket_keys].assign(sku=filtered_df['sku'].astype(str))

    baskets = lines.groupby(basket_keys)['sku'].agg(['count', 'nunique', list]).reset_index()
    baskets.columns = basket_keys + ['TotalItems', 'UniqueItems', 'Items']

    total_num_baskets = len(baskets)

    if filter_single_baskets:
        # More than one distinct SKU already implies more than one line, so a
        # separate length check on 'Items' would be redundant.
        baskets = baskets[baskets['UniqueItems'] > 1]

    filtered_num_baskets = len(baskets)
    baskets = baskets.sample(frac=1, random_state=random_state).head(num_baskets)

    return baskets, total_num_baskets, filtered_num_baskets

In [4]:
import pandas as pd

def preprocess_basket_data(baskets_df, *groupby_columns):
    """
    Preprocesses basket data by converting it to a binary representation.

    Args:
    - baskets_df: DataFrame with one row per basket, the ``groupby_columns``
      identifying the basket and an ``Items`` column holding a list of items.
    - groupby_columns: names of the columns that identify a basket; they
      become the index of the result.

    Returns:
    - basket_encoded: DataFrame, the preprocessed binary representation. One
      row per basket, one column per distinct item, 1.0 where the basket
      contains the item and 0.0 otherwise. Baskets with an empty item list
      do not appear.
    """
    basket_keys = list(groupby_columns)

    # Duplicates inside a basket are collapsed so each item counts once.
    unique_items = baskets_df.set_index(basket_keys)['Items'].apply(lambda x: list(set(x)))

    # explode() yields one row per (basket, item) directly, replacing the
    # much slower per-row Series construction followed by stack(). The series
    # is named 0 so the item column keeps the label it had before.
    item_rows = unique_items.explode().rename(0).reset_index()

    basket_encoded = (
        item_rows
        .groupby(basket_keys + [0])
        .size()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(basket_keys)
    )
    return basket_encoded

In [5]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

def run_apriori(data, min_support, min_confidence):
    """
    Mine frequent itemsets with Apriori and derive association rules from them.

    Args:
    - data: basket/item matrix; every value is cast to bool before mining.
    - min_support: minimum support passed to ``apriori``.
    - min_confidence: minimum confidence used as the rule threshold.

    Returns:
    - (frequent_itemsets, rules): the itemsets found by ``apriori`` (labelled
      with the column names of ``data``) and the rules generated from them.
    """
    # apriori is fed a boolean matrix rather than the 0/1 counts.
    as_boolean = data.astype(bool)

    itemsets = apriori(as_boolean, min_support=min_support, use_colnames=True)

    # Rules are selected on the confidence metric only.
    confident_rules = association_rules(
        itemsets,
        metric="confidence",
        min_threshold=min_confidence,
    )

    return itemsets, confident_rules

# Execute Pipeline

### Option 1: Execute all:

In [6]:

# 01. Build the paths of the three input files (all relative to `directory`).
directory = 'Dillards POS/'
skst_file_path = directory + 'skstinfo.csv'
trans_file_path = directory + 'trans_final.csv'
clean_sku_file_path = directory + 'sku_final.csv'
# 02. process_data: load, clean, engineer features and filter the transactions.
#     `trans_final` is reused by the later cells.
trans_final = process_data(trans_file_path, skst_file_path, clean_sku_file_path)
print(f'trans_final shape: {trans_final.shape}')
print(f'Amount in Transaction Selection: {trans_final["amt"].sum()}')

# 03. process_basket_data: group the lines into baskets, keep only baskets
#     with more than one distinct SKU and draw `num_baskets` of them at random.
filter_single_baskets = True
num_baskets = 100000
baskets_final,total_num_baskets, filtered_num_baskets = process_basket_data(trans_final, num_baskets, filter_single_baskets)

print(f'total num of baskets: {total_num_baskets}')
print(f'filtered num of baskets: {filtered_num_baskets}')
print(f'final selection of baskets: {baskets_final.shape}')

# 04. preprocess_basket_data: one row per basket, one 0/1 column per SKU.
groupby_columns = ['saledate', 'storeid', 'register', 'trannum']
basket_encoded = preprocess_basket_data(baskets_final, *groupby_columns)

# 05. run_apriori: frequent itemsets and the rules meeting the confidence threshold.
min_support_threshold = 0.001
min_confidence_threshold = 0.5
frequent_itemsets, rules = run_apriori(basket_encoded, min_support_threshold, min_confidence_threshold)

print("executed! Let's Go!")


trans_final shape: (23900020, 19)
Amount in Transaction Selection: 699341754.8000008
total num of baskets: 15128823
filtered num of baskets: 3520231
final selection of baskets: (100000, 7)
executed! Let's Go!


### Option 2: Execute only Model:

In [None]:
# Re-runs only the basket/model steps. Requires `trans_final` from a previous
# run of process_data (step 02 of the "Execute all" cell).

# 03. process_basket_data: same as in "Execute all", but with a larger sample
#     of baskets.
filter_single_baskets = True
num_baskets = 200000
baskets_final,total_num_baskets, filtered_num_baskets = process_basket_data(trans_final, num_baskets, filter_single_baskets)

print(f'total num of baskets: {total_num_baskets}')
print(f'filtered num of baskets: {filtered_num_baskets}')
print(f'final selection of baskets: {baskets_final.shape}')

# 04. preprocess_basket_data: one row per basket, one 0/1 column per SKU.
groupby_columns = ['saledate', 'storeid', 'register', 'trannum']
basket_encoded = preprocess_basket_data(baskets_final, *groupby_columns)

# 05. run_apriori: frequent itemsets and the rules meeting the confidence threshold.
min_support_threshold = 0.001
min_confidence_threshold = 0.5
frequent_itemsets, rules = run_apriori(basket_encoded, min_support_threshold, min_confidence_threshold)

print("executed! Let's Go!")

### Explore Rules

In [7]:
# Show the five rules with the highest confidence.
rules.sort_values('confidence', ascending = False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
25,"(6062521, 6032521)",(6072521),0.00123,0.0021,0.00107,0.869919,414.247,0.001067,7.671356,0.998815
7,(6062521),(6072521),0.00182,0.0021,0.00147,0.807692,384.615385,0.001466,5.18908,0.999219
5,(6032521),(6072521),0.00181,0.0021,0.00142,0.78453,373.585898,0.001416,4.631279,0.999132
10,(6480353),(6470353),0.00149,0.002,0.00116,0.778523,389.261745,0.001157,4.506121,0.998919
12,(6490353),(6470353),0.00182,0.002,0.0014,0.769231,384.615385,0.001396,4.324667,0.999219


In [8]:
# Show the five rules with the highest support.
rules.sort_values('support', ascending = False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
20,(7596135),(6656135),0.00447,0.00746,0.0025,0.559284,74.971061,0.002467,2.252109,0.991092
6,(6072521),(6062521),0.0021,0.00182,0.00147,0.7,384.615385,0.001466,3.327267,0.999499
7,(6062521),(6072521),0.00182,0.0021,0.00147,0.807692,384.615385,0.001466,5.18908,0.999219
4,(6072521),(6032521),0.0021,0.00181,0.00142,0.67619,373.585898,0.001416,3.082646,0.999422
5,(6032521),(6072521),0.00181,0.0021,0.00142,0.78453,373.585898,0.001416,4.631279,0.999132


In [9]:
# (number of rules, number of columns describing each rule)
rules.shape

(29, 10)

### Check if Rules seem plausible

In [10]:
# Show the last five frequent itemsets found by apriori.
frequent_itemsets.tail()

Unnamed: 0,support,itemsets
209,0.00111,"(6696135, 6656135)"
210,0.00117,"(6706135, 6656135)"
211,0.0025,"(7596135, 6656135)"
212,0.00111,"(6752521, 6742521)"
213,0.00107,"(6072521, 6062521, 6032521)"


In [11]:
# Spot check: among the sampled baskets containing SKU 6440353, show whether
# SKU 6460353 is present as well (first five baskets).
basket_encoded[basket_encoded['6440353'] == 1]['6460353'].head()

saledate    storeid  register  trannum
2004-08-01  4907     70        700        1.0
2004-08-02  402      30        1400       0.0
2004-08-09  9603     590       2000       1.0
2004-08-13  3007     750       300        1.0
2004-08-18  5903     50        2500       1.0
Name: 6460353, dtype: float64

In [12]:

# Cross-check a SKU pair against all baskets in trans_final (not only the
# sampled ones): how many baskets contain each SKU, and how many contain both.
sku_1, sku_2 = 6276633, 6756633

# Per transaction line: is it one of the two SKUs? (These flag columns are
# added to trans_final itself.)
trans_final['has_sku_1'] = trans_final['sku'].eq(sku_1)
trans_final['has_sku_2'] = trans_final['sku'].eq(sku_2)

# Per basket: does any of its lines carry the SKU?
counts = (
    trans_final
    .groupby(['saledate', 'storeid', 'register', 'trannum'])[['has_sku_1', 'has_sku_2']]
    .any()
    .reset_index()
)

# Number of baskets with each SKU, and with both at once.
sku_1_count = counts['has_sku_1'].sum()
sku_2_count = counts['has_sku_2'].sum()
both_skus_count = counts[['has_sku_1', 'has_sku_2']].all(axis=1).sum()

# Report the three basket counts.
print(f"Count for SKU {sku_1}: {sku_1_count}")
print(f"Count for SKU {sku_2}: {sku_2_count}")
print(f"Count for both SKUs: {both_skus_count}")


Count for SKU 6276633: 1743
Count for SKU 6756633: 1686
Count for both SKUs: 981
