### Functions for Cleaning, Feature Selection, and Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

# NOTE(review): this suppresses every warning category for the rest of the
# kernel session, so deprecation or dtype warnings emitted by later cells
# will not be shown.
warnings.filterwarnings("ignore")

def load_transaction_data(trans_file_path):
    """Read the headerless transaction CSV and keep only usable purchase rows.

    A row is kept when its ``stype`` equals ``'P'`` and both ``amt`` and
    ``orgprice`` are strictly greater than 1. The ``interID``, ``stype``,
    ``mic`` and ``unknown`` columns are removed from the result.

    Parameters
    ----------
    trans_file_path
        Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns
    -------
    pandas.DataFrame
        Filtered rows with columns ``sku``, ``storeid``, ``register``,
        ``trannum``, ``saledate``, ``quantity``, ``orgprice``, ``amt``, ``seq``.
        The original row index is preserved (it is not reset).
    """
    column_names = [
        'sku', 'storeid', 'register', 'trannum', 'interID', 'saledate', 'stype',
        'quantity', 'orgprice', 'amt', 'seq', 'mic', 'unknown',
    ]
    raw = pd.read_csv(trans_file_path, names=column_names)

    # Build the row filter piece by piece so each criterion is visible.
    is_purchase = raw['stype'].eq('P')
    above_one = raw['amt'].gt(1) & raw['orgprice'].gt(1)
    kept = raw.loc[is_purchase & above_one]

    # These columns are not used by any later step.
    return kept.drop(columns=['interID', 'stype', 'mic', 'unknown'])

def load_sku_data(skst_file_path):
    """Read the headerless SKU/store price CSV and derive a per-SKU mean retail.

    Parameters
    ----------
    skst_file_path
        Anything ``pandas.read_csv`` accepts (path or file-like object). The
        file is read without a header row as ``sku, storeid, cost, retail,
        unknown``.

    Returns
    -------
    tuple
        ``(skst, mean_retail)`` where ``skst`` is the table without the
        ``unknown`` column and ``mean_retail`` is a Series indexed by ``sku``
        holding the mean of ``retail`` over all rows of that SKU. A mean of
        exactly 0 is reported as NaN.
    """
    skst = pd.read_csv(skst_file_path, names=['sku', 'storeid', 'cost', 'retail', 'unknown'])

    # Drop irrelevant columns
    skst = skst.drop(columns=['unknown'])

    # A mean of 0 carries no price information, so mark it as missing. The
    # previous `.replace(0, np.nan).fillna(0)` immediately turned the NaN back
    # into 0, which meant merge_dataframes filled `retail` with 0 and never
    # reached its `orgprice` fallback.
    mean_retail = skst.groupby('sku')['retail'].mean().replace(0, np.nan)

    return skst, mean_retail

def merge_dataframes(trans, skst, mean_retail):
    """Attach per-store SKU prices to the transactions and fill price gaps.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions; must contain ``sku``, ``storeid`` and ``orgprice``.
    skst : pandas.DataFrame
        SKU/store table joined on ``sku`` and ``storeid`` (left join, so every
        transaction row is kept).
    mean_retail : pandas.Series
        Per-SKU fallback price, looked up by ``sku``.

    Returns
    -------
    pandas.DataFrame
        A new frame (``trans`` itself is not modified). Missing ``retail`` is
        filled first from ``mean_retail`` and then from the row's ``orgprice``;
        missing ``orgprice`` is filled from ``mean_retail``.
    """
    merged = trans.merge(skst, how='left', on=['sku', 'storeid'])

    # Per-row fallback price, looked up once and used for both columns.
    sku_fallback = merged['sku'].map(mean_retail)

    # `retail` is filled before `orgprice`, so its second-level fallback uses
    # the original (unfilled) `orgprice` values.
    retail_filled = merged['retail'].fillna(sku_fallback)
    merged['retail'] = retail_filled.fillna(merged['orgprice'])
    merged['orgprice'] = merged['orgprice'].fillna(sku_fallback)

    return merged

def feature_engineering(trans, weekend_start_day=4, final_sale_threshold=0.5):
    """Add calendar and discount features to the transaction table.

    The frame passed in is modified in place and also returned.

    Parameters
    ----------
    trans : pandas.DataFrame
        Must contain ``saledate``, ``amt``, ``orgprice`` and ``retail``.
    weekend_start_day : int, default 4
        Rows whose ``day_of_week`` is greater than or equal to this value get
        ``weekend = 1``. pandas numbers days Monday=0 .. Sunday=6, so the
        default flags Friday through Sunday.
        NOTE(review): confirm that Friday is meant to count as weekend;
        pass 5 for Saturday/Sunday only.
    final_sale_threshold : float, default 0.5
        Rows with ``percent_discount`` strictly above this get ``final_sale = 1``.

    Returns
    -------
    pandas.DataFrame
        ``trans`` with ``saledate`` parsed to datetime and the added columns
        ``day_of_week``, ``month``, ``weekend``, ``percent_discount`` and
        ``final_sale``.
    """
    trans['saledate'] = pd.to_datetime(trans['saledate'])
    trans['day_of_week'] = trans['saledate'].dt.dayofweek
    trans['month'] = trans['saledate'].dt.month
    # Vectorised comparison instead of a Python lambda called once per row.
    trans['weekend'] = (trans['day_of_week'] >= weekend_start_day).astype('int64')

    # A zero sale amount is replaced by the retail price, and if that is also
    # zero, by the original price.
    trans['amt'] = np.where(trans['amt'] == 0, trans['retail'], trans['amt'])
    trans['amt'] = np.where(trans['amt'] == 0, trans['orgprice'], trans['amt'])

    # Fraction taken off the original price, never negative. Rows with a
    # non-positive original price (where the ratio is inf/NaN or has the wrong
    # sign) are forced to 0; rows where the price itself is NaN stay NaN.
    raw_discount = (trans['orgprice'] - trans['amt']) / trans['orgprice']
    trans['percent_discount'] = raw_discount.clip(lower=0).mask(trans['orgprice'] <= 0, 0)

    trans['final_sale'] = np.where(trans['percent_discount'] > final_sale_threshold, 1, 0)

    return trans

def join_dataframes(input_df, csv_file_path, columns, join_key='sku', how='inner'):
    """Join selected SKU attribute columns from a CSV onto ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Left-hand table; must contain ``join_key``.
    csv_file_path
        Anything ``pandas.read_csv`` accepts. The file is read with the
        default header handling and its column names are then overwritten with
        ``sku, deptid, classid, upc, style, color, size, packsize, vendor,
        brand``.
        NOTE(review): if the file has no header row, its first data row is
        consumed as the header and lost - confirm against the file.
    columns : list of str
        Attribute columns to bring over. The list is not modified.
    join_key : str, default 'sku'
        Column to join on; added to the selection if not already listed.
    how : str, default 'inner'
        Join type passed to ``pandas.merge``.

    Returns
    -------
    pandas.DataFrame
        ``input_df`` merged with the selected attribute columns.
    """
    # Build a new list instead of `columns += [join_key]`, which appended to
    # the caller's list on every call. dict.fromkeys removes duplicates while
    # keeping order, so a join key that is already listed is not selected twice.
    selected = list(dict.fromkeys([*columns, join_key]))

    csv_df = pd.read_csv(csv_file_path)
    csv_df.columns = ['sku', 'deptid', 'classid', 'upc', 'style', 'color', 'size', 'packsize', 'vendor', 'brand']

    joined_df = pd.merge(input_df, csv_df[selected], on=join_key, how=how)

    return joined_df

def get_high_value_df(joined_df, n=50):
    """Keep only the rows of the highest-revenue SKUs.

    SKUs are ranked by their summed ``amt`` (largest first) and a running
    total is taken down that ranking. A SKU is kept while the running total,
    including that SKU, is strictly below ``n`` percent of the largest running
    total. The SKU that crosses the cutoff is therefore excluded.

    Parameters
    ----------
    joined_df : pandas.DataFrame
        Must contain ``sku`` and ``amt``.
    n : float, default 50
        Percentage of cumulative revenue used as the cutoff. ``n=0`` yields an
        empty result.

    Returns
    -------
    pandas.DataFrame
        The rows of ``joined_df`` whose ``sku`` was kept.
    """
    revenue_by_sku = joined_df.groupby('sku')['amt'].sum()
    ranked = revenue_by_sku.sort_values(ascending=False).reset_index()
    ranked['cumulative_sum'] = ranked['amt'].cumsum()

    # Multiply by n / 100 rather than dividing by (100 / n): the latter raised
    # ZeroDivisionError for n=0.
    revenue_cutoff = ranked['cumulative_sum'].max() * (n / 100)
    high_value_skus = ranked[ranked['cumulative_sum'] < revenue_cutoff]
    high_value_df = joined_df[joined_df['sku'].isin(high_value_skus['sku'])]

    return high_value_df

def filter_min_average_discount(high_value_df, percentage_column='percent_discount', min_average_discount=0.03):
    """Keep rows of SKUs whose mean discount reaches a minimum.

    Parameters
    ----------
    high_value_df : pandas.DataFrame
        Must contain ``sku`` and the column named by ``percentage_column``.
    percentage_column : str, default 'percent_discount'
        Column averaged per SKU.
    min_average_discount : float, default 0.03
        SKUs with a mean greater than or equal to this value are kept.

    Returns
    -------
    pandas.DataFrame
        Rows of the qualifying SKUs with an extra ``avg_discount`` column
        holding the per-SKU mean. Produced by an inner merge, so the row index
        is renumbered from 0.
    """
    sku_means = high_value_df.groupby('sku')[percentage_column].mean()

    # Select the qualifying SKUs first, then turn them into a two-column
    # lookup table (sku, avg_discount) for the merge.
    qualifying = sku_means[sku_means >= min_average_discount].reset_index(name='avg_discount')

    return high_value_df.merge(qualifying, how='inner', on='sku')

### Pipeline to Execute All Functions

In [7]:
def process_data(trans_file_path, skst_file_path, clean_sku_file_path):
    """Run the full cleaning and filtering pipeline on the three input files.

    Steps, in order: load transactions, load SKU/store prices, merge them,
    add engineered features, join the ``brand`` and ``classid`` SKU
    attributes, keep the high-revenue SKUs, and finally keep SKUs that meet
    the minimum average discount. All thresholds are the defaults of the
    functions called.

    Parameters
    ----------
    trans_file_path, skst_file_path, clean_sku_file_path
        Locations of the transaction, SKU/store and SKU attribute CSV files.

    Returns
    -------
    pandas.DataFrame
        The filtered transaction table.
    """
    transactions = load_transaction_data(trans_file_path)
    sku_store_prices, sku_mean_retail = load_sku_data(skst_file_path)

    enriched = feature_engineering(
        merge_dataframes(transactions, sku_store_prices, sku_mean_retail)
    )
    with_sku_attrs = join_dataframes(enriched, clean_sku_file_path, ['brand', 'classid'])

    return filter_min_average_discount(get_high_value_df(with_sku_attrs))

In [2]:
def process_basket_data(trans_file_path, skst_file_path, clean_sku_file_path, num_baskets):
    """Build per-transaction baskets of SKUs from the filtered transactions.

    A basket is the set of rows sharing ``saledate``, ``storeid``,
    ``register`` and ``trannum``. Only baskets containing more than one
    distinct SKU are kept.

    Parameters
    ----------
    trans_file_path, skst_file_path, clean_sku_file_path
        Passed unchanged to ``process_data``.
    num_baskets : int
        Maximum number of baskets returned (the first ``num_baskets`` in
        group order).

    Returns
    -------
    pandas.DataFrame
        Columns ``saledate``, ``storeid``, ``register``, ``trannum``,
        ``TotalItems`` (number of rows), ``UniqueItems`` (distinct SKUs) and
        ``Items`` (list of SKU strings).
    """
    # Reuse the shared pipeline instead of repeating its eight steps here, so
    # the two entry points cannot drift apart.
    baskets = process_data(trans_file_path, skst_file_path, clean_sku_file_path)
    baskets['sku'] = baskets['sku'].astype(str)

    basket_keys = ['saledate', 'storeid', 'register', 'trannum']
    baskets = baskets.groupby(basket_keys)['sku'].agg(['count', 'nunique', list]).reset_index()
    baskets.columns = basket_keys + ['TotalItems', 'UniqueItems', 'Items']

    # More than one distinct SKU already implies more than one item, so the
    # former per-row `Items.apply(len) > 1` check was redundant.
    baskets = baskets[baskets['UniqueItems'] > 1].head(num_baskets)

    return baskets

In [3]:
# Run all functions with one click
# Paths are relative to the notebook's working directory. The path variables
# defined here are reused by the later cell that calls process_data.
directory = 'Dillards POS/'
skst_file_path = directory + 'skstinfo.csv'
trans_file_path = directory + 'trans_final.csv'
clean_sku_file_path = directory + 'sku_final.csv'

# The last argument is num_baskets: keep at most 100000 baskets.
baskets_final = process_basket_data(trans_file_path, skst_file_path, clean_sku_file_path, 100000)
baskets_final.shape

(100000, 7)

In [None]:
# Row-level filtered transactions (no basket aggregation).
# NOTE(review): this reads and processes the same three CSV files again that
# process_basket_data already processed above.
trans_final = process_data(trans_file_path, skst_file_path, clean_sku_file_path)
trans_final.shape

In [None]:
# Sum of `amt` over all retained transaction rows.
print(trans_final['amt'].sum())

## Apriori Algorithm

In [4]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import pandas as pd

# Convert the data to a one-hot encoded format: one row per basket, one column
# per SKU, each cell holding how often that SKU occurs in the basket (0 when
# absent). Series.explode replaces `.apply(pd.Series).stack()`, which built a
# separate Series object for every basket; the counts are now integers instead
# of floats, and only their truthiness is used by the next cell.
basket_keys = ['saledate', 'storeid', 'register', 'trannum']
basket_encoded = (
    baskets_final.set_index(basket_keys)['Items']
    .explode()
    .reset_index()
    .groupby(basket_keys + ['Items'])
    .size()
    .unstack(fill_value=0)
)

In [5]:
# Convert the binary representation to boolean for Apriori algorithm
# (any non-zero cell becomes True).
basket_sets = basket_encoded.astype(bool)

# Use Apriori algorithm to find frequent itemsets
# min_support=0.001 keeps itemsets present in at least 0.1% of the baskets;
# use_colnames=True reports itemsets by column label (the SKU) rather than
# by column position.
frequent_itemsets = apriori(basket_sets, min_support=0.001, use_colnames=True)

# Generate association rules
# Only rules with confidence >= 0.3 are returned.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)

# Display the rules
print(rules)

          antecedents       consequents  antecedent support  \
0           (1563503)         (1453503)             0.00174   
1           (1453503)         (1563503)             0.00133   
2           (1563503)         (1543503)             0.00174   
3           (1543503)         (1563503)             0.00142   
4           (1761637)         (1801637)             0.00178   
..                ...               ...                 ...   
489  (878635, 858635)          (888635)             0.00142   
490  (888635, 858635)          (878635)             0.00133   
491          (878635)  (888635, 858635)             0.00226   
492          (888635)  (878635, 858635)             0.00230   
493          (858635)  (878635, 888635)             0.00202   

     consequent support  support  confidence        lift  leverage  \
0               0.00133  0.00100    0.574713  432.114770  0.000998   
1               0.00174  0.00100    0.751880  432.114770  0.000998   
2               0.00142  0.00113 

### Explore Rules

In [6]:
# Show the five rules with the highest confidence.
rules.sort_values('confidence', ascending = False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
196,(6276633),(6756633),0.00196,0.00212,0.00189,0.964286,454.851752,0.001886,27.94064,0.999761
298,(8018679),(8188679),0.00188,0.00195,0.00178,0.946809,485.542826,0.001776,18.76334,0.99982
411,"(6340353, 6320353)",(6300353),0.0011,0.00219,0.00101,0.918182,419.261104,0.001008,12.195456,0.998713
299,(8188679),(8018679),0.00195,0.00188,0.00178,0.912821,485.542826,0.001776,11.449024,0.99989
428,"(6520353, 6510353)",(6500353),0.00116,0.00229,0.00105,0.905172,395.271796,0.001047,10.521305,0.998629


### Check Whether the Rules Seem Plausible