# import & load

In [26]:
import pandas as pd
import os
from IPython.display import clear_output
import math

In [27]:
# install mlxtend library
!pip install mlxtend
clear_output(wait = False)

In [28]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

In [29]:
# Load csvs
current_working_directory = os.getcwd()

df_distance = pd.read_csv(current_working_directory + '\\jimi_distance.csv')
df_visits = pd.read_csv(current_working_directory + '\\jimi_visits_full.csv',parse_dates=['DateTime'])
df_mia = pd.read_csv(current_working_directory + '\\from_edgar_top53.csv')

# Load up similar_location_dict
similar_location_dict ={'Anthropologie':'Apple',
                        'Bench':'Little Burgundy',
                        'Bluenotes':'Browns Shoes',
                        'Bvlgari':'Mulberry',
                        'Calendar Club':'Melanie Lyne',
                        'Childrens Place, The':'Pink',
                        'Davids Tea':'Microsoft',
                        'Diamond Company':'Cos',
                        'Kim Capone':'Melanie Lyne',
                        'LensCrafters':'House of Hoops by Foot Locker',
                        'Macs Sushi':'Purdys Chocolates',
                        'Mappins':'Papyrus',
                        'Marciano':'Little Burgundy',
                        'Mendocino':'Williams Sonoma',
                        'Nine West':'Aldo Accessories',
                        'Restoration Hardware':'Sporting Life',
                        'Rockport':'Indochino',
                        'Starbucks':'Papyrus'
                       }

In [30]:
def distance(location_1,location_2):
    '''
    input: 2 locations in string
    output: distances of 2 locations (order shouldn't matter, if so, error message will be printed)
    additional notes: need df_distancere to be jimi_distance.csv, also, this take in consideration of similar_location_dict
    '''
    if location_1 in similar_location_dict.keys():
        location_1 = similar_location_dict[location_1]
    if location_2 in similar_location_dict.keys():
        location_2 = similar_location_dict[location_2]
    pair = str(sorted([location_1,location_2]))
    distance = df_distance[df_distance.store_pairs == pair].Distance.unique()
    if len(distance) == 1:
        return distance[0]
    else:
        print('found more than 1 distances for this pair:' + pair)

# this function returns all sorted 2-location pairs from a list of multiple locations
def get_2_location_pairs(list_of_locations):
    '''
    input: a list of locations(in string) (len(list) must be >= 2)
    output: a list of sorted 2-location lists
    '''
    solution = []
    for item in range(0,len(list_of_locations)):
        pair_1 = list_of_locations[item]
        pair_2 = list_of_locations.copy()
        pair_2.pop(item)
        for pair in pair_2:
            output_pair = sorted([pair_1,pair])
            if output_pair not in solution:
                solution.append(output_pair)
    return solution

# this function returns a distance metric for any-sized location list
def final_distance(list_of_locations):
    '''
    input: a list of locations(in string) (len(list) must be >= 1)
    output: one distance(float) metric (max distance of every 2-pair)
    additional notes: if length of input is 1 then distance returned will always be 0.0
    '''
    final_distance = 0.0
    if len(list_of_locations) != 1:
        for n in get_2_location_pairs(list_of_locations):
            final_distance = max(final_distance , distance(n[0],n[1]))
        return final_distance
    else:
        return 0.0
    
# this function calculates distances for column of df and populate df with a new column 'distances'
def list_of_distances(df,column):
    '''
    input: dataframe, column_name(string)
    output: populate on the same dataframe a new column named 'distances' with distances populated
    '''
    output = []
    distance_input = list(df[column].apply(lambda x : list(x)))
    for n in distance_input:
        output.append(final_distance(n))
    df['distances'] = output

# Association Rules

In [31]:
df_visits.head()

Unnamed: 0,DateTime,Customer_ID,Sequence_Num,Store_ID,Dwell_Time,Name
0,2018-01-01 06:00:30,1,0,5079483,56,Le Chateau
1,2018-01-01 13:22:30,293,6,5079483,6,Le Chateau
2,2018-01-01 13:51:50,344,0,5079483,12,Le Chateau
3,2018-01-01 14:21:30,400,4,5079483,18,Le Chateau
4,2018-01-01 18:45:50,843,4,5079483,21,Le Chateau


In [35]:
# make a list of top 40 stores
top_53 = list(df_mia.top_53)
top_53 = [int(i) for i in top_53]

# get all indexes to keep
top_53_index = []
for store in top_53:
    top_53_index += list(df_visits[df_visits.Store_ID == store].index)

# only keep top 40 stores from df_visits
df_visits_dropped = df_visits.iloc[top_53_index,:]


assert len(df_visits_dropped.Store_ID.unique()) == 53

df_visits_dropped.shape

(232932, 6)

In [36]:
# preping df_visits to the right format for association rules
df_visits_enc = pd.get_dummies(df_visits_dropped[['Customer_ID', 'DateTime', 'Name']], 
                               columns = ['Name'], prefix='', prefix_sep='')

df_visits_enc = df_visits_enc.groupby(['Customer_ID', 'DateTime']).max().reset_index()

df_visits_enc.drop(['Customer_ID', 'DateTime'], axis=1, inplace=True)

df_visits_enc

Unnamed: 0,Apple,Arc teryx,Aritzia,Babaton,Birks,Bitter Sweet,Brandy Melville,Browns Shoes,Burberry,Bvlgari,...,Ted Baker,Tesla Motors,Thomas Sabo,Tiffany & Co,Tory Burch,Van Cleef & Arpels,Versace,Virgin Mobile,Warby Parker,Zara
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
df_visits_dropped.shape[0] * 0.00005

11.646600000000001

In [43]:
# get a table of itemsets
min_support = 0.00005
frequent_itemsets = fpgrowth(df_visits_enc, min_support=min_support, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

In [44]:
frequent_itemsets_dropped = frequent_itemsets

In [45]:
# examine output, any length
frequent_itemsets_dropped = frequent_itemsets_dropped.sort_values(['support'], ascending=[False])
frequent_itemsets_dropped.head()

Unnamed: 0,support,itemsets,length
0,0.137424,(Apple),1
36,0.094415,(Zara),1
22,0.063939,(Sephora),1
29,0.062245,(Crate & Barrel),1
9,0.055155,(Canada Goose),1


In [46]:
# run association rules
min_threshold = 1
rules = association_rules(frequent_itemsets, min_threshold = min_threshold, metric = 'lift')
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))

In [47]:
# populate distances column for rules
ant_list = list(rules.antecedents.apply(lambda x : list(x)))
con_str = list(rules.consequents.apply(lambda x : list(x)[0]))
for n in range(0,len(ant_list)):
    ant_list[n].append(con_str[n])
rules['ant_con_list'] = ant_list
list_of_distances(rules,'ant_con_list')

In [48]:
# drop stores from df_visits
rules_dropped = rules

In [51]:
# examine output, any length
rules_dropped = rules_dropped.sort_values(by=["lift"], ascending=[False])
rules_dropped = rules_dropped[(rules_dropped.distances > 140) & (rules_dropped.support > min_support)]

rules_dropped

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,ant_con_list,distances
56,"(Canada Goose, Apple)",(Arc teryx),0.001303,0.010358,6e-05,0.046154,4.456068,4.7e-05,1.037528,2,"[Canada Goose, Apple, Arc teryx]",394.96
57,"(Arc teryx, Apple)",(Canada Goose),0.000326,0.055155,6e-05,0.184615,3.347213,4.2e-05,1.158772,2,"[Arc teryx, Apple, Canada Goose]",394.96
113,(Tesla Motors),"(Pottery Barn and Pottery Barn, Apple)",0.026693,0.001353,5e-05,0.001877,1.387516,1.4e-05,1.000525,1,"[Tesla Motors, Pottery Barn and Pottery Barn]",278.94
110,"(Pottery Barn and Pottery Barn, Apple)",(Tesla Motors),0.001353,0.026693,5e-05,0.037037,1.387516,1.4e-05,1.010742,2,"[Pottery Barn and Pottery Barn, Apple, Tesla M...",278.94
120,(Lucky Brand Jeans),(Browns Shoes),0.009551,0.028302,0.000356,0.037251,1.31621,8.5e-05,1.009295,1,"[Lucky Brand Jeans, Browns Shoes]",328.59
121,(Browns Shoes),(Lucky Brand Jeans),0.028302,0.009551,0.000356,0.012571,1.31621,8.5e-05,1.003058,1,"[Browns Shoes, Lucky Brand Jeans]",328.59
156,(Lucky Brand Jeans),(ECCO),0.009551,0.005943,7e-05,0.007345,1.235961,1.3e-05,1.001413,1,"[Lucky Brand Jeans, ECCO]",287.51
157,(ECCO),(Lucky Brand Jeans),0.005943,0.009551,7e-05,0.011804,1.235961,1.3e-05,1.002281,1,"[ECCO, Lucky Brand Jeans]",287.51
112,(Apple),"(Pottery Barn and Pottery Barn, Tesla Motors)",0.137424,0.000316,5e-05,0.000365,1.15504,7e-06,1.000049,1,"[Apple, Pottery Barn and Pottery Barn]",269.87
111,"(Pottery Barn and Pottery Barn, Tesla Motors)",(Apple),0.000316,0.137424,5e-05,0.15873,1.15504,7e-06,1.025326,2,"[Pottery Barn and Pottery Barn, Tesla Motors, ...",278.94
