**Required Libraties**

In [39]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import regex as re
from sentence_transformers import SentenceTransformer, util

In [None]:
# !pip3 install sentence_transformers

**Data Source: Furniture Sales**

In [2]:
# Data Source: 
# References: https://www-users.cse.umn.edu/~kumar001/dmbook/ch6.pdf, https://medium.com/edureka/apriori-algorithm-d7cc648d4f1e, https://medium.com/edureka/apriori-algorithm-d7cc648d4f1e, https://medium.com/@mervetorkan/association-rules-with-python-9158974e761a
df = pd.read_csv("./Superstore KPIs.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Order ID      9994 non-null   object 
 1   Order Date    9994 non-null   object 
 2   Region        9994 non-null   object 
 3   Segment       9994 non-null   object 
 4   Category      9994 non-null   object 
 5   Sub-Category  9994 non-null   object 
 6   Product Name  9994 non-null   object 
 7   Ship Date     9994 non-null   object 
 8   Ship Mode     9994 non-null   object 
 9   Profit        9994 non-null   float64
 10  Quantity      9994 non-null   int64  
 11  Sales         9994 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 937.1+ KB


**Association Analysis: Preprocessing**


In [3]:
#Preprocessing 1: Clean up Product Name. Remove Numbers and odd characters
df['Product Name'] = df['Product Name'].str.replace('\d+', '')
df['Product Name'].str.strip()
regex = re.compile('[^a-zA-Z]')
[regex.sub('', i) for i in df['Product Name']]

#Preprocessing 2: Group by Order ID
def groupdataset(dataset,groupbysecondval):
  associated_df = (dataset.groupby(['Order ID', groupbysecondval])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('Order ID'))
  return associated_df 

#Preprocessing 3: Convert all values that is not 0 to 1
def encode_units(x):
    return 1 if x >= 1 else 0

  


**Association Analysis: Apiori Algorithm**

In [4]:
#Apiori Algorithm
def apiori(dataset, minsupport):
  basket_sets = dataset.applymap(encode_units)
  frequent_itemsets = apriori(basket_sets, min_support=minsupport, use_colnames=True)
  rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
  return rules

#Filter out bad results 
# def filterlift(x):
#   x[(x['lift'] >= 6) & (x['confidence'] >= 0.8)]
#   return x

**Association Analysis: Result 1 - Category**

In [5]:
#Checking Cateogry - didn't learn much from it
associated_df = groupdataset(df,'Category')
rules = apiori(associated_df, 0.01)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Office Supplies, Technology)",(Furniture),0.182472,0.352166,0.064684,0.354486,1.006587,0.000423,1.003594
1,"(Office Supplies, Furniture)",(Technology),0.193452,0.308245,0.064684,0.334365,1.084738,0.005053,1.039241
2,(Technology),"(Office Supplies, Furniture)",0.308245,0.193452,0.064684,0.209845,1.084738,0.005053,1.020746
3,(Furniture),"(Office Supplies, Technology)",0.352166,0.182472,0.064684,0.183673,1.006587,0.000423,1.001472


**Association Analysis: Result 2 - SubCategory**

In [6]:
#Checking Sub Category - Need a further breakdown of the following categories: Appliances, Art, Furnishings, Storage
associated_df = groupdataset(df,'Sub-Category')
rules = apiori(associated_df, 0.02)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Binders),(Appliances),0.262727,0.090038,0.025953,0.098784,1.09714,0.002298,1.009705
1,(Appliances),(Binders),0.090038,0.262727,0.025953,0.288248,1.09714,0.002298,1.035857
2,(Paper),(Appliances),0.237772,0.090038,0.021761,0.09152,1.016458,0.000352,1.001631
3,(Appliances),(Paper),0.090038,0.237772,0.021761,0.241685,1.016458,0.000352,1.00516
4,(Phones),(Art),0.162507,0.145937,0.024755,0.152334,1.043833,0.00104,1.007546
5,(Art),(Phones),0.145937,0.162507,0.024755,0.169631,1.043833,0.00104,1.008578
6,(Furnishings),(Phones),0.175085,0.162507,0.029547,0.168757,1.038458,0.001094,1.007518
7,(Phones),(Furnishings),0.162507,0.175085,0.029547,0.181818,1.038458,0.001094,1.00823
8,(Furnishings),(Storage),0.175085,0.155121,0.02755,0.157355,1.014401,0.000391,1.002651
9,(Storage),(Furnishings),0.155121,0.175085,0.02755,0.177606,1.014401,0.000391,1.003066


**Association Analysis: Result 3 - Product Name**

In [7]:
#Checking Product Name - Confidence is too low. Need to generalize some of the descriptions of the Product Name
df2 = df
df2["ProductName_SubCategory"] = df[['Sub-Category','Product Name']].agg(' '.join, axis=1)
associated_df = groupdataset(df2,'ProductName_SubCategory')
rules = apiori(associated_df, 0.001) #Played with the min_support val and highest threshhold is 0.1%
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Paper Xerox ),(Accessories Kingston Digital DataTraveler GB ...,0.153124,0.004991,0.001198,0.007823,1.567353,0.000434,1.002854
1,(Accessories Kingston Digital DataTraveler GB ...,(Paper Xerox ),0.004991,0.153124,0.001198,0.24,1.567353,0.000434,1.11431
2,(Art Newell ),(Paper Xerox ),0.050908,0.153124,0.007986,0.156863,1.024414,0.00019,1.004434
3,(Paper Xerox ),(Art Newell ),0.153124,0.050908,0.007986,0.052151,1.024414,0.00019,1.001311
4,(Envelopes Staple envelope),(Paper Xerox ),0.009583,0.153124,0.001597,0.166667,1.08844,0.00013,1.016251
5,(Paper Xerox ),(Envelopes Staple envelope),0.153124,0.009583,0.001597,0.01043,1.08844,0.00013,1.000856
6,(Fasteners Staples),(Paper Xerox ),0.009183,0.153124,0.001797,0.195652,1.277734,0.000391,1.052872
7,(Paper Xerox ),(Fasteners Staples),0.153124,0.009183,0.001797,0.011734,1.277734,0.000391,1.002581
8,(Paper Xerox ),"(Storage Hot File -Pocket, Floor Stand)",0.153124,0.002595,0.001397,0.009126,3.516498,0.001,1.006591
9,"(Storage Hot File -Pocket, Floor Stand)",(Paper Xerox ),0.002595,0.153124,0.001397,0.538462,3.516498,0.001,1.834897


**Generalizing Description of product names for the following sub categories**


1.   Appliances
2.   Furnishing
3.   Storage



In [68]:
#Need to deploy NLP Model to generalize product name
#Reference: https://towardsdatascience.com/text-summarization-with-nlp-textrank-vs-seq2seq-vs-bart-474943efeb09
appliances_df = df[(df['Sub-Category'] == "Appliances")]
furnishing_df = df[(df['Sub-Category'] == "Furniture")]
storage_df = df[(df['Sub-Category'] == "Storage")]

# for i in range (len(appliances_df)):
#   appliances_df['description'] = (df['Product Name'][i] +" " + df['Product Name'][i+i])

In [None]:
storage_df

In [64]:
model = SentenceTransformer('all-MiniLM-L6-v2')
productname1 =[]
productname2 = []

#Sentences we like to encode
sentences = appliances_df[['Product Name']]
[productname1.append(i.split()) for i in sentences]
print(productname1)
# [productname2.append(i) for i in sentences]

# #Compute embedding for both lists
# embeddings1 = model.encode(productname1, convert_to_tensor=True)
# embeddings2 = model.encode(productname2, convert_to_tensor=True)

# #Compute cosine-similarities
# cosine_scores = util.cos_sim(embeddings1, embeddings2)

# #Output the pairs with their score
# for i in range(len(productname1)):
#       print("{} \t\t {} \t\t Score: {:.4f}".format(productname1[i], productname2[i], cosine_scores[i][i]))

[['Product', 'Name']]
