In [16]:

!pip install pandas openpyxl mlxtend
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

df = pd.read_excel('Online retail.xlsx')
df.head()


  



Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


# Data Preprocessing

In [17]:


# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# The first column holds each basket as one comma-separated string. It is
# addressed by position because its label depends on how the sheet was read.
basket_col = df.columns[0]

# Remove rows with no basket, then remove duplicate rows. Both steps look only
# at the raw basket column so this cell can be re-run: once `items` (a column
# of lists, which are unhashable) exists, a plain drop_duplicates() raises
# TypeError.
# NOTE(review): dropping duplicates collapses identical baskets into a single
# transaction, which changes every support value computed later - confirm
# this is intended.
df = df.dropna(subset=[basket_col]).drop_duplicates(subset=[basket_col])


def split_basket(raw):
    """Split one comma-separated basket string into a list of item names.

    Surrounding whitespace is stripped from every name and empty names are
    dropped, so " tea" and "tea" count as the same item.
    """
    return [name.strip() for name in str(raw).split(',') if name.strip()]


# Split the items in each transaction into a list
df['items'] = df[basket_col].apply(split_basket)

# Create a sorted list of all unique items
all_items = sorted({item for basket in df['items'] for item in basket})



Missing values in each column:
 shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil    0
dtype: int64


In [18]:
# Thresholds for the mining step, named so they are easy to find and tune.
MIN_SUPPORT = 0.01  # min_support passed to apriori
MIN_LIFT = 1.0      # min_threshold on the "lift" metric for association_rules

# Create a binary matrix: one row per transaction, one boolean column per item.
# Each basket is turned into a set so every membership test is a hash lookup,
# and the rows are read straight from the `items` column rather than through
# DataFrame.iterrows().
encoded_vals = []
for basket in df['items']:
    basket_items = set(basket)
    encoded_vals.append({item: (item in basket_items) for item in all_items})

# Convert list of dicts to DataFrame
encoded_df = pd.DataFrame(encoded_vals)

# Apply the Apriori algorithm
frequent_itemsets = apriori(encoded_df, min_support=MIN_SUPPORT, use_colnames=True)

# Generate the association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=MIN_LIFT)

# Display the rules
print(rules.head())


       antecedents      consequents  antecedent support  consequent support  \
0  (mineral water)        (almonds)            0.299710            0.029179   
1        (almonds)  (mineral water)            0.029179            0.299710   
2        (avocado)      (chocolate)            0.045797            0.205217   
3      (chocolate)        (avocado)            0.205217            0.045797   
4        (avocado)   (french fries)            0.045797            0.192657   

    support  confidence      lift  leverage  conviction  zhangs_metric  
0  0.010821    0.036106  1.237399  0.002076    1.007186       0.273962  
1  0.010821    0.370861  1.237399  0.002076    1.113092       0.197619  
2  0.010242    0.223629  1.089716  0.000843    1.023715       0.086281  
3  0.010242    0.049906  1.089716  0.000843    1.004325       0.103588  
4  0.011594    0.253165  1.314069  0.002771    1.081019       0.250476  


# Analysis and Interpretation

In [19]:
# Order the rules from highest to lowest confidence. `rules` is rebound so
# that later cells see the sorted frame.
rules = rules.sort_values('confidence', ascending=False)

# Show the first ten rows of the sorted frame
print("Top 10 association rules sorted by confidence:\n", rules.iloc[:10])


Top 10 association rules sorted by confidence:
                           antecedents      consequents  antecedent support  \
792                      (soup, milk)  (mineral water)            0.021449   
710  (frozen vegetables, ground beef)  (mineral water)            0.024541   
829                 (soup, spaghetti)  (mineral water)            0.020676   
763           (pancakes, ground beef)  (mineral water)            0.020870   
498              (chicken, chocolate)  (mineral water)            0.021256   
775                 (milk, olive oil)  (mineral water)            0.024155   
716  (frozen vegetables, ground beef)      (spaghetti)            0.024541   
599            (chocolate, olive oil)  (mineral water)            0.023575   
751               (milk, ground beef)  (mineral water)            0.031691   
666               (eggs, ground beef)  (mineral water)            0.028792   

     consequent support   support  confidence      lift  leverage  conviction  \
792         

In [21]:

# Print the first ten rows of `rules` (sorted by confidence in the previous
# cell) as short, readable text blocks.
for _, rule in rules.head(10).iterrows():
    # Each side of a rule is a set of item names, and sets have no guaranteed
    # iteration order. Sorting the names keeps the printed text identical on
    # every run.
    antecedents = ", ".join(sorted(rule['antecedents']))
    consequents = ", ".join(sorted(rule['consequents']))
    support = rule['support']
    confidence = rule['confidence']
    lift = rule['lift']
    print(f"Rule: {antecedents} -> {consequents}")
    print(f"Support: {support:.2f}, Confidence: {confidence:.2f}, Lift: {lift:.2f}")
    print()



Rule: soup, milk -> mineral water
Support: 0.01, Confidence: 0.58, Lift: 1.92

Rule: frozen vegetables, ground beef -> mineral water
Support: 0.01, Confidence: 0.54, Lift: 1.81

Rule: soup, spaghetti -> mineral water
Support: 0.01, Confidence: 0.52, Lift: 1.75

Rule: pancakes, ground beef -> mineral water
Support: 0.01, Confidence: 0.52, Lift: 1.73

Rule: chicken, chocolate -> mineral water
Support: 0.01, Confidence: 0.52, Lift: 1.73

Rule: milk, olive oil -> mineral water
Support: 0.01, Confidence: 0.51, Lift: 1.71

Rule: frozen vegetables, ground beef -> spaghetti
Support: 0.01, Confidence: 0.51, Lift: 2.23

Rule: chocolate, olive oil -> mineral water
Support: 0.01, Confidence: 0.51, Lift: 1.70

Rule: milk, ground beef -> mineral water
Support: 0.02, Confidence: 0.51, Lift: 1.69

Rule: eggs, ground beef -> mineral water
Support: 0.01, Confidence: 0.50, Lift: 1.68

