In [43]:
# C1B Perform the appropriate encoding method (i.e., ordinal, label encoding, one-hot encoding) for each variable selected in part C1a.

import pandas as pd

# load data
file_path = "Megastore Dataset.csv"
df = pd.read_csv(file_path)

# remove dupes
df = df.drop_duplicates()

# drop rows with missing values in key fields
df = df.dropna(subset=["OrderID", "ProductName"])

# standardize category fields
for c in ["OrderPriority", "CustomerOrderSatisfaction", "Region", "Segment"]:
    df[c] = df[c].astype(str).str.strip()

# df.head(30)


In [44]:
# encoded version
df_encoded = df.copy()

# ================== ordinal encoding ==================

# orderpriority: Low < Medium < High
priority_mapping = {"Low": 1, "Medium": 2 ,"High": 3}

# add mapped column
df_encoded["OrderPriority_Encoded"] = df_encoded["OrderPriority"].map(priority_mapping)


# customerordersatisfaction: Very Dissatisfied -> Very Satisfied
satisfaction_mapping = {
    "Very Dissatisfied": 1
    ,"Dissatisfied": 2
    ,"Satisfied": 3
    ,"Very Satisfied": 4
    ,"Prefer not to answer": None
}

# add mapped column
df_encoded["CustomerOrderSatisfaction_Encoded"] = df_encoded["CustomerOrderSatisfaction"].map(satisfaction_mapping)
# df_encoded

# ================== one-hot encoding ==================

df_encoded = pd.get_dummies(
    df_encoded
    ,columns=["Region", "Segment"]
    ,prefix=["Region", "Segment"]
    ,dtype=int
    )

# preview of original/encoded columns
print(df_encoded.head(8)[[
    "OrderPriority", "OrderPriority_Encoded",
    "CustomerOrderSatisfaction", "CustomerOrderSatisfaction_Encoded",
    "Region_Northeast", "Region_Southeast",
    "Segment_Consumer", "Segment_Corporate"
]])



  OrderPriority  OrderPriority_Encoded CustomerOrderSatisfaction  \
0          High                      3                 Satisfied   
1          High                      3                 Satisfied   
2          High                      3                 Satisfied   
3          High                      3                 Satisfied   
4          High                      3                 Satisfied   
5          High                      3                 Satisfied   
6          High                      3                 Satisfied   
7          High                      3                 Satisfied   

   CustomerOrderSatisfaction_Encoded  Region_Northeast  Region_Southeast  \
0                                3.0                 1                 0   
1                                3.0                 1                 0   
2                                3.0                 1                 0   
3                                3.0                 1                 0   
4      

In [42]:
# C1D Export the dataset that includes all encoded variables.
df_encoded.to_csv("Megastore_Encoded.csv", index=False)
print("Exported Megastore_Encoded to csv!")

Exported Megastore_Encoded to csv!


In [26]:
df_encoded.head()

Unnamed: 0,OrderID,ProductName,Quantity,InvoiceDate,UnitPrice,TotalCost,Country,DiscountApplied,OrderPriority,ExpeditedShipping,PaymentMethod,CustomerOrderSatisfaction,OrderPriority_Encoded,CustomerOrderSatisfaction_Encoded,Region_Northeast,Region_Southeast,Segment_Consumer,Segment_Corporate
0,536370,INFLATABLE POLITICAL GLOBE,48,12/1/2010 8:45,$0.85,$40.80,United States,Yes,High,Yes,Credit Card,Satisfied,3,3.0,1,0,0,1
1,536370,SET2 RED RETROSPOT TEA TOWELS,18,12/1/2010 8:45,$2.95,$53.10,United States,Yes,High,Yes,Credit Card,Satisfied,3,3.0,1,0,0,1
2,536370,PANDA AND BUNNIES STICKER SHEET,12,12/1/2010 8:45,$0.85,$10.20,United States,Yes,High,Yes,Credit Card,Satisfied,3,3.0,1,0,0,1
3,536370,RED TOADSTOOL LED NIGHT LIGHT,24,12/1/2010 8:45,$1.65,$39.60,United States,Yes,High,Yes,Credit Card,Satisfied,3,3.0,1,0,0,1
4,536370,VINTAGE HEADS AND TAILS CARD GAME,24,12/1/2010 8:45,$1.25,$30.00,United States,Yes,High,Yes,Credit Card,Satisfied,3,3.0,1,0,0,1


In [37]:
# ================== 
# Section C2: Perform a market basket analysis by completing the following:
# ==================

# a.  Transactionalize the dataset with only the relevant variables for market basket analysis.
tx = df_encoded[["OrderID", "ProductName"]].dropna().drop_duplicates()

print("Transactionalized Data:", tx.shape) # (rows, columns)
print(tx.head())


# b.  Export the transactionalized dataset for market basket analysis with only the relevant variables.
tx.to_csv("Megastore_Transactions.csv", index=False)
print("Transactions dataset saved!")


# c.  Execute the error-free code used to generate association rules with the Apriori algorithm. Provide a screenshot of the top three rules generated by the Apriori algorithm sorted by your chosen metric (i.e., confidence, support, or lift). 
basket = pd.crosstab(tx["OrderID"], tx["ProductName"]).astype(int)

print("Basket shape (orders x products):", basket.shape)
basket.head()

Transactionalized Data: (8197, 2)
   OrderID                         ProductName
0   536370         INFLATABLE POLITICAL GLOBE 
1   536370      SET2 RED RETROSPOT TEA TOWELS 
2   536370     PANDA AND BUNNIES STICKER SHEET
3   536370       RED TOADSTOOL LED NIGHT LIGHT
4   536370  VINTAGE HEADS AND TAILS CARD GAME 
Transactions dataset saved!
Basket shape (orders x products): (441, 1562)


ProductName,50S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,NINE DRAWER OFFICE TIDY,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,TRELLIS COAT RACK,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR TLIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC TLIGHT HOLDER STAR LARGE,ZINC TLIGHT HOLDER STARS SMALL
OrderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# run Apriori algo
from mlxtend.frequent_patterns import apriori, association_rules

# convert basket to T/F so algo knows whether an item is present in each order
basket_bool = basket > 0

# use Apriori algo to find combos of products that appear in at least 1% of orders 
frequent_itemsets = apriori(basket_bool, min_support=0.01, use_colnames=True)

# generate association rules (If item a is bought, item b is likely bought too)
# only keeping rules with a lift of at least 1.0
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# sort rules starting from strongest first
rules = rules.sort_values("lift", ascending=False)

# display top 3 rules with the highest lift
# include support and confidence
top3 = rules[["antecedents", "consequents", "support", "confidence", "lift"]].head(3).reset_index(drop=True)
print("=== TOP 3 RULES (Sorted by Lift) ===")
print(top3)
print(top3.to_string(index=False))

=== TOP 3 RULES (Sorted by Lift) ===
                                         antecedents  \
0  (ALARM CLOCK BAKELIKE IVORY, PLASTERS IN TIN S...   
1  (ALARM CLOCK BAKELIKE IVORY, ALARM CLOCK BAKEL...   
2  (SKULL LUNCH BOX WITH CUTLERY , ALARM CLOCK BA...   

                                         consequents   support  confidence  \
0  (PLASTERS IN TIN CIRCUS PARADE , ALARM CLOCK B...  0.011338         1.0   
1  (CHARLOTTE BAG DOLLY GIRL DESIGN, PLASTERS IN ...  0.011338         1.0   
2  (LUNCH BOX WITH CUTLERY RETROSPOT , PLASTERS I...  0.011338         1.0   

   lift  
0  88.2  
1  88.2  
2  88.2  
                                                                                                    antecedents                                                                                                                                         consequents  support  confidence  lift
                                                         (ALARM CLOCK BAKELIKE IVORY, PLASTERS IN 