In [44]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNBasic
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load the datasets
orders = pd.read_csv('../dataset/brazil/olist_orders_dataset.csv')
order_items = pd.read_csv('../dataset/brazil/olist_order_items_dataset.csv')
products = pd.read_csv('../dataset/brazil/olist_products_dataset.csv')
translations = pd.read_csv('../dataset/brazil/product_category_name_translation.csv')
review = pd.read_csv('../dataset/brazil/olist_order_reviews_dataset.csv')

# 1. Join tables
data = orders.merge(order_items, on='order_id')
data = data.merge(products, on='product_id')
data = data.merge(translations, on='product_category_name')
data = data.merge(review, on='order_id')

# 2. Translate product category names to English and filter orders with 2 or more items
order_counts = data['order_id'].value_counts()
multi_item_orders = order_counts[order_counts >= 2].index
filtered_data = data[data['order_id'].isin(multi_item_orders)]
filtered_data.columns


# 3. Collaborative filtering to build a recommendation system
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(filtered_data[['customer_id', 'product_id', 'review_score']], reader)
trainset = dataset.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# 4. Logic to predict the second purchased item
# Generating frequent itemsets and association rules
# ... rest of the code

# One-hot encode 'product_category_name_english' column
basket = (filtered_data.groupby(['order_id', 'product_category_name_english'])['order_item_id']
          .count().unstack().reset_index().fillna(0).set_index('order_id'))

# Convert the units to 1 hot encoded values
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

basket_sets = basket.applymap(encode_units)

# Apply apriori algorithm
frequent_itemsets = apriori(basket_sets, min_support=0.0001, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=0.01)

# get the list of unique product categories
category_list = filtered_data['product_category_name_english'].unique()

# For a given first purchased item, predict the second purchased item
first_item = "books_general_interest"
recommended_items = rules[rules['antecedents'].apply(lambda x: first_item in x)]
recommended_items = recommended_items.sort_values('confidence', ascending=False)

if not recommended_items.empty:
    second_item = recommended_items['consequents'].iloc[0]
    print(f'For the first purchased item: {first_item}, the recommended second purchased item is: {second_item}')
else:
    print(f'No recommendations available for the first purchased item: {first_item}')


Computing the cosine similarity matrix...
Done computing similarity matrix.
For the first purchased item: books_general_interest, the recommended second purchased item is: frozenset({'market_place'})




In [43]:
# make unique category list
category_list = filtered_data['product_category_name_english'].unique()
category_list


array(['housewares', 'cool_stuff', 'furniture_decor',
       'industry_commerce_and_business', 'bed_bath_table', 'food_drink',
       'computers_accessories', 'garden_tools', 'stationery',
       'watches_gifts', 'health_beauty',
       'kitchen_dining_laundry_garden_furniture', 'auto', 'perfumery',
       'drinks', 'sports_leisure', 'market_place', 'baby',
       'consoles_games', 'construction_tools_construction',
       'home_construction', 'small_appliances', 'books_general_interest',
       'electronics', 'dvds_blu_ray', 'home_confort', 'toys', 'pet_shop',
       'music', 'telephony', 'christmas_supplies', 'luggage_accessories',
       'home_comfort_2', 'fashion_bags_accessories',
       'musical_instruments', 'food', 'costruction_tools_garden', 'art',
       'furniture_living_room', 'construction_tools_lights',
       'furniture_bedroom', 'office_furniture', 'computers',
       'books_technical', 'flowers', 'arts_and_craftmanship',
       'air_conditioning', 'tablets_printing_ima