In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import fasttext
from gensim.models import Word2Vec
from collections import Counter



In [2]:
assortment_df = pd.read_csv("data/assortment.csv",
                            dtype={"product_id": str,
                                   "product_group": str,
                                   })

orders_df = pd.read_csv("data/orders.csv",
                        parse_dates=['datetime'])

In [3]:
# Filter orders with more than one item
multi_item_orders = orders_df.groupby('order_id').filter(lambda x: len(x) > 1)

# Create a list of concatenated product IDs for each order
order_product_lists = multi_item_orders.groupby('order_id')['product_id'].apply(lambda x: ' '.join(x)).tolist()

# Create a text file with concatenated product IDs (each line is an order)
with open('order_product_lists.txt', 'w') as f:
    for order_product_list in order_product_lists:
        f.write(order_product_list + '\n')


In [4]:
# Train FastText model on concatenated product IDs
model = fasttext.train_unsupervised(
    'order_product_lists.txt',
    model='skipgram',
    dim=5,
    loss='ns',  # Specify the loss function as 'ns' (negative sampling)
    lr=0.1,     # Learning rate
    word_ngrams=1,  # Adjust word n-grams if needed
    min_count=1,    # Minimum word frequency
    epoch=100,        # Number of training epochs
    bucket=200000,  # Number of buckets for hashing
    thread=4,       # Number of threads for training
    ws=5,           # Window size
    neg=5,          # Number of negative samples
    t=1e-4,         # Subsampling threshold
    verbose=1,      # Set to 0 for less output
    minn=0,         # Minimum length of char ngrams
    maxn=0,         # Maximum length of char ngrams
)

Read 0M words
Number of words:  1001
Number of labels: 0
Progress: 100.0% words/sec/thread: 1696742 lr:  0.000000 avg.loss:  2.694177 ETA:   0h 0m 0s


In [5]:
# Calculate total sales for each product
total_sales = orders_df.groupby('product_id')['order_id'].count().reset_index()
total_sales.rename(columns={'order_id': 'total_sales'}, inplace=True)
total_sales['product_id'] = total_sales['product_id'].astype(str)


# Calculate co-occurrence frequency of product pairs
def calculate_cooccurrence_frequency(orders_data):
    cooccurrence_matrix = pd.crosstab(orders_data['order_id'], orders_data['product_id'])
    cooccurrence_frequency = cooccurrence_matrix.T.dot(cooccurrence_matrix)
    return cooccurrence_frequency

cooccurrence_frequency = calculate_cooccurrence_frequency(orders_df)

In [6]:
#Find similar products
def find_similar_products(product_id, num_similar=30):
    #similar_products = model.wv.most_similar(product_id, topn=num_similar)
    similar_products = model.get_nearest_neighbors(product_id, k=num_similar)

    # Create a DataFrame to store the results
    similar_products_df = pd.DataFrame(similar_products, columns=['similarity_score','product_id'])
    
    # Merge total sales information
    similar_products_df = similar_products_df.merge(total_sales, on='product_id', how='left')
    
    # Merge product features from the assortment data
    similar_products_df = similar_products_df.merge(assortment_df, on='product_id', how='left')


    # Calculate co-occurrence frequency for each pair of products
    cooccurrence_frequency_pairs = []
    for similar_product_id in similar_products_df['product_id']:
        if product_id != similar_product_id:
            cooccurrence_frequency_pair = cooccurrence_frequency.loc[product_id, similar_product_id]
        else:
            cooccurrence_frequency_pair = 0
        cooccurrence_frequency_pairs.append(cooccurrence_frequency_pair)
    
    similar_products_df['cooccurrence_frequency'] = cooccurrence_frequency_pairs

    
    return similar_products_df



In [7]:
# Find similar products for a specific product (replace 'product_id' with the desired product)
similar_products = find_similar_products('32d18c3adab74f3')

similar_products

Unnamed: 0,similarity_score,product_id,total_sales,product_group,size_group,cluster,cooccurrence_frequency
0,0.995692,0bdacc4bb9c84f0,591,baby_toys,S,toys,3
1,0.990505,96f5fa1cd4ff4c7,582,dolls,M,toys,4
2,0.987995,53dae650df7042f,586,baby_toys,L,toys,1
3,0.985307,abdc306988a94b1,582,baby_toys,M,toys,3
4,0.983668,67a61e43d1bd4af,540,baby_toys,L,toys,1
5,0.983626,fec55667a72041a,622,board_games,S,toys,2
6,0.983508,8eaaa60a9bbb479,620,puzzles,S,toys,3
7,0.983185,36103fff52e640a,630,puzzles,L,toys,1
8,0.982332,c5959fd89a274bf,582,baby_toys,M,toys,4
9,0.981872,0a973320c70f4c2,603,baby_toys,L,toys,3


In [8]:
#Calculate embeddings for the most sold 100 items
most_sold_100_items = total_sales.nlargest(100, 'total_sales')['product_id']
most_sold_100_embeddings = [model[product_id] for product_id in most_sold_100_items]

#Calculate embeddings for all items
all_items = orders_df['product_id'].unique()
all_item_embeddings = [model[product_id] for product_id in all_items]

#Compute similarity scores
similarity_scores = cosine_similarity(most_sold_100_embeddings, all_item_embeddings)

#Find the most similar 250 items
num_similar_items = 250  # Adjust as needed
similar_items_indices = similarity_scores.argsort(axis=1)[:, :-num_similar_items-1:-1]
similar_items = [all_items[indices] for indices in similar_items_indices]

#Calculate the frequency of each item in the list of similar items
similar_items_flat = [item for sublist in similar_items for item in sublist if item not in most_sold_100_items ]
item_frequencies = Counter(similar_items_flat)

#Sort items by frequency in descending order
sorted_items_by_frequency = sorted(item_frequencies.items(), key=lambda x: x[1], reverse=True)

#Take the top 500 items and add them to the most sold 100 items list
top_500_items = [item[0] for item in sorted_items_by_frequency[:500]]
updated_most_sold_100_items = pd.concat([most_sold_100_items, pd.Series(top_500_items)])

# Display the updated list of most sold 100 items
print(updated_most_sold_100_items)

13     03b755d783fd4ec
397    62e3040b6322463
434    6b91277c742e439
222    36103fff52e640a
575    93210367755d441
            ...       
495    8957c218f8804aa
496    11fda8271e88476
497    5f3f35e723c4411
498    705c78ac2201440
499    d4db8e0c77864ae
Length: 600, dtype: object


In [9]:
# Define the file paths for saving the CSV files
most_sold_items_file = 'most_sold_items.csv'
remaining_items_file = 'remaining_items.csv'

# Create a DataFrame for the most sold items
most_sold_items_df = pd.DataFrame({'product_id': updated_most_sold_100_items})

# Merge the total sales information for the most sold items
most_sold_items_df = most_sold_items_df.merge(total_sales, on='product_id', how='left')

# Create a DataFrame for the remaining items (excluding most sold items)
remaining_items_df = total_sales[~total_sales['product_id'].isin(updated_most_sold_100_items)]  
remaining_items_df = pd.concat([remaining_items_df,total_sales[total_sales['product_id'].isin(most_sold_100_items)]], ignore_index=True)
 
# Save the most sold items (including total sales info) to a CSV file
most_sold_items_df.to_csv(most_sold_items_file, index=False)

# Save the remaining items to a CSV file
remaining_items_df.to_csv(remaining_items_file, index=False)

print(f"Most Sold Items (including Total Sales Info) saved to {most_sold_items_file}")
print(f"Remaining Items saved to {remaining_items_file}")

Most Sold Items (including Total Sales Info) saved to most_sold_items.csv
Remaining Items saved to remaining_items.csv


In [10]:
# Calculate the total sales of the items in the updated most sold items list
total_sales_most_sold_items = total_sales[total_sales['product_id'].isin(remaining_items_df["product_id"])]

# Calculate the total sales of all items in your dataset
total_sales_all_items = total_sales['total_sales'].sum()

# Calculate the sale percentage
sale_percentage = (total_sales_most_sold_items['total_sales'].sum() / total_sales_all_items) * 100

print(f"Sale Percentage of Remaining Items among All Sales: {sale_percentage:.2f}%")

Sale Percentage of Remaining Items among All Sales: 59.97%
