# Lookalike Model

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from datasketch import MinHash, MinHashLSHForest

In [2]:
# Folder that holds the input CSV files.
data_path = "data/"

In [3]:
# Read the three input tables from the data folder (customers, products, transactions — in that order).
Customers_df, Products_df, Transactions_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [4]:
# Preview the first five customer rows to check which columns were loaded.
Customers_df.head(n=5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [5]:
# Join every transaction with its customer attributes and its product attributes.
# validate="many_to_one" makes pandas raise a MergeError if Customers_df or
# Products_df contains a repeated key, instead of silently duplicating
# transaction rows (which would inflate the per-customer Quantity/TotalValue
# sums computed from this table).
Merged = pd.merge(Transactions_df, Customers_df, on="CustomerID", validate="many_to_one")
Merged = pd.merge(Merged, Products_df, on="ProductID", validate="many_to_one")

# Keep a copy of the joined table on disk.
Merged.to_csv("Merged_data.csv", index=False)

## Data Preprocessing

In [7]:
# Relevant features for modeling
data = Merged[["CustomerID", "Region", "Category", "Quantity", "TotalValue"]].copy()

# Replace the text labels with integer category codes.
# Plain column assignment is used so each column takes the integer dtype of the
# codes; assigning through `.loc[:, col]` writes into the existing object column
# in pandas 2.x and leaves the dtype as object.
data["Region"] = data["Region"].astype("category").cat.codes
data["Category"] = data["Category"].astype("category").cat.codes

# Aggregate to one row per CustomerID.
# NOTE(review): "Category" holds arbitrary nominal codes, so their mean has no
# natural interpretation — consider the customer's most frequent category instead.
aggregated = (
    data.groupby("CustomerID")
    .agg({"Region": "first", "Category": "mean", "Quantity": "sum", "TotalValue": "sum"})
    .reset_index()
)

# Save the aggregated data
aggregated.to_csv("Final_data.csv", index=False)


## Train the model

In [9]:
# Create the empty LSH forest. It is built with num_perm=126, the same value
# create_minhash uses for the signatures that are added to it.
lsh = MinHashLSHForest(num_perm=126)

def create_minhash(row, num_perm=126):
    """Build a MinHash signature from one customer's feature values.

    Each value is hashed as the token "<position>:<value>" rather than as the
    bare value. MinHash sketches a *set*, so without the position prefix equal
    values coming from different features (for example a region code of 3 and
    a quantity of 3) would collapse into a single element and count as a match
    across unrelated features.

    Args:
        row: Iterable of feature values; callers must always pass the features
            in the same order so positions line up between customers.
        num_perm: Number of permutations for the signature. The default (126)
            is the value the LSH forest in this notebook is created with.

    Returns:
        A datasketch ``MinHash`` updated with one token per feature value.
    """
    m = MinHash(num_perm=num_perm)
    for position, value in enumerate(row):
        m.update(f"{position}:{value}".encode("utf-8"))
    return m

# Build each customer's signature from the named feature columns only.
# Selecting by name (instead of the positional `row[1:]`) keeps the result the
# same when this cell is re-run: by then `aggregated` already carries a
# "MinHash" column, which a positional slice would feed back into the hash.
feature_columns = ["Region", "Category", "Quantity", "TotalValue"]
aggregated["MinHash"] = aggregated.apply(
    lambda row: create_minhash(row[feature_columns].values), axis=1
)

# Add every customer's signature to the LSH forest, keyed by its CustomerID.
for customer_id, minhash in zip(aggregated["CustomerID"], aggregated["MinHash"]):
    lsh.add(customer_id, minhash)

# Per the datasketch documentation, added keys are only searchable after index().
lsh.index()

# Persist the indexed forest so it can be reloaded without rebuilding.
with open("lookalike_model.pkl", "wb") as f:
    pickle.dump(lsh, f)


## Finding Top 3 similar customers

In [11]:
# Reload the persisted LSH forest from disk.
# NOTE: unpickling can execute arbitrary code; only load a file this notebook wrote.
with open("lookalike_model.pkl", "rb") as model_file:
    lsh = pickle.load(model_file)

In [12]:
def fun_lookalikes(customer_id, lsh, top_n=3):
    """Return up to ``top_n`` customers similar to ``customer_id``.

    The customer's stored signature is read from the module-level
    ``aggregated`` frame (columns "CustomerID" and "MinHash") and used to
    query ``lsh``. The forest built in this notebook also contains the queried
    customer, so one extra neighbour is requested and the customer's own ID is
    removed; otherwise a customer would be reported as its own lookalike.

    Args:
        customer_id: CustomerID to find lookalikes for.
        lsh: Object exposing ``query(minhash, k)`` that returns a list of keys.
        top_n: Maximum number of other customers to return.

    Returns:
        A list of at most ``top_n`` CustomerIDs, never containing
        ``customer_id``. The list is returned in the order given by
        ``lsh.query``; no similarity ordering is applied here. An empty list
        is returned when ``customer_id`` is not present in ``aggregated``.
    """
    customer_row = aggregated[aggregated["CustomerID"] == customer_id]
    if customer_row.empty:
        return []
    minhash = customer_row["MinHash"].values[0]
    # Ask for one more than needed to leave room for dropping the customer itself.
    neighbours = lsh.query(minhash, top_n + 1)
    return [other_id for other_id in neighbours if other_id != customer_id][:top_n]

In [13]:
# Number of lookalikes to keep per customer.
top_n = 3

# CustomerID -> MinHash signature, used to score each candidate.
minhash_by_customer = dict(zip(aggregated["CustomerID"], aggregated["MinHash"]))

# Maps each of the first 20 customers to a list of (similar CustomerID, score)
# pairs, best match first.
lookalike = {}
for customer_id in aggregated["CustomerID"][:20]:
    query_minhash = minhash_by_customer[customer_id]
    # Request one extra candidate: the forest may hand back the customer itself,
    # which is dropped below.
    candidates = fun_lookalikes(customer_id, lsh, top_n=top_n + 1)
    # Score with the MinHash Jaccard estimate. The forest returns candidates in
    # no particular order, so a score derived from list position would not
    # reflect similarity.
    scored = [
        (candidate_id, query_minhash.jaccard(minhash_by_customer[candidate_id]))
        for candidate_id in candidates
        if candidate_id != customer_id
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    lookalike[customer_id] = scored[:top_n]

In [16]:
# Flatten the per-customer lists into one record per (customer, lookalike) pair.
output = [
    {"CustomerID": cust_id, "SimilarCustomerID": sim_cust, "Score": score}
    for cust_id, similar_list in lookalike.items()
    for sim_cust, score in similar_list
]

# Write the flattened records to the submission CSV.
lookalike_df = pd.DataFrame(output)
lookalike_df.to_csv("Lakshmi_Mohan_Lookalike.csv", index=False)