In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Load the three input tables from CSV files in the current working directory.
# products_df is loaded but not referenced by the cells visible in this notebook.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [22]:
# Convert the date columns to datetime; values that cannot be parsed become
# NaT (errors="coerce") instead of raising.
customers_df["SignupDate"] = pd.to_datetime(
    customers_df["SignupDate"],
    errors="coerce",
)
transactions_df["TransactionDate"] = pd.to_datetime(
    transactions_df["TransactionDate"],
    errors="coerce",
)

In [24]:
# One row per CustomerID: total and mean TotalValue, number of transactions,
# and number of distinct products. as_index=False keeps CustomerID as a
# regular column rather than the index.
customer_transactions = (
    transactions_df
    .groupby("CustomerID", as_index=False)
    .agg(
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        avg_spent=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        num_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
        distinct_products=pd.NamedAgg(column="ProductID", aggfunc="nunique"),
    )
)

In [8]:
# Left join: every customer is kept; customers without any transaction get
# NaN in the aggregate columns.
customer_features = pd.merge(
    customers_df,
    customer_transactions,
    how="left",
    on="CustomerID",
)

In [9]:
# Replace every missing value in the frame with 0 (this covers the aggregate
# columns of customers that had no transactions in the left join).
customer_features = customer_features.fillna(0)

In [10]:
# One-hot encode Region; drop_first=True omits the indicator column of the
# first category.
customer_features = pd.get_dummies(
    customer_features,
    columns=["Region"],
    drop_first=True,
)

In [11]:
# Standardise the feature columns; the identifier, name and signup-date
# columns are excluded from the feature matrix.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customer_features.drop(
        columns=["CustomerID", "CustomerName", "SignupDate"],
    )
)

In [12]:
# Pairwise cosine similarity between all rows of the scaled feature matrix.
similarity_matrix = cosine_similarity(X=scaled_features)

In [13]:
# Label both axes of the similarity matrix with CustomerID so scores can be
# looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)

In [25]:
def get_top_similar_customers(customer_id, top_n=3, similarity=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up in the similarity matrix.
    top_n : int, default 3
        Maximum number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Similarity matrix labelled by customer ID on both axes. When omitted,
        the notebook-level ``similarity_df`` is used.

    Returns
    -------
    list of tuple
        ``(other_customer_id, score)`` pairs ordered from highest to lowest
        score. Empty when ``customer_id`` is not in the matrix or ``top_n``
        is not positive.
    """
    if similarity is None:
        similarity = similarity_df
    if top_n <= 0:
        return []
    if customer_id not in similarity.index or customer_id not in similarity.columns:
        return []

    # Remove the customer's own entry by label. Skipping "the first row after
    # sorting" is wrong when another customer ties with (or, through
    # floating-point rounding, exceeds) the self-similarity score.
    scores = similarity[customer_id].drop(labels=customer_id, errors="ignore")

    # A stable sort keeps tied scores in their original order, so the result
    # is deterministic.
    ranked = scores.sort_values(ascending=False, kind="mergesort").head(top_n)
    return list(zip(ranked.index, ranked.values))

In [15]:
# Map each of the first 20 customers to its list of (lookalike_id, score) pairs.
lookalike_dict = {
    customer_id: get_top_similar_customers(customer_id)
    for customer_id in customer_features["CustomerID"].iloc[:20]
}

In [18]:
# Flatten each customer's (lookalike_id, score) pairs into a single row:
# [CustomerID, id1, score1, id2, score2, id3, score3].
lookalike_list = []
for source_id, matches in lookalike_dict.items():
    row = [source_id] + [field for match in matches for field in match]
    # Pad with empty id/score pairs until the row holds at least 7 values
    # (CustomerID + 3 pairs).
    while len(row) < 7:
        row += ["", ""]
    lookalike_list.append(row)

In [19]:
# Tabulate the flattened rows: one row per customer, three lookalike/score pairs.
lookalike_df = pd.DataFrame(
    data=lookalike_list,
    columns=[
        "CustomerID",
        "Lookalike1", "Score1",
        "Lookalike2", "Score2",
        "Lookalike3", "Score3",
    ],
)

In [33]:
# Write the result to the current working directory, the same location the
# input CSVs are read from. The previous absolute path under C:/Users/DELL
# existed only on one machine, so the export failed everywhere else.
lookalike_csv_path = 'Hrishikesh_Jaiswal_Lookalike.csv'
lookalike_df.to_csv(lookalike_csv_path, index=False)

# Preview the first rows of the exported table.
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0137,0.999922,C0152,0.999841,C0107,0.988338
1,C0002,C0142,0.976174,C0043,0.97109,C0128,0.918808
2,C0003,C0133,0.995125,C0052,0.96715,C0112,0.949261
3,C0004,C0108,0.981744,C0113,0.979237,C0155,0.968232
4,C0005,C0159,0.999513,C0123,0.984038,C0186,0.977956
