In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------- 1️⃣ LOAD DATA ---------------------- #
# Read the customer master data and the transaction log
customers = pd.read_csv("../data/Customers.csv")
transactions = pd.read_csv("../data/Transactions.csv")

# Convert transaction date to datetime
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# ---------------------- 2️⃣ MERGE CUSTOMER & TRANSACTION DATA ---------------------- #
# Aggregate transaction data per customer
customer_txn = transactions.groupby("CustomerID").agg(
    Total_Spend=('TotalValue', 'sum'),  # Total amount spent
    Avg_Spend=('TotalValue', 'mean'),   # Average spend per transaction
    Num_Transactions=('TransactionID', 'count'),  # Total transactions
    Total_Quantity=('Quantity', 'sum')  # Total products bought
).reset_index()

# Numeric features that drive the similarity computation
features = ['Total_Spend', 'Avg_Spend', 'Num_Transactions', 'Total_Quantity']

# Left merge keeps customers that have no transactions; their aggregates come
# back as NaN. Fill only those aggregate columns with 0 -- a blanket fillna(0)
# would also overwrite missing customer attributes with a meaningless 0.
df = pd.merge(customers, customer_txn, on="CustomerID", how="left")
df[features] = df[features].fillna(0)

# Key every row by CustomerID. Everything downstream labels its rows with
# df.index, so without this the similarity matrix (and the exported lookalike
# file) would carry positional row numbers instead of customer IDs.
df = df.set_index("CustomerID")

# ---------------------- 3️⃣ FEATURE ENGINEERING ---------------------- #
# Drop non-numeric columns (like CustomerName) before similarity calculation
df = df.drop(columns=['CustomerName'])

# Encode categorical data (Region) using one-hot encoding
# NOTE(review): the resulting Region_* columns are not listed in `features`,
# so they currently have no influence on the similarity scores.
df = pd.get_dummies(df, columns=['Region'], drop_first=True)

# Select only the numeric transaction features
df_numeric = df[features]

# Standardise the features so that no single one dominates the cosine similarity
scaler = StandardScaler()
df_numeric = scaler.fit_transform(df_numeric)

# Convert back to DataFrame, restoring the CustomerID labels
df_numeric = pd.DataFrame(df_numeric, index=df.index, columns=features)

# ---------------------- 4️⃣ COMPUTE SIMILARITY ---------------------- #
# Pairwise cosine similarity between all customers
similarity_matrix = cosine_similarity(df_numeric)

# Wrap the matrix in a DataFrame labelled by CustomerID on both axes
similarity_df = pd.DataFrame(similarity_matrix, index=df.index, columns=df.index)

# ---------------------- 5️⃣ FIND TOP 3 LOOKALIKES ---------------------- #
# Function to get top 3 similar customers
def get_top_lookalikes(customer_id, top_n=3, similarity=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Label of the customer to look up; it must be one of the
            labels of the similarity matrix.
        top_n: Maximum number of lookalikes to return.
        similarity: Square DataFrame of pairwise similarity scores whose index
            and columns carry the same labels. Defaults to the module-level
            ``similarity_df``.

    Returns:
        A list of ``[lookalike_id, score]`` pairs ordered from most to least
        similar, or an empty list when ``customer_id`` is not in the matrix.
    """
    if similarity is None:
        similarity = similarity_df
    if customer_id not in similarity.index:
        return []  # Unknown customer

    # Remove the customer's own entry before ranking the rest.
    sim_scores = similarity[customer_id].drop(customer_id)
    top = sim_scores.nlargest(top_n)

    # Build the pairs from the index and the values separately. Going through
    # ``reset_index().values`` would put labels and scores into one array and
    # silently turn integer labels into floats.
    return [[label, score] for label, score in zip(top.index.tolist(), top.tolist())]

# Look up the lookalikes for the first 20 rows of df, keyed by index label
lookalike_dict = {
    customer_id: get_top_lookalikes(customer_id)
    for customer_id in df.index[:20]
}

# ---------------------- 6️⃣ SAVE OUTPUT TO CSV ---------------------- #
# Flatten the mapping into one [customer, lookalike, score] row per match,
# with the score rounded to four decimal places
lookalike_list = []
for source_id, matches in lookalike_dict.items():
    lookalike_list.extend(
        [source_id, match_id, round(match_score, 4)]
        for match_id, match_score in matches
    )

lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=["CustomerID", "LookalikeID", "SimilarityScore"],
)
lookalike_df.to_csv("../output/Karan_Punwatkar_Lookalike.csv", index=False)

print("✅ Lookalike Model Completed! File saved as Karan_Punwatkar_Lookalike.csv")


✅ Lookalike Model Completed! File saved as Karan_Punwatkar_Lookalike.csv
