In [1]:
# Imports
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

In [2]:
# Step 1: Load Data
# All three input files live in one directory; change DATA_DIR to run the notebook
# somewhere other than a Colab session (where uploads land in /content).
DATA_DIR = "/content"

customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [3]:
# Step 2: Preprocess Data
# Build one wide table: each transaction row gains the columns of its customer
# (joined on CustomerID) and then of its product (joined on ProductID).
# merge() uses its default join type here, exactly as in a chained .merge() call.
merged_data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

In [4]:
# Step 3: Feature Engineering
# Collapse the merged transactions into a single profile row per customer.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        total_spend=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        avg_spend=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        transaction_count=pd.NamedAgg(column="TransactionID", aggfunc="count"),
        # [0] takes the first value mode() returns when several categories tie
        preferred_category=pd.NamedAgg(column="Category", aggfunc=lambda cats: cats.mode()[0]),
        region=pd.NamedAgg(column="Region", aggfunc="first"),
    )
    .reset_index()
)

# Numeric block: standardize spend and activity columns with StandardScaler
numerical_features = ["total_spend", "avg_spend", "transaction_count"]
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(customer_features[numerical_features])

# Categorical block: one-hot encode preferred category and region, then densify
categorical_features = ["preferred_category", "region"]
ohe = OneHotEncoder()
categorical_encoded = ohe.fit_transform(customer_features[categorical_features]).toarray()

# Final matrix: scaled numeric columns first, one-hot columns after; one row per customer
final_features = np.concatenate([numerical_scaled, categorical_encoded], axis=1)

In [5]:
# Step 4: Compute Similarity
# Compute cosine similarity between all customers.
# final_features has one row per row of customer_features, so similarity_matrix[i][j]
# compares the customers at row positions i and j; Step 5 indexes the matrix that way.
similarity_matrix = cosine_similarity(final_features)

In [6]:
# Step 5: Generate Lookalike Recommendations
# Maps each CustomerID to a list of its three most similar customers as
# (CustomerID, score) pairs, ordered from most to least similar.
customer_ids = customer_features["CustomerID"].tolist()  # row position -> CustomerID, looked up once
lookalike_map = {}
for idx, customer_id in enumerate(customer_ids):
    # Pair every other customer's row position with its similarity to the current
    # customer; the customer itself is left out.
    candidates = [(i, score) for i, score in enumerate(similarity_matrix[idx]) if i != idx]
    # Highest similarity first; the sort is stable, so tied scores keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Keep the top 3. Scores are cast to built-in float so that the list written to CSV
    # in Step 6 renders as plain numbers instead of NumPy scalar reprs
    # (e.g. "np.float64(0.97)" under NumPy >= 2).
    lookalike_map[customer_id] = [(customer_ids[i], float(score)) for i, score in candidates[:3]]

In [7]:
# Step 6: Save Results
# One output row per customer: the customer's ID and its list of lookalikes.
lookalike_rows = []
for cust_id, top_3 in lookalike_map.items():
    lookalike_rows.append({"cust_id": cust_id, "similar_customers": top_3})

lookalike_df = pd.DataFrame(lookalike_rows)
# Written next to the notebook, without the DataFrame index column.
lookalike_df.to_csv("Kanishk_Jha_Lookalike.csv", index=False)

In [8]:
# Step 7: Evaluation
# Check recommendations for the first 20 customers
def evaluate_lookalikes(customer_ids, lookalike_map):
    """Print the stored lookalikes for each requested customer.

    Args:
        customer_ids: Iterable of customer IDs to report on.
        lookalike_map: Mapping from customer ID to a list of
            (similar_customer_id, score) pairs. IDs missing from the
            mapping are reported with no lookalike lines.

    Returns:
        None. The report is written to standard output, with each score
        formatted to two decimal places and a blank gap after each customer.
    """
    for cust_id in customer_ids:
        print(f"Customer ID: {cust_id}", "Top 3 Lookalikes:", sep="\n")
        matches = lookalike_map.get(cust_id, [])
        for match in matches:
            similar_cust, score = match
            print(f"  {similar_cust} with similarity score: {score:.2f}")
        print("\n")

# Print the lookalikes for the customers in the first 20 rows of customer_features
evaluate_lookalikes(customer_features["CustomerID"].head(20), lookalike_map)

Customer ID: C0001
Top 3 Lookalikes:
  C0190 with similarity score: 0.97
  C0048 with similarity score: 0.94
  C0181 with similarity score: 0.91


Customer ID: C0002
Top 3 Lookalikes:
  C0088 with similarity score: 0.97
  C0134 with similarity score: 0.94
  C0106 with similarity score: 0.90


Customer ID: C0003
Top 3 Lookalikes:
  C0052 with similarity score: 0.98
  C0152 with similarity score: 0.93
  C0031 with similarity score: 0.89


Customer ID: C0004
Top 3 Lookalikes:
  C0165 with similarity score: 0.97
  C0155 with similarity score: 0.96
  C0169 with similarity score: 0.89


Customer ID: C0005
Top 3 Lookalikes:
  C0186 with similarity score: 0.98
  C0146 with similarity score: 0.96
  C0007 with similarity score: 0.90


Customer ID: C0006
Top 3 Lookalikes:
  C0168 with similarity score: 0.97
  C0171 with similarity score: 0.95
  C0187 with similarity score: 0.94


Customer ID: C0007
Top 3 Lookalikes:
  C0140 with similarity score: 0.98
  C0115 with similarity score: 0.93
  C0005 w