## Task 2 : Lookalike Model

### **Prerequisites**

In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Folder holding the three input CSV files (presumably a mounted Google Drive
# path in Colab — adjust this single constant to run against another location).
DATA_DIR = '/content/drive/MyDrive/ZeoTap'

# Raw input tables. Later cells read CustomerID/Region from customers_df,
# ProductID/Category from products_df, and CustomerID/ProductID from
# transactions_df.
customers_df = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products_df = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions_df = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

### **Data Preparation**

In [16]:
# Step 1: Prepare customer transaction-based profiles
# Attach product attributes (notably Category) to every transaction. The left
# merge keeps transactions whose ProductID is absent from products_df; their
# Category is NaN and they are left out of the counts below (groupby drops
# NaN keys by default).
transactions_merged = transactions_df.merge(products_df, on="ProductID", how="left")

# One row per customer, one column per product category; each value is the
# number of transaction rows that customer has in that category.
category_profiles = (
    transactions_merged
    .groupby(["CustomerID", "Category"])
    .size()
    .unstack(fill_value=0)
)

# One-hot encode the region. The default CountVectorizer analyzer splits text
# into lowercase word tokens, so multi-word regions such as "North America"
# and "South America" would share an "america" column and appear partially
# similar. A callable analyzer that returns the whole (stripped) region as a
# single token gives exactly one indicator column per distinct region.
vectorizer = CountVectorizer(analyzer=lambda region: [region.strip()])
region_encoded = vectorizer.fit_transform(customers_df["Region"])

# Build the region frame already indexed by CustomerID. Rows of region_encoded
# are in customers_df row order, so the IDs are attached positionally rather
# than through index-label alignment of a column assignment.
region_df = pd.DataFrame(
    region_encoded.toarray(),
    index=pd.Index(customers_df["CustomerID"], name="CustomerID"),
    columns=vectorizer.get_feature_names_out(),
)

# Merge region data with transaction profiles.
# NOTE: this is a left join on category_profiles, so customers with no
# transactions get no profile row; customers missing from customers_df get
# all-zero region columns via fillna(0).
customer_profiles = category_profiles.join(region_df, how="left").fillna(0)


### **Computing similarity matrix**

In [17]:
# Step 2: Pairwise cosine similarity between all customer profile rows
customer_ids = customer_profiles.index
similarity_matrix = cosine_similarity(customer_profiles)

# Label both axes with the profile index (CustomerID) so that scores can be
# looked up by customer ID instead of by row position.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

### **Extracting the results**

In [18]:
# Step 3: Extract top 3 similar customers for C0001–C0020
TOP_N = 3  # number of lookalikes reported per customer
top_customers = []
target_customers = [f"C{i:04d}" for i in range(1, 21)]

for cust_id in target_customers:
    # Customers without a row in the similarity matrix are skipped.
    if cust_id not in similarity_df.index:
        continue

    # Remove the customer's own entry by label before ranking. Slicing off the
    # first sorted element is not safe: another customer with an identical
    # profile (score 1.0) or floating-point rounding can sort ahead of the
    # self-similarity, which would report the customer as its own lookalike.
    top_similar = similarity_df[cust_id].drop(cust_id).nlargest(TOP_N)

    # Cast scores to built-in floats so the list written to the CSV reads as
    # plain numbers (NumPy >= 2 would otherwise render "np.float64(...)").
    lookalikes = [(other_id, float(score)) for other_id, score in top_similar.items()]
    top_customers.append((cust_id, lookalikes))

# Create Lookalike.csv from the results: one row per target customer, with the
# lookalikes column holding a list of (customer_id, score) pairs.
lookalike_data = {
    "cust_id": [target_id for target_id, _ in top_customers],
    "lookalikes": [pairs for _, pairs in top_customers],
}
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_csv_path = "Lookalike.csv"
lookalike_df.to_csv(lookalike_csv_path, index=False)

print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv
