In [3]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the Customers, Products, and Transactions datasets

In [4]:
# Directory that holds the three input CSV files. Set the ECOMMERCE_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset, the original author's local folder is used so existing runs behave
# exactly as before.
DATA_DIR = Path(os.environ.get(
    'ECOMMERCE_DATA_DIR',
    r'C:\Users\hansi\OneDrive\Desktop\zeotap_assessment\ecommerce_analysis\data',
))

# Read each dataset from DATA_DIR so the location is defined in one place only.
customers = pd.read_csv(DATA_DIR / 'Customers.csv')
products = pd.read_csv(DATA_DIR / 'Products.csv')
transactions = pd.read_csv(DATA_DIR / 'Transactions.csv')

# Merge transactions and customers on the 'CustomerID' column

In [5]:
# Join every transaction row with its customer record. The join is an inner
# join (pandas' default), so only CustomerIDs present in both frames survive.
merged_data = transactions.merge(customers, how='inner', on='CustomerID')

# Aggregate transaction data by CustomerID

In [6]:
# Collapse the merged rows to one row per CustomerID, holding the summed
# TotalValue and summed Quantity; reset_index turns CustomerID back into a
# regular column.
customer_data = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
    )
    .reset_index()
)

# Scale data for similarity computation

In [7]:
# Learn the per-column mean and standard deviation of the two aggregate
# features, then apply that standardisation to the same rows.
scaler = StandardScaler().fit(customer_data[['TotalValue', 'Quantity']])
scaled_data = scaler.transform(customer_data[['TotalValue', 'Quantity']])

# Cosine similarity


In [8]:
# Pairwise cosine similarity between the rows of scaled_data: entry [i, j]
# compares row i with row j, so rows/columns follow the row order of
# customer_data.
# NOTE(review): with only two standardised features, cosine similarity compares
# direction only (not magnitude), which is presumably why many scores in the
# printed row below sit close to +1 or -1 — confirm this is the intended
# notion of "similar customer".
similarity_matrix = cosine_similarity(scaled_data)

# Display similarity for first customer


In [9]:
# Show the first row of the matrix: the first customer's similarity to every
# customer (including itself at position 0).
print(similarity_matrix[0, :])

[ 1.          0.80759386  0.0024998  -0.99635373  0.97398855 -0.18169128
  0.99576195 -0.99376286  0.96731259  0.56045711 -0.67852285 -0.9582175
 -0.8528416   0.95270289  0.96716372  0.59827785 -0.99990214 -0.97352068
  0.75410701  0.96857443 -0.86421264 -0.8317224  -0.63273453 -0.99799149
  0.89743541  0.68271277  0.65181286 -0.95009054  0.79643377  0.37716015
  0.82981037  0.9162629   0.9435917  -0.71806268  0.93833759  0.93040255
 -0.79028818  0.36279815 -0.99995382  0.52498453 -0.96828011  0.99982157
  0.92416407  0.94587546 -0.97403204 -0.92445033  0.33663176  0.05855626
 -0.35667406  0.98131842 -0.76918054  0.91863182 -0.9208913  -0.8915188
  0.73041165  0.90582798 -0.93358911  0.9684147  -0.87551968  0.94109063
  0.99328849  0.90307232  0.96446421 -0.56312358 -0.95191079  0.72688039
 -0.84523315 -0.96504969  0.98800217  0.86560076  0.91688914  0.30450726
  0.83893858  0.98298994 -0.99989291  0.7825412   0.86511753  0.98215591
 -0.18983147  0.9726206  -0.96399101 -0.89469069  0.9

# Function to find top 3 similar customers

In [10]:
def find_top_lookalikes(customer_id, similarity_matrix, customer_ids, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Args:
        customer_id: Identifier to look up; must occur in ``customer_ids``.
        similarity_matrix: Row-indexable matrix in which entry ``[i][j]`` is the
            similarity between ``customer_ids[i]`` and ``customer_ids[j]``.
        customer_ids: Sequence of identifiers, ordered like the matrix rows.
        top_n: Maximum number of lookalikes to return (defaults to 3).

    Returns:
        A list of ``(other_customer_id, score)`` tuples sorted by descending
        score. The queried customer itself is never included. Equal scores are
        ordered by their position in ``customer_ids``.

    Raises:
        ValueError: If ``customer_id`` is not in ``customer_ids`` or ``top_n``
            is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Accept any sequence (list, tuple, pandas Series, ...) of identifiers.
    customer_ids = list(customer_ids)
    try:
        idx = customer_ids.index(customer_id)
    except ValueError:
        raise ValueError(f"Unknown customer_id: {customer_id!r}") from None

    row = similarity_matrix[idx]
    # Sorting the negated scores with a stable sort yields descending scores
    # while keeping tied entries in their original (deterministic) order.
    ranked = np.argsort(-np.asarray(row, dtype=float), kind='stable')
    # Remove the customer by index rather than assuming it is ranked first:
    # another customer can tie with (or, through rounding, edge past) the
    # self-similarity, in which case slicing off rank 0 would drop a real
    # match and report the customer as its own lookalike.
    top_indices = [i for i in ranked if i != idx][:top_n]
    return [(customer_ids[i], row[i]) for i in top_indices]

# Example for CustomerID C0001

In [11]:
# Customer identifiers in the same order as the rows of customer_data, which
# is the row order the similarity matrix was computed from.
customer_ids = customer_data['CustomerID'].to_list()

# Look up the closest matches for one sample customer and show them.
lookalikes = find_top_lookalikes(
    customer_id='C0001',
    similarity_matrix=similarity_matrix,
    customer_ids=customer_ids,
)
print('Lookalikes for C0001:', lookalikes)

Lookalikes for C0001: [('C0085', np.float64(0.9999990504724361)), ('C0042', np.float64(0.9998215747742084)), ('C0089', np.float64(0.9997850140987701))]


In [12]:
# Map each of the first 20 customer IDs to its list of (lookalike_id, score)
# pairs.
lookalike_results = {
    customer_id: find_top_lookalikes(customer_id, similarity_matrix, customer_ids)
    for customer_id in customer_ids[:20]
}

# Save to CSV

In [14]:
import os
import csv

# Ensure the output directory exists
os.makedirs('output', exist_ok=True)

# Write the lookalike data to a CSV file.
# newline='' is required by the csv module: it lets the writer control line
# endings itself, which prevents an empty line after every row on Windows.
with open('output/Lookalike.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['CustomerID', 'Lookalikes'])
    for customer_id, matches in lookalike_results.items():
        # Cast scores to built-in floats so the cell reads
        # "[('C0085', 0.99...), ...]" instead of carrying NumPy's
        # "np.float64(...)" repr into the file.
        plain_matches = [(other_id, float(score)) for other_id, score in matches]
        writer.writerow([customer_id, plain_matches])