In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple, Union

In [2]:
# Root directory of the project. The data and output paths used further down
# are relative, so they are resolved against this directory.
# The location can be overridden with the ASSESSMENT_ROOT environment variable
# so the notebook is not tied to one machine; when the variable is unset the
# original path is used.
PROJECT_ROOT = os.environ.get('ASSESSMENT_ROOT', '/Users/prajwal/Developer/assessment')
os.chdir(PROJECT_ROOT)

In [3]:
# Read the three source tables (customers, products, transactions) from CSV
csv_paths = (
    "Zeotap/Data/Customers.csv",
    "Zeotap/Data/Products.csv",
    "Zeotap/Data/Transactions.csv",
)
customers, products, transactions = map(pd.read_csv, csv_paths)

In [4]:
# Display the summary information (columns, non-null counts, dtypes) of each dataset
for dataset in (customers, products, transactions):
    dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  


In [5]:
# Parse the date columns (loaded as plain strings) into datetimes
for frame, date_column in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [6]:
# Most recent transaction timestamp; used as the reference point for the
# Tenure and Recency features
transaction_dates = transactions['TransactionDate']
latest_date = transaction_dates.max()

# Feature Engineering

1. Customer Demographic Features

In [7]:
# Tenure: whole days between sign-up and the latest transaction in the data
time_since_signup = latest_date - customers['SignupDate']
customers['Tenure'] = time_since_signup.dt.days

In [8]:
# Find each customer's most recent transaction date and attach it to the
# customer table. The left join keeps customers without any transaction
# (their LastPurchaseDate is missing).
last_purchase = (
    transactions.groupby('CustomerID')['TransactionDate']
    .max()
    .rename('LastPurchaseDate')
    .reset_index()
)
customers = customers.merge(last_purchase, how='left', on='CustomerID')

In [9]:
# Preview the first five customers with the new Tenure / LastPurchaseDate columns
customers.head(n=5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,Tenure,LastPurchaseDate
0,C0001,Lawrence Carroll,South America,2022-07-10,902,2024-11-02 17:04:16
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1049,2024-12-03 01:41:41
2,C0003,Michael Rivera,South America,2024-03-07,296,2024-08-24 18:54:04
3,C0004,Kathleen Rodriguez,South America,2022-10-09,811,2024-12-23 14:13:52
4,C0005,Laura Weber,Asia,2022-08-15,866,2024-11-04 00:30:22


In [10]:
# Recency: whole days between the customer's last purchase and the latest
# transaction in the data
days_since_purchase = (latest_date - customers['LastPurchaseDate']).dt.days
# For customers with no transactions, fall back to their tenure
customers['Recency'] = days_since_purchase.fillna(customers['Tenure'])

In [11]:
# One-hot encode Region into Region_<name> indicator columns (replaces the
# original Region column)
customers = pd.get_dummies(
    customers,
    prefix='Region',
    columns=['Region'],
)
customers.head()

Unnamed: 0,CustomerID,CustomerName,SignupDate,Tenure,LastPurchaseDate,Recency,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,Lawrence Carroll,2022-07-10,902,2024-11-02 17:04:16,55.0,False,False,False,True
1,C0002,Elizabeth Lutz,2022-02-13,1049,2024-12-03 01:41:41,25.0,True,False,False,False
2,C0003,Michael Rivera,2024-03-07,296,2024-08-24 18:54:04,125.0,False,False,False,True
3,C0004,Kathleen Rodriguez,2022-10-09,811,2024-12-23 14:13:52,4.0,False,False,False,True
4,C0005,Laura Weber,2022-08-15,866,2024-11-04 00:30:22,54.0,True,False,False,False


2. Transaction Behavioral Features

In [12]:
# Merge datasets
merged_data = transactions.merge(products, on='ProductID', how='left')
merged_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [13]:
# Per-customer purchase aggregates: output column -> (source column, aggregation)
metric_specs = {
    'total_transactions': ('TransactionID', 'count'),
    'total_quantity': ('Quantity', 'sum'),
    'total_spent': ('TotalValue', 'sum'),
    'unique_products': ('ProductID', 'nunique'),
    'unique_categories': ('Category', 'nunique'),
}
per_customer = merged_data.groupby('CustomerID')
transaction_metrics = per_customer.agg(**metric_specs).reset_index()

transaction_metrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CustomerID          199 non-null    object 
 1   total_transactions  199 non-null    int64  
 2   total_quantity      199 non-null    int64  
 3   total_spent         199 non-null    float64
 4   unique_products     199 non-null    int64  
 5   unique_categories   199 non-null    int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 9.5+ KB


In [14]:
# Average spend per transaction for each customer
transaction_metrics['avg_transaction_value'] = transaction_metrics['total_spent'].div(
    transaction_metrics['total_transactions']
)

# Category preferences: total spend per customer and product category, one
# Category_<name> column per category, 0 where the customer bought nothing
category_spending = pd.pivot_table(
    merged_data,
    values='TotalValue',
    index='CustomerID',
    columns='Category',
    aggfunc='sum',
    fill_value=0,
).add_prefix('Category_')

3. Combine All Features

In [15]:
# Combine demographic, transaction and category features into one table.
# Left joins keep every customer, including those without transactions.
customer_features = (
    customers
    .merge(transaction_metrics, how='left', on='CustomerID')
    .merge(category_spending, how='left', on='CustomerID')
)
customer_features.head()

Unnamed: 0,CustomerID,CustomerName,SignupDate,Tenure,LastPurchaseDate,Recency,Region_Asia,Region_Europe,Region_North America,Region_South America,total_transactions,total_quantity,total_spent,unique_products,unique_categories,avg_transaction_value,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,Lawrence Carroll,2022-07-10,902,2024-11-02 17:04:16,55.0,False,False,False,True,5.0,12.0,3354.52,5.0,3.0,670.904,114.6,0.0,2827.3,412.62
1,C0002,Elizabeth Lutz,2022-02-13,1049,2024-12-03 01:41:41,25.0,True,False,False,False,4.0,10.0,1862.74,4.0,2.0,465.685,0.0,1025.46,0.0,837.28
2,C0003,Michael Rivera,2024-03-07,296,2024-08-24 18:54:04,125.0,False,False,False,True,4.0,14.0,2725.38,4.0,3.0,681.345,0.0,122.36,1385.2,1217.82
3,C0004,Kathleen Rodriguez,2022-10-09,811,2024-12-23 14:13:52,4.0,False,False,False,True,8.0,23.0,5354.88,8.0,3.0,669.36,1888.48,0.0,1355.74,2110.66
4,C0005,Laura Weber,2022-08-15,866,2024-11-04 00:30:22,54.0,True,False,False,False,3.0,7.0,2034.24,3.0,2.0,678.08,0.0,0.0,1180.38,853.86


In [16]:
# Count missing values per column after the left joins
customer_features.isna().sum()

CustomerID               0
CustomerName             0
SignupDate               0
Tenure                   0
LastPurchaseDate         1
Recency                  0
Region_Asia              0
Region_Europe            0
Region_North America     0
Region_South America     0
total_transactions       1
total_quantity           1
total_spent              1
unique_products          1
unique_categories        1
avg_transaction_value    1
Category_Books           1
Category_Clothing        1
Category_Electronics     1
Category_Home Decor      1
dtype: int64

In [17]:
# Customers with no transactions have no aggregates after the left joins;
# treat their counts and spend as zero
transaction_features = [
    'total_transactions',
    'total_quantity',
    'total_spent',
    'unique_products',
    'unique_categories',
    'avg_transaction_value',
]
for column_group in (transaction_features, category_spending.columns):
    customer_features[column_group] = customer_features[column_group].fillna(0)

4. Feature Scaling

In [18]:
# Columns to standardize: the two time-based features, the transaction
# aggregates and the per-category spend
features_to_scale = ['Tenure', 'Recency', *transaction_features, *category_spending.columns]
# Region indicator columns are passed through without scaling
region_features = [name for name in customer_features.columns if name.startswith('Region_')]

# Standardize the numerical features
scaler = StandardScaler()
numeric_block = customer_features[features_to_scale]
scaled_features = scaler.fit_transform(numeric_block)

# Final feature matrix: scaled numeric columns followed by the region columns
region_block = customer_features[region_features]
final_columns = features_to_scale + region_features
final_features = pd.DataFrame(
    np.hstack([scaled_features, region_block]),
    columns=final_columns,
)

In [19]:
# Summarize the final feature matrix (columns, non-null counts, dtypes)
final_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Tenure                 200 non-null    float64
 1   Recency                200 non-null    float64
 2   total_transactions     200 non-null    float64
 3   total_quantity         200 non-null    float64
 4   total_spent            200 non-null    float64
 5   unique_products        200 non-null    float64
 6   unique_categories      200 non-null    float64
 7   avg_transaction_value  200 non-null    float64
 8   Category_Books         200 non-null    float64
 9   Category_Clothing      200 non-null    float64
 10  Category_Electronics   200 non-null    float64
 11  Category_Home Decor    200 non-null    float64
 12  Region_Asia            200 non-null    float64
 13  Region_Europe          200 non-null    float64
 14  Region_North America   200 non-null    float64
 15  Region

6. Generate Recommendations

In [20]:
def get_lookalikes(customer_ids, similarity_matrix, all_customers, n=3):
    """Return the ``n`` most similar customers for each requested customer.

    Parameters
    ----------
    customer_ids : iterable
        IDs of the customers to find lookalikes for.
    similarity_matrix : indexable of iterables
        ``similarity_matrix[i]`` yields the similarity of ``all_customers[i]``
        to every customer, in the order of ``all_customers``.
    all_customers : sequence
        Customer IDs in the same order as the rows of ``similarity_matrix``.
    n : int, default 3
        Maximum number of lookalikes per customer. Values <= 0 give no
        lookalikes.

    Returns
    -------
    dict
        Maps every requested ID to a list of ``(customer_id, score)`` tuples
        ordered by descending score, with scores rounded to 4 decimals. IDs
        that are not present in ``all_customers`` map to an empty list.
    """
    # Position of the first occurrence of every ID, built once instead of
    # running a linear ``list.index`` scan for each target.
    row_of = {}
    for position, customer_id in enumerate(all_customers):
        row_of.setdefault(customer_id, position)

    limit = max(n, 0)
    lookalike_map = {}
    for target_id in customer_ids:
        idx = row_of.get(target_id)
        if idx is None:
            # Unknown customer: report no lookalikes rather than failing.
            lookalike_map[target_id] = []
            continue

        # Exclude the target by position. Skipping the first sorted entry is
        # not reliable: a customer whose score ties with the target's
        # self-similarity can sort ahead of it, which would list the target as
        # its own lookalike and drop a real neighbour.
        candidates = [
            (other, score)
            for other, score in enumerate(similarity_matrix[idx])
            if other != idx
        ]
        # Stable sort: equally scored customers keep their original order.
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        lookalike_map[target_id] = [
            (all_customers[other], round(score, 4))
            for other, score in candidates[:limit]
        ]
    return lookalike_map

In [21]:
# Pairwise cosine similarity between all customer feature vectors
similarity_matrix = cosine_similarity(final_features)

# final_features was built row-by-row from customer_features, so this ID list
# maps similarity-matrix positions back to customer IDs
all_customer_ids = list(customer_features['CustomerID'])

# Targets are the first 20 customers: C0001 .. C0020
target_customers = [f'C{str(i).zfill(4)}' for i in range(1, 21)]

lookalike_recommendations = get_lookalikes(
    customer_ids=target_customers,
    similarity_matrix=similarity_matrix,
    all_customers=all_customer_ids,
)

In [22]:
# Flatten the recommendations into one row per target customer with a fixed
# set of Lookalike1..3 ID/score columns.
output_data = []
for target in target_customers:
    # Slice into a new list so the lists stored in lookalike_recommendations
    # are never modified. Padding them in place would leave None entries
    # behind and make a second run of this cell fail when unpacking them.
    recommendations = lookalike_recommendations.get(target, [])[:3]
    row = {'CustomerID': target}
    for rank in range(1, 4):
        if rank <= len(recommendations):
            cust_id, score = recommendations[rank - 1]
        else:
            # Fewer than three lookalikes: leave the remaining slots blank
            cust_id, score = '', ''
        row[f'Lookalike{rank}_ID'] = cust_id
        row[f'Lookalike{rank}_Score'] = score
    output_data.append(row)


# Fix the column order of the exported table
lookalike_df = pd.DataFrame(output_data)
lookalike_df = lookalike_df[[
    'CustomerID',
    'Lookalike1_ID', 'Lookalike1_Score',
    'Lookalike2_ID', 'Lookalike2_Score', 
    'Lookalike3_ID', 'Lookalike3_Score'
]]


# Write the result as CSV, creating the output directory if needed
output_dir = 'Zeotap/Data/OutPut'
os.makedirs(output_dir, exist_ok=True)

lookalike_df.to_csv(f'{output_dir}/Enhanced_Lookalike.csv', index=False)