In [3]:
import pandas as pd

# Read the three raw CSV tables (customers, products, transactions) in one pass.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Preview the first rows of every table as a single tuple.
tuple(frame.head() for frame in (customers_df, products_df, transactions_df))


(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

In [5]:
# Preprocessing the data

# Parse the date columns into pandas datetimes.
customers_df['SignupDate'] = customers_df['SignupDate'].pipe(pd.to_datetime)
transactions_df['TransactionDate'] = transactions_df['TransactionDate'].pipe(pd.to_datetime)

# Attach customer details first, then product details, to every transaction.
# Both the transaction and product tables carry a Price column, so the result
# holds them as Price_x / Price_y.
transactions_full_df = pd.merge(
    pd.merge(transactions_df, customers_df, on='CustomerID'),
    products_df,
    on='ProductID',
)
transactions_full_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [6]:
# Restrict to the customers whose IDs are C0001 .. C0020.
target_customers = customers_df.loc[
    customers_df['CustomerID'].isin([f"C{str(i).zfill(4)}" for i in range(1, 21)])
]

# One row per customer: total spend plus the full list of purchased
# products and of their categories (one entry per transaction).
customer_summary = transactions_full_df.groupby('CustomerID').agg(
    TotalSpent=('TotalValue', 'sum'),
    ProductID=('ProductID', lambda ids: list(ids)),
    Category=('Category', lambda cats: list(cats)),
)

# Left merge so every target customer keeps a row.
customer_profile_df = pd.merge(
    target_customers, customer_summary, on='CustomerID', how='left'
)

customer_profile_df.head()


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TotalSpent,ProductID,Category
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,"[P054, P022, P096, P083, P029]","[Books, Home Decor, Electronics, Electronics, ..."
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,"[P095, P004, P019, P071]","[Home Decor, Home Decor, Clothing, Clothing]"
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,"[P025, P006, P035, P002]","[Home Decor, Home Decor, Clothing, Electronics]"
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,"[P049, P053, P038, P025, P097, P024, P008, P077]","[Books, Home Decor, Home Decor, Home Decor, Bo..."
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,"[P025, P039, P012]","[Home Decor, Electronics, Electronics]"


In [7]:
# Preprocessing the data
# NOTE(review): this cell repeats the date conversion, merge and per-customer
# summary already performed by the earlier preprocessing cells; it is kept
# runnable on its own, but one of the copies should eventually be removed.

# Convert necessary columns to appropriate type
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

# Merge datasets: customer details, then product details, onto each transaction.
# Transactions and products both have a Price column, so pandas keeps them as
# Price_x / Price_y in the merged frame.
transactions_full_df = transactions_df.merge(customers_df, on='CustomerID').merge(products_df, on='ProductID')

# taking 20 customers (IDs C0001 .. C0020)
target_customers = customers_df[customers_df['CustomerID'].isin([f"C{str(i).zfill(4)}" for i in range(1, 21)])]


# Per-customer summary: total spend and the lists of purchased products/categories.
customer_summary = transactions_full_df.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'ProductID': lambda x: list(x),
    'Category': lambda x: list(x)
}).rename(columns={'TotalValue': 'TotalSpent'})

# Merge summary. The left merge keeps target customers that have no
# transactions, but leaves NaN in the summary columns for them; normalise
# those to 0.0 spend and empty lists so later string joins and arithmetic on
# the profile do not hit NaN.
customer_profile_df = (
    target_customers
    .merge(customer_summary, on='CustomerID', how='left')
    .assign(
        TotalSpent=lambda profile: profile['TotalSpent'].fillna(0.0),
        ProductID=lambda profile: profile['ProductID'].map(
            lambda value: value if isinstance(value, list) else []
        ),
        Category=lambda profile: profile['Category'].map(
            lambda value: value if isinstance(value, list) else []
        ),
    )
)

customer_profile_df.head()


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TotalSpent,ProductID,Category
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,"[P054, P022, P096, P083, P029]","[Books, Home Decor, Electronics, Electronics, ..."
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,"[P095, P004, P019, P071]","[Home Decor, Home Decor, Clothing, Clothing]"
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,"[P025, P006, P035, P002]","[Home Decor, Home Decor, Clothing, Electronics]"
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,"[P049, P053, P038, P025, P097, P024, P008, P077]","[Books, Home Decor, Home Decor, Home Decor, Bo..."
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,"[P025, P039, P012]","[Home Decor, Electronics, Electronics]"


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Helper functions 
def compute_categorical_similarity(cat_list_1, cat_list_2):
    """Cosine similarity between two customers' purchased-category counts.

    Every distinct category label is one dimension whose value is the number
    of times the label occurs in the list. Labels are compared as whole
    strings, so a multi-word label such as 'Home Decor' counts once per
    purchase instead of being split into separate word tokens.

    Args:
        cat_list_1: Iterable of category labels for the first customer.
            ``None`` or a float NaN placeholder is treated as "no purchases".
        cat_list_2: Same, for the second customer.

    Returns:
        float: Similarity in [0.0, 1.0]; 0.0 when either customer has no
        categories at all.
    """
    # Function-scope imports keep the helper self-contained in its cell.
    from collections import Counter
    from math import sqrt

    def _category_counts(categories):
        # A missing value (None / float NaN) means there is nothing to count.
        if categories is None or isinstance(categories, float):
            return Counter()
        return Counter(categories)

    counts_1 = _category_counts(cat_list_1)
    counts_2 = _category_counts(cat_list_2)

    norm_1 = sqrt(sum(count * count for count in counts_1.values()))
    norm_2 = sqrt(sum(count * count for count in counts_2.values()))
    if norm_1 == 0 or norm_2 == 0:
        # Cosine is undefined for a zero vector; report "no similarity".
        return 0.0

    # Counter returns 0 for labels the second customer never bought.
    dot_product = sum(count * counts_2[label] for label, count in counts_1.items())

    # Guard against floating-point rounding pushing the ratio just above 1.
    return min(1.0, dot_product / (norm_1 * norm_2))

def compute_similarity(customer1, customer2, cat_weight=0.7, spent_weight=0.3):
    """Blend category similarity and spending similarity into one score.

    Args:
        customer1: Mapping/row providing 'Category' and 'TotalSpent'.
        customer2: Same, for the customer being compared against.
        cat_weight: Weight of the category similarity (default 0.7).
        spent_weight: Weight of the spending similarity (default 0.3).

    Returns:
        ``cat_weight * category_similarity + spent_weight * spent_similarity``.
    """
    # Categorical similarity
    cat_similarity = compute_categorical_similarity(customer1['Category'], customer2['Category'])

    # Numerical similarity: 1 minus the spend gap relative to the larger spend.
    spent_1, spent_2 = customer1['TotalSpent'], customer2['TotalSpent']
    if pd.isna(spent_1) or pd.isna(spent_2):
        # A missing total carries no evidence of similar spending, and would
        # otherwise turn the whole score into NaN.
        spent_similarity = 0.0
    else:
        larger_spend = max(spent_1, spent_2)
        if larger_spend == 0:
            # Avoid 0/0. Assumes totals are non-negative, so both are zero
            # here and the spends are identical — TODO confirm no negatives.
            spent_similarity = 1.0
        else:
            spent_similarity = 1 - abs(spent_1 - spent_2) / larger_spend

    # Weighted average
    return cat_weight * cat_similarity + spent_weight * spent_similarity

# for the first 20 customers
target_customer_ids = target_customers['CustomerID'].tolist()

# Index every profile row by CustomerID once, instead of re-filtering the
# DataFrame for each pair inside the nested loop. setdefault keeps the first
# row seen per ID, which is the row the previous `.iloc[0]` lookup selected.
profile_by_id = {}
for _, profile_row in customer_profile_df.iterrows():
    profile_by_id.setdefault(profile_row['CustomerID'], profile_row)

lookalike_map = {}
for customer_id in target_customer_ids:
    customer_data = profile_by_id[customer_id]

    # Score this customer against every other target customer. The score is
    # cast to a plain float: NumPy scalars would otherwise be written into the
    # CSV as "np.float64(...)" under NumPy >= 2 when the list is stringified.
    similarities = [
        (
            other_customer_id,
            float(compute_similarity(customer_data, profile_by_id[other_customer_id])),
        )
        for other_customer_id in target_customer_ids
        if other_customer_id != customer_id
    ]

    # Sort by similarity score and take top 3
    lookalike_map[customer_id] = sorted(
        similarities, key=lambda pair: pair[1], reverse=True
    )[:3]

# Create the Lookalike.csv data: one row per customer, with the top-3 list of
# (CustomerID, score) tuples stored as its string representation.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map.keys()),
    'Lookalikes': [str(v) for v in lookalike_map.values()]
})

# Save to CSV
output_path = 'Ishmeet_Kaur_Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)

output_path


'Ishmeet_Kaur_Lookalike.csv'

In [None]:
# NOTE(review): no cell shown in this notebook imports seaborn — confirm this
# install is still needed; if it is, move it to the top and pin a version.
%pip install seaborn