In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import csv



In [3]:
# Read the three source tables from the local ./data directory.
customers = pd.read_csv('./data/Customers.csv')
products = pd.read_csv('./data/Products.csv')
transactions = pd.read_csv('./data/Transactions.csv')

# Sanity check: print the first five rows of each table, in load order.
for frame in (customers, products, transactions):
    print(frame.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [6]:
# Inspect the column names of Products.csv and then Transactions.csv;
# both tables carry a 'Price' column, which matters for the merge later on.
for frame in (products, transactions):
    print(frame.columns)


Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')


In [7]:
# Exploratory join: attach customer attributes, then product attributes, to
# every transaction. Transactions and Products both have a 'Price' column, so
# pandas suffixes them: 'Price_x' (from Transactions) and 'Price_y' (from Products).
transactions_with_customers = transactions.merge(customers, on='CustomerID')
data = transactions_with_customers.merge(products, on='ProductID')

# Show every column of the merged frame to confirm the suffixed names.
print(data.columns)


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [9]:
# Join each transaction to its customer record and to the catalogue price.
# Only ProductID and Price are taken from Products; because Transactions
# already has a 'Price' column, the merged frame exposes it as 'Price_x'
# (transaction side) next to 'Price_y' (product side).
data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products[['ProductID', 'Price']], on='ProductID')
)

# Collapse to one row per customer:
#   TotalValue -> sum of transaction values
#   Quantity   -> total units bought
#   Price_x    -> mean transaction-side price across that customer's rows
customer_features = (
    data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Price_x=('Price_x', 'mean'),
    )
    .reset_index()
)

# Standardise the three numeric features (zero mean, unit variance) so that
# no single feature dominates the similarity computation.
feature_columns = ['TotalValue', 'Quantity', 'Price_x']
scaler = StandardScaler()
features = scaler.fit_transform(customer_features[feature_columns])



In [10]:
# Pairwise cosine similarity between the standardised customer feature rows.
similarity_matrix = cosine_similarity(features)

# Number of lookalikes to keep per customer.
N_LOOKALIKES = 3

# Positional array of customer ids, aligned with the rows of `features`.
customer_ids = customer_features['CustomerID'].to_numpy()

# Rank on a copy whose diagonal is -inf so a customer can never be returned as
# its own lookalike. Skipping "position 0 of the sort" is not safe: rounding
# can put self-similarity a hair below 1.0, and another customer can tie at
# 1.0, in which case self would not be first in the ordering.
ranking_scores = similarity_matrix.copy()
np.fill_diagonal(ranking_scores, -np.inf)

# With fewer than N_LOOKALIKES + 1 customers there are fewer neighbours to return.
n_neighbors = max(0, min(N_LOOKALIKES, len(customer_ids) - 1))

# Create a lookalike dictionary for recommendations:
#   CustomerID -> [(other CustomerID, cosine similarity), ...] best match first.
lookalikes = {}
for i, customer in enumerate(customer_ids):
    # Stable sort keeps ties in row order, so results are reproducible.
    similar_indices = np.argsort(-ranking_scores[i], kind='stable')[:n_neighbors]
    # Store builtin floats so downstream printing/export shows plain numbers.
    lookalikes[customer] = [
        (customer_ids[j], float(similarity_matrix[i, j])) for j in similar_indices
    ]


In [11]:
# Show the lookalikes for the first 20 rows of customer_features
# (C0001 to C0020, assuming each of those customers has at least one
# transaction and therefore a row in customer_features).
for customer_id in customer_features['CustomerID'].head(20):
    print(f"CustomerID: {customer_id}, Recommendations: {lookalikes[customer_id]}")


CustomerID: C0001, Recommendations: [('C0103', np.float64(0.9975729385618538)), ('C0092', np.float64(0.9968787968825864)), ('C0135', np.float64(0.9927364238882178))]
CustomerID: C0002, Recommendations: [('C0029', np.float64(0.9998543931340029)), ('C0077', np.float64(0.9961038168882547)), ('C0157', np.float64(0.9954784900159904))]
CustomerID: C0003, Recommendations: [('C0111', np.float64(0.9984874468302141)), ('C0190', np.float64(0.9966561574371822)), ('C0038', np.float64(0.9901332836738033))]
CustomerID: C0004, Recommendations: [('C0165', np.float64(0.9983897071764074)), ('C0162', np.float64(0.9980867096016259)), ('C0075', np.float64(0.996932345616167))]
CustomerID: C0005, Recommendations: [('C0167', np.float64(0.9999721868436701)), ('C0020', np.float64(0.99971426883456)), ('C0128', np.float64(0.9987615592886807))]
CustomerID: C0006, Recommendations: [('C0168', np.float64(0.9976122332196319)), ('C0196', np.float64(0.9950250564515252)), ('C0187', np.float64(0.9947524750205508))]
Custome

In [12]:
# Export the lookalike map: one row per customer, with that customer's
# (CustomerID, score) pairs written as a list literal in the second column.
# `csv` is already imported in the notebook's import cell.
#
# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The encoding is pinned so
# the file does not depend on the platform default.
with open('FirstName_LastName_Lookalike.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['CustomerID', 'Lookalikes'])
    for key, value in lookalikes.items():
        # Cast to builtin str/float so the cell holds plain numbers rather
        # than numpy scalar reprs such as "np.float64(0.9975...)".
        pairs = [(str(other_id), float(score)) for other_id, score in value]
        writer.writerow([key, pairs])