In [24]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [3]:
data = pd.merge(transactions, customers, how='left', on='CustomerID')

In [4]:
data = pd.merge(data, products, how='left', on='ProductID')

In [5]:
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [6]:
data.shape

(1000, 13)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


## Data prepration

In [15]:
# Aggregate customer features
customer_features = data.groupby("CustomerID").agg(
    TotalRevenue=("TotalValue", "sum"),
    TransactionCount=("TransactionID", "count"),
    AverageTransactionValue=("TotalValue", "mean"),
    Region=("Region", "first")
).reset_index()
customer_features.head()

Unnamed: 0,CustomerID,TotalRevenue,TransactionCount,AverageTransactionValue,Region
0,C0001,3354.52,5,670.904,South America
1,C0002,1862.74,4,465.685,Asia
2,C0003,2725.38,4,681.345,South America
3,C0004,5354.88,8,669.36,South America
4,C0005,2034.24,3,678.08,Asia


In [16]:
product_category_features = pd.crosstab(data["CustomerID"], data["Category"])

In [17]:
customer_features = pd.merge(customer_features, product_category_features, on="CustomerID")

In [18]:
customer_features = pd.get_dummies(customer_features, columns=["Region"], drop_first=True)

In [40]:
#scaling
scaler = StandardScaler()
feature_cols = (
    ["TotalRevenue", "TransactionCount", "AverageTransactionValue"]
    + list(product_category_features.columns)
    + list(customer_features.columns[4:])
)
normalized_features = scaler.fit_transform(customer_features[feature_cols])

## Modeling

In [26]:
similarity_matrix = cosine_similarity(normalized_features)

In [27]:
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features["CustomerID"], columns=customer_features["CustomerID"])

In [30]:
lookalike_data = {}
for customer in similarity_df.index:
    similar_customers = similarity_df.loc[customer].drop(customer).sort_values(ascending=False)[:3]
    lookalike_data[customer] = [(similar_customer, round(score, 4)) for similar_customer, score in similar_customers.items()]
lookalike_data

{'C0001': [('C0120', 0.853), ('C0091', 0.8485), ('C0190', 0.8382)],
 'C0002': [('C0134', 0.9204), ('C0106', 0.8839), ('C0159', 0.8622)],
 'C0003': [('C0031', 0.9566), ('C0158', 0.9138), ('C0129', 0.865)],
 'C0004': [('C0113', 0.9058), ('C0012', 0.8658), ('C0104', 0.8151)],
 'C0005': [('C0007', 0.9612), ('C0146', 0.8917), ('C0140', 0.8394)],
 'C0006': [('C0187', 0.8785), ('C0171', 0.7158), ('C0153', 0.6688)],
 'C0007': [('C0005', 0.9612), ('C0140', 0.8925), ('C0146', 0.8396)],
 'C0008': [('C0162', 0.8131), ('C0059', 0.8119), ('C0154', 0.7952)],
 'C0009': [('C0198', 0.9237), ('C0111', 0.8641), ('C0062', 0.8498)],
 'C0010': [('C0061', 0.8997), ('C0111', 0.8582), ('C0009', 0.831)],
 'C0011': [('C0126', 0.9624), ('C0153', 0.8879), ('C0171', 0.8821)],
 'C0012': [('C0104', 0.9394), ('C0152', 0.8742), ('C0004', 0.8658)],
 'C0013': [('C0107', 0.8584), ('C0102', 0.8547), ('C0188', 0.8501)],
 'C0014': [('C0060', 0.9827), ('C0097', 0.8572), ('C0198', 0.8519)],
 'C0015': [('C0036', 0.9223), ('C0131

In [33]:
lookalike_df = pd.DataFrame.from_dict(lookalike_data, orient='index')

In [38]:
final_df = lookalike_df.iloc[:20,:]

In [39]:
# exporting to csv
final_df.to_csv('Jatin_Jaglan_Lookalike.csv')