In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the three source tables; paths are relative to the notebook's working directory.
customer_df, product_df, transaction_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Preview the first five customer rows.
customer_df.head(n=5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
# Preview the first five product rows.
product_df.head(n=5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
# Preview the first five transaction rows.
transaction_df.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
# Number of missing values in each customer column.
customer_df.isna().sum()

CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64

In [7]:
# Number of missing values in each product column.
product_df.isna().sum()

ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64

In [8]:
# Number of missing values in each transaction column.
transaction_df.isna().sum()

TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64

In [10]:
# Enrich every transaction with its customer and product attributes.
#
# validate='many_to_one' makes pandas raise a MergeError if CustomerID or
# ProductID is duplicated in a lookup table. Without the check, such
# duplicates would silently multiply transaction rows and inflate every
# per-customer sum/count computed from `df`.
#
# Both transaction_df and product_df carry a `Price` column, so the merged
# frame exposes them with the default suffixes: `Price_x` (transaction side)
# and `Price_y` (product side).
df = (
    transaction_df
    .merge(customer_df, on='CustomerID', how='left', validate='many_to_one')
    .merge(product_df, on='ProductID', how='left', validate='many_to_one')
)
df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [11]:
# Collapse the transaction-level frame to one row per customer.
customer_info = (
    df.groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',       # summed TotalValue across the customer's transactions
        'TransactionID': 'count',  # number of transactions (column keeps the name 'TransactionID')
        # Comma-separated distinct categories, in order of first appearance.
        'Category': lambda cats: ','.join(cats.drop_duplicates()),
        'Region': 'first',
        'SignupDate': 'first',
    })
    .reset_index()
)


In [12]:
# Days between each customer's signup and a fixed reference date.
# The reference is the latest signup date in the data rather than the wall
# clock (`pd.Timestamp('now')`), so re-running the notebook on a different day
# yields the same column. The features are min-max scaled afterwards, which is
# invariant to a constant offset, so the scaled values are unaffected by this
# choice of reference.
signup_dates = pd.to_datetime(customer_info['SignupDate'])
customer_info['SignupDateRecency'] = (signup_dates.max() - signup_dates).dt.days

# One indicator column per category found in the comma-separated Category string.
category_fake = customer_info['Category'].str.get_dummies(sep=',')

# One indicator column per region. dtype=int keeps these columns numeric like
# the category indicators (recent pandas versions default to bool here), so
# the concatenated feature frame does not mix bool and numeric dtypes.
region_fake = pd.get_dummies(customer_info['Region'], dtype=int)

In [13]:
# Assemble the model input: the numeric aggregates followed by the category
# and region indicator columns, placed side by side on the shared row index.
numeric_cols = ['TotalValue', 'TransactionID', 'SignupDateRecency']
features = pd.concat(
    (customer_info[numeric_cols], category_fake, region_fake),
    axis='columns',
)

In [16]:
# Fit a min-max scaler on the feature frame, then rescale each column with it
# (default feature range of the scaler).
scaler = MinMaxScaler().fit(features)
normalized_features = scaler.transform(features)

In [17]:
# Pairwise cosine similarity of every customer's feature vector against every
# other; Y=None compares the rows of X with themselves.
similarity_matrix = cosine_similarity(X=normalized_features, Y=None)

In [19]:
# Build, for each of the first 20 customers, the 3 most similar *other*
# customers as (CustomerID, similarity score) pairs.
#
# IDs are looked up by position so that row i of similarity_matrix always maps
# to the i-th customer, regardless of customer_info's index labels.
customer_ids = customer_info['CustomerID'].to_numpy()

lookalikes = {}
for i, customer_id in enumerate(customer_ids[:20]):
    # Exclude the customer itself by index rather than by assuming it sorts
    # first: sorted() is stable, so an earlier-indexed customer with an
    # identical feature vector (tied score) would otherwise take slot 0,
    # get dropped by the slice, and leave the customer as their own lookalike.
    similarities = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    sorted_similarities = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:3]
    lookalikes[customer_id] = [(customer_ids[j], score) for j, score in sorted_similarities]

In [20]:
# Flatten the lookalike mapping into a two-column table: the customer ID and
# the string form of its list of (CustomerID, score) pairs, then write it to
# Lookalike.csv without the row index.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes),
    'Lookalikes': list(map(str, lookalikes.values())),
})
lookalike_df.to_csv('Lookalike.csv', index=False)

In [21]:
# Plain-text dump of the lookalike table; pandas truncates the long
# Lookalikes strings with "..." in this view (the CSV holds the full values).
print(lookalike_df)

   CustomerID                                         Lookalikes
0       C0001  [('C0152', 0.9995089000338947), ('C0174', 0.99...
1       C0002  [('C0134', 0.9981234118231392), ('C0159', 0.99...
2       C0003  [('C0031', 0.9993387540127565), ('C0129', 0.99...
3       C0004  [('C0148', 0.9894684086022095), ('C0001', 0.98...
4       C0005  [('C0007', 0.9993715903975181), ('C0140', 0.98...
5       C0006  [('C0076', 0.9950437771201375), ('C0187', 0.99...
6       C0007  [('C0005', 0.9993715903975181), ('C0140', 0.98...
7       C0008  [('C0024', 0.9928554429115), ('C0194', 0.99224...
8       C0009  [('C0198', 0.9686607787292308), ('C0066', 0.95...
9       C0010  [('C0132', 0.9983950796697526), ('C0061', 0.99...
10      C0011  [('C0107', 0.9997239354871995), ('C0192', 0.99...
11      C0012  [('C0085', 0.976017586724106), ('C0148', 0.974...
12      C0013  [('C0087', 0.9995642584349319), ('C0126', 0.99...
13      C0014  [('C0089', 0.9857568887129862), ('C0060', 0.90...
14      C0015  [('C0131',