Importing Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

load customer and transaction dataset 

In [12]:
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

Merge customer data with transaction data

In [19]:
customer_transactions = pd.merge(transactions, customers, on='CustomerID')
customer_transactions

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15
...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11


Aggregating the transaction data  - total spent and purchase frequency


In [41]:
customer_profile = customer_transactions.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    purchase_frequency=('TransactionID', 'count')
).reset_index()

customer_profile.head()

Unnamed: 0,CustomerID,total_spent,purchase_frequency
0,C0001,3354.52,5
1,C0002,1862.74,4
2,C0003,2725.38,4
3,C0004,5354.88,8
4,C0005,2034.24,3


Now, Adding region information to the customer profile
Customers' preferences and behavior can vary significantly across regions. Including the Region column ensures that geographical differences are considered when calculating similarity.

For example, a customer in Europe might have different product preferences compared to someone in North America.

In [42]:
customer_profile = pd.merge(customer_profile, customers[['CustomerID', 'Region']], on='CustomerID')
customer_profile.head(6)

Unnamed: 0,CustomerID,total_spent,purchase_frequency,Region
0,C0001,3354.52,5,South America
1,C0002,1862.74,4,Asia
2,C0003,2725.38,4,South America
3,C0004,5354.88,8,South America
4,C0005,2034.24,3,Asia
5,C0006,4227.57,4,South America


In [43]:
print(customer_profile.columns)

Index(['CustomerID', 'total_spent', 'purchase_frequency', 'Region'], dtype='object')


Convert 'Region' to numeric category

In [44]:
customer_profile['Region'] = customer_profile['Region'].astype('category').cat.codes

In [45]:
customer_profile

Unnamed: 0,CustomerID,total_spent,purchase_frequency,Region
0,C0001,3354.52,5,3
1,C0002,1862.74,4,0
2,C0003,2725.38,4,3
3,C0004,5354.88,8,3
4,C0005,2034.24,3,0
...,...,...,...,...
194,C0196,4982.88,4,1
195,C0197,1928.65,3,1
196,C0198,931.83,2,1
197,C0199,1979.28,4,1


Standardize the numerical features: total_spent and purchase_frequency


In [46]:
scaler = StandardScaler()

In [47]:
customer_profile[['total_spent', 'purchase_frequency']] = scaler.fit_transform(
    customer_profile[['total_spent', 'purchase_frequency']]
)


In [48]:
customer_profile.head()

Unnamed: 0,CustomerID,total_spent,purchase_frequency,Region
0,C0001,-0.061701,-0.011458,3
1,C0002,-0.877744,-0.467494,0
2,C0003,-0.405857,-0.467494,3
3,C0004,1.032547,1.35665,3
4,C0005,-0.783929,-0.92353,0


### Calculate Cosine Similarity
Extract relevant columns for similarity calculation


In [51]:
customer_features = customer_profile[['total_spent', 'purchase_frequency', 'Region']]


In [52]:
customer_features

Unnamed: 0,total_spent,purchase_frequency,Region
0,-0.061701,-0.011458,3
1,-0.877744,-0.467494,0
2,-0.405857,-0.467494,3
3,1.032547,1.356650,3
4,-0.783929,-0.923530,0
...,...,...,...
194,0.829053,-0.467494,1
195,-0.841689,-0.923530,1
196,-1.386975,-1.379566,1
197,-0.813993,-0.467494,1


In [53]:
# Calculating the similarity matrix between all customers
similarity_matrix = cosine_similarity(customer_features)

In [54]:
similarity_matrix

array([[ 1.        ,  0.01994403,  0.98245697, ...,  0.47044108,
         0.74245052, -0.02049801],
       [ 0.01994403,  1.        ,  0.18868476, ...,  0.85237702,
         0.6840532 , -0.87487767],
       [ 0.98245697,  0.18868476,  1.        , ...,  0.62524345,
         0.84471171, -0.13000125],
       ...,
       [ 0.47044108,  0.85237702,  0.62524345, ...,  1.        ,
         0.92055716, -0.62103057],
       [ 0.74245052,  0.6840532 ,  0.84471171, ...,  0.92055716,
         1.        , -0.58787989],
       [-0.02049801, -0.87487767, -0.13000125, ..., -0.62103057,
        -0.58787989,  1.        ]])

In [57]:
# Finding top3 lookalikes for first 20 customers (C0001 - C0020)
lookalike_recommendations = {}

for customer_id in customer_profile['CustomerID'][:20]:
    i = customer_profile[customer_profile['CustomerID'] == customer_id].index[0]
    
    # Get similarity scores with all other customers
    similarity_scores = similarity_matrix[i]
    
    # Sort the scores (excluding the customer itself)
    similar_customer_indices = np.argsort(similarity_scores)[::-1][1:4]  # Top 3
    
    similar_customers = customer_profile.iloc[similar_customer_indices][['CustomerID', 'Region']]
    scores = similarity_scores[similar_customer_indices]
    
    # Store recommendations for the current customer
    lookalike_recommendations[customer_id] = [
    (similar_customer.CustomerID, score)  # Use attribute-style access
    for similar_customer, score in zip(similar_customers.itertuples(), scores)
]


In [58]:
lookalike_df = pd.DataFrame([(customer_id, similar_customer, score) 
                             for customer_id, recommendations in lookalike_recommendations.items() 
                             for similar_customer, score in recommendations],
                            columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])


In [59]:
lookalike_df

Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0137,0.999992
1,C0001,C0152,0.999984
2,C0001,C0107,0.998794
3,C0002,C0142,0.992988
4,C0002,C0043,0.990709
5,C0002,C0177,0.971508
6,C0003,C0133,0.999601
7,C0003,C0052,0.999128
8,C0003,C0125,0.997232
9,C0004,C0122,0.997043


In [60]:
lookalike_df.to_csv("Sarika_Kushwaha_Lookalike.csv", index=False)