In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.decomposition import PCA

In [3]:
df_c = pd.read_csv('Customers.csv')
df_p = pd.read_csv('Products.csv')
df_t = pd.read_csv('Transactions.csv')

In [4]:
df_c.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [5]:
df_p.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [6]:
df_t.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


# Task 2: Lookalike Model

In [None]:
# Merging transactions with products to get category information
df_t = pd.merge(df_t, df_p, on='ProductID', how='left')

In [None]:
# We will first do feature engineering

# 1.Total spending per customer
total_spending = df_t.groupby('CustomerID')['TotalValue'].sum().reset_index()
total_spending.columns = ['CustomerID', 'TotalSpending']

# 2.Average transaction value
avg_transaction_value = df_t.groupby('CustomerID')['TotalValue'].mean().reset_index()
avg_transaction_value.columns = ['CustomerID', 'AvgTransactionValue']

# 3.Number of transactions per customer
num_transactions = df_t.groupby('CustomerID')['TransactionID'].count().reset_index()
num_transactions.columns = ['CustomerID', 'NumTransactions']

# 4.Favorite product category (most purchased category)
favorite_category = df_t.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
favorite_category = favorite_category.loc[favorite_category.groupby('CustomerID')['Count'].idxmax()]
favorite_category = favorite_category[['CustomerID', 'Category']]
favorite_category.columns = ['CustomerID', 'FavoriteCategory']

# 5.Customer lifetime (current date - signup date)
current_date = datetime.now()
df_c['SignupDate'] = pd.to_datetime(df_c['SignupDate'])
df_c['CustomerLifetime'] = (current_date - df_c['SignupDate']).dt.days

In [None]:
# We will merge all features into one dataframe
df_features = pd.merge(df_c, total_spending, on='CustomerID', how='left')
df_features = pd.merge(df_features, avg_transaction_value, on='CustomerID', how='left')
df_features = pd.merge(df_features, num_transactions, on='CustomerID', how='left')
df_features = pd.merge(df_features, favorite_category, on='CustomerID', how='left')

In [None]:
# Encoding categorical features (Region and FavoriteCategory)
df_features = pd.get_dummies(df_features, columns=['Region', 'FavoriteCategory'], drop_first=True)

In [27]:
# Lets check for missing values if any?
df_features = df_features.fillna(0)

In [28]:
# SIMILARITY

# Normalizing the features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_features_scaled = scaler.fit_transform(df_features.drop(columns=['CustomerID', 'CustomerName', 'SignupDate']))

In [30]:
# cosine similarity
similarity_matrix = cosine_similarity(df_features_scaled)

In [31]:
# Converting similarities matrix to a dataframe
similarity_df = pd.DataFrame(similarity_matrix, index=df_features['CustomerID'], columns=df_features['CustomerID'])

In [32]:
similarity_df.head()

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.451353,0.603502,0.815316,0.82502,0.764809,0.822512,0.302341,0.237784,0.322103,...,0.769062,0.995065,0.615506,0.334255,0.572725,0.421518,0.650501,0.34627,0.662761,0.430875
C0002,0.451353,1.0,0.263293,0.525279,0.517772,0.377732,0.534694,0.31126,0.718939,0.787203,...,0.330015,0.416091,0.693933,0.331615,0.217568,0.467216,0.362562,0.815699,0.393494,0.905576
C0003,0.603502,0.263293,1.0,0.723462,0.279992,0.763477,0.303774,0.592884,0.149958,0.201639,...,0.775139,0.586122,0.434213,0.284049,0.985075,0.620629,0.214562,0.19139,0.212567,0.336466
C0004,0.815316,0.525279,0.723462,1.0,0.506708,0.918849,0.531342,0.440556,0.286425,0.386759,...,0.938108,0.777464,0.750136,0.477874,0.720589,0.506426,0.373418,0.381868,0.399617,0.549905
C0005,0.82502,0.517772,0.279992,0.506708,1.0,0.4196,0.996328,0.296112,0.267967,0.363599,...,0.343787,0.807062,0.695931,0.336905,0.238323,0.483491,0.788163,0.410212,0.795542,0.475617


In [33]:
# top 3 lookalikes for a given customer
def get_top_lookalikes(customer_id, similarity_df, top_n=3):
    # similarity scores for the customer
    scores = similarity_df[customer_id]
    # Lets sort by similarity score and exclude the customer itself
    scores = scores.sort_values(ascending=False).drop(customer_id)
    return list(zip(scores.index[:top_n], scores.values[:top_n]))

In [34]:
lookalike_map = {}
for customer_id in df_features['CustomerID'][:20]:
    lookalike_map[customer_id] = get_top_lookalikes(customer_id, similarity_df)

In [35]:
# Converting the map to a dataframe
lookalike_data = []
for cust_id, lookalikes in lookalike_map.items():
    for lookalike_id, score in lookalikes:
        lookalike_data.append({'CustomerID': cust_id, 'LookalikeID': lookalike_id, 'SimilarityScore': score})

lookalike_df = pd.DataFrame(lookalike_data)

lookalike_df.to_csv('Lookalike.csv', index=False)