In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [23]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [24]:
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
transactions = transactions.merge(products, on=['ProductID','Price'], how='left')

In [25]:
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics


In [26]:
customer_transactions = (
    transactions.groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'Price': 'mean',
        'Category': lambda x: x.mode()[0],  # Most common category
    })
    .reset_index()
)

In [43]:
profiles = customers.merge(customer_transactions, on='CustomerID', how='left')

profiles.fillna(0, inplace=True)
profiles.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TotalValue,Quantity,Price,Category
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,12.0,278.334,Electronics
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,10.0,208.92,Clothing
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,14.0,195.7075,Home Decor
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,23.0,240.63625,Books
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,7.0,291.603333,Electronics


In [44]:
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CustomerID    200 non-null    object 
 1   CustomerName  200 non-null    object 
 2   Region        200 non-null    object 
 3   SignupDate    200 non-null    object 
 4   TotalValue    200 non-null    float64
 5   Quantity      200 non-null    float64
 6   Price         200 non-null    float64
 7   Category      200 non-null    object 
dtypes: float64(3), object(5)
memory usage: 12.6+ KB


In [49]:
profiles = profiles.convert_dtypes()

In [50]:
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CustomerID    200 non-null    string 
 1   CustomerName  200 non-null    string 
 2   Region        200 non-null    string 
 3   SignupDate    200 non-null    string 
 4   TotalValue    200 non-null    Float64
 5   Quantity      200 non-null    Int64  
 6   Price         200 non-null    Float64
 7   Category      200 non-null    string 
dtypes: Float64(2), Int64(1), string(5)
memory usage: 13.2 KB


In [51]:
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(profiles[['Region', 'Category']])
encoded_features

array([[0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [52]:
numerical_features = profiles[['TotalValue', 'Quantity', 'Price']]
features = pd.concat(
    [pd.DataFrame(encoded_features), numerical_features.reset_index(drop=True)], axis=1
)

In [53]:
similarity_matrix = cosine_similarity(features)


In [54]:
recommendations = {}
for idx, customer_id in enumerate(profiles['CustomerID']):
    similar_scores = list(enumerate(similarity_matrix[idx]))
    similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)[1:4]

    recommendations[customer_id] = [
        (profiles['CustomerID'].iloc[i], round(score, 2)) for i, score in similar_scores
    ]

In [55]:
lookalike_map = {cust_id: recommendations[cust_id] for cust_id in profiles['CustomerID'][:20]}


In [56]:
lookalike_map

{'C0001': [('C0024', 1.0), ('C0189', 1.0), ('C0107', 1.0)],
 'C0002': [('C0129', 1.0), ('C0019', 1.0), ('C0076', 1.0)],
 'C0003': [('C0179', 1.0), ('C0190', 1.0), ('C0064', 1.0)],
 'C0004': [('C0045', 1.0), ('C0143', 1.0), ('C0087', 1.0)],
 'C0005': [('C0132', 1.0), ('C0089', 1.0), ('C0192', 1.0)],
 'C0006': [('C0152', 1.0), ('C0011', 1.0), ('C0168', 1.0)],
 'C0007': [('C0085', 1.0), ('C0061', 1.0), ('C0192', 1.0)],
 'C0008': [('C0162', 1.0), ('C0018', 1.0), ('C0182', 1.0)],
 'C0009': [('C0080', 1.0), ('C0020', 1.0), ('C0015', 1.0)],
 'C0010': [('C0047', 1.0), ('C0030', 1.0), ('C0027', 1.0)],
 'C0011': [('C0006', 1.0), ('C0152', 1.0), ('C0183', 1.0)],
 'C0012': [('C0093', 1.0), ('C0046', 1.0), ('C0039', 1.0)],
 'C0013': [('C0108', 1.0), ('C0105', 1.0), ('C0018', 1.0)],
 'C0014': [('C0150', 1.0), ('C0130', 1.0), ('C0078', 1.0)],
 'C0015': [('C0083', 1.0), ('C0020', 1.0), ('C0080', 1.0)],
 'C0016': [('C0040', 1.0), ('C0158', 1.0), ('C0092', 1.0)],
 'C0017': [('C0053', 1.0), ('C0136', 1.0

In [57]:
lookalike_df = pd.DataFrame(
    [(k, v) for k, v in lookalike_map.items()], columns=['cust_id', 'lookalikes']
)
lookalike_df.to_csv('Jash_Jummani_Lookalike.csv', index=False)