In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [6]:
# Read the three source tables (customers, products, transactions) from CSV.
# NOTE(review): these are absolute paths on a single Windows machine; the
# notebook cannot be re-run elsewhere without editing them.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\admin\Desktop\dtintern\dataset\Customers.csv",
        r"C:\Users\admin\Desktop\dtintern\dataset\Products.csv",
        r"C:\Users\admin\Desktop\dtintern\dataset\Transactions.csv",
    )
)

In [7]:
# Attach the customer record to every transaction (key: CustomerID), then
# attach the product record (key: ProductID). Both joins use pandas' default
# inner join. Transactions and products each carry a "Price" column, so the
# result holds Price_x (left side) and Price_y (from the products table).
merged_data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID")
)

# Show the first rows of the combined table
print("Merged Dataset")
print(merged_data.head())


Merged Dataset
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bl

In [9]:
# Collapse the merged transaction rows into one feature row per customer.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        TotalValue=('TotalValue', 'sum'),   # total revenue from the customer
        Quantity=('Quantity', 'sum'),       # total number of items bought
        Price_y=('Price_y', 'mean'),        # mean product price over their purchases
        Region=('Region', 'first'),         # region taken from the first row of the group
    )
    .reset_index()
)

# Replace the Region column with one indicator column per region value
customer_features = pd.get_dummies(customer_features, columns=["Region"])

In [10]:
# Standardize the numeric feature columns and write the scaled values back
# into customer_features (the Region indicator columns are left as they are).
numeric_features = ['TotalValue', 'Quantity', 'Price_y']
scaler = StandardScaler().fit(customer_features[numeric_features])
customer_features[numeric_features] = scaler.transform(customer_features[numeric_features])

In [11]:
# Calculate pairwise cosine similarity between customers.
# Use every engineered feature: the scaled numeric columns AND the one-hot
# Region_* indicator columns. Passing only `numeric_features` would silently
# discard the Region encoding, so a customer's region would never influence
# who counts as a lookalike.
region_features = [col for col in customer_features.columns if col.startswith("Region_")]
# Cast to float so the boolean/integer indicator columns and the float
# numeric columns form one homogeneous numeric matrix.
feature_matrix = customer_features[numeric_features + region_features].astype(float)
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so scores can be looked up by customer
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)


In [12]:
# Function to get the top-N (default 3) most similar customers
def get_top_similar(customer_id, similarity_df, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label in ``similarity_df`` of the customer to look up.
    similarity_df : pandas.DataFrame
        Square similarity table whose rows and columns are labelled by
        customer ID.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list[tuple]
        ``(other_customer_id, score)`` pairs, highest score first. The
        customer itself is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    if top_n <= 0:
        return []

    scores = similarity_df[customer_id]
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: ties or floating-point rounding can put another customer at (or
    # above) the self score, in which case slicing off position 0 would drop
    # a real lookalike and return the customer as its own match.
    others = scores.drop(labels=customer_id, errors="ignore")
    best = others.sort_values(ascending=False).iloc[:top_n]
    return list(best.items())

# Map each of the first 20 customers (in customer_features order) to its
# list of (lookalike_id, score) pairs.
lookalikes = {
    customer_id: get_top_similar(customer_id, similarity_df)
    for customer_id in customer_features['CustomerID'][:20]
}

# One row per customer: the ID and its list of lookalikes, written to CSV.
# NOTE(review): the output file name looks like an unfilled template
# placeholder -- confirm the intended name.
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalikes),
    "Lookalikes": list(lookalikes.values()),
})
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)
