In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the three raw input tables (paths are relative to the working directory).
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

In [5]:
# Parse the TransactionDate column and keep only the calendar date in a new 'Date' column.
transactions['Date'] = transactions['TransactionDate'].pipe(pd.to_datetime).dt.date

In [7]:
# Attach customer attributes to every transaction; the inner join keeps only
# CustomerIDs that appear in both tables.
sales = customers.merge(transactions, how='inner', on='CustomerID')

In [17]:
# Attach product attributes. Both ProductID and Price are join keys, so 'Price'
# stays a single column instead of being suffixed.
# NOTE(review): because this is an inner join on both keys, any row whose Price
# does not exactly match the Price in `products` is dropped — confirm that is intended.
sales = sales.merge(products, how='inner', on=['ProductID', 'Price'])

Create a single row per customer with meaningful aggregated metrics that represent their behavior.

- Normalization puts all features on the same scale so that each one contributes equally to the similarity computation.
- Without it, a feature such as TotalValue, whose values can be much larger than the transaction count (the count of TransactionID) or the mean Price, would dominate and skew the similarity computation.


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd

# One row per customer: total spend, number of transactions, total quantity
# bought, and mean price of the rows in `sales`.
customer_features = (
    sales
    .groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',
        'TransactionID': 'count',
        'Quantity': 'sum',
        'Price': 'mean',
    })
    .reset_index()
)

# Standardise every feature column; CustomerID is an identifier, not a
# feature, so it is left out of the matrix handed to the scaler.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features.drop(columns='CustomerID'))


Compute pairwise cosine similarity for all customers.

- Cosine similarity:
   - Measures the cosine of the angle between two vectors (here, the customers' normalized feature vectors).
   - Produces a similarity matrix in which `similarity_matrix[i][j]` is the similarity between customer i and customer j.

In [13]:
# Pairwise cosine similarity between every pair of rows of the scaled feature matrix.
similarity_matrix = cosine_similarity(normalized_features)

# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)


In [15]:
# Number of look-alikes reported for each customer.
N_LOOKALIKES = 3

# For every customer, rank all *other* customers by similarity score and keep
# the N_LOOKALIKES highest. Maps CustomerID -> [(SimilarCustomerID, score), ...].
top_lookalikes = {}

for customer_id in similarity_df.index:
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: on an exact tie, or when floating-point rounding puts another
    # customer's score marginally above the self-score, slicing off position 0
    # would discard a genuine match and could list the customer as its own
    # look-alike.
    candidate_scores = similarity_df.loc[customer_id].drop(labels=customer_id)
    similar_customers = candidate_scores.nlargest(N_LOOKALIKES)
    top_lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))

# Flatten to one row per (customer, look-alike) pair for export.
lookalike_data = [
    {'CustomerID': customer_id, 'SimilarCustomerID': similar_id, 'SimilarityScore': score}
    for customer_id, lookalikes in top_lookalikes.items()
    for similar_id, score in lookalikes
]

# Fixing the column order keeps the CSV header stable even if there are no rows.
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
)
lookalike_df.to_csv('Lookalike.csv', index=False)
