In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three raw input tables from the working directory, in the order
# customers -> products -> transactions.
customers, products, transactions = [
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
]

In [3]:
# Attach customer and product attributes to every transaction.
# Both joins are inner joins, so a transaction is kept only when its
# CustomerID and its ProductID each have a match in the lookup tables.
merged_df = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner')
)

In [4]:
# Per-customer spending profile built from the merged transaction rows:
#   total_spent           - sum of TotalValue
#   num_transactions      - count of TransactionID entries
#   avg_transaction_value - mean of TotalValue
# CustomerID is turned back into a regular column at the end.
customer_features = (
    merged_df
    .groupby('CustomerID')
    .agg(**{
        'total_spent': ('TotalValue', 'sum'),
        'num_transactions': ('TransactionID', 'count'),
        'avg_transaction_value': ('TotalValue', 'mean'),
    })
    .reset_index()
)

In [5]:
# Number of distinct product categories each customer has bought from.
category_features = (
    merged_df
    .groupby('CustomerID')['Category']
    .nunique()
    .rename('num_categories')
    .reset_index()
)

# Attach the category count to the per-customer feature table.
# Any 'num_categories' column left over from a previous run of this cell is
# dropped first; otherwise re-running the cell would make the merge emit
# 'num_categories_x' / 'num_categories_y' and break the later cells that
# look the column up by its plain name. On a fresh kernel the drop is a no-op.
customer_features = (
    customer_features
    .drop(columns='num_categories', errors='ignore')
    .merge(category_features, on='CustomerID', how='left')
)

In [6]:
# Standardize the numeric features so that no single feature dominates the
# similarity computation because of its scale.
features_to_normalize = ['total_spent', 'num_transactions', 'avg_transaction_value', 'num_categories']
scaler = StandardScaler()
# Learn the per-column statistics first, then overwrite the columns with
# their scaled values.
scaler.fit(customer_features[features_to_normalize])
customer_features[features_to_normalize] = scaler.transform(customer_features[features_to_normalize])

In [7]:
# Pairwise cosine similarity between customers, computed on every feature
# column (CustomerID is an identifier, so it is excluded from the input).
similarity_matrix = cosine_similarity(customer_features.drop(columns='CustomerID'))

# Label both axes with CustomerID so that similarity_df[a][b] is the
# similarity between customers a and b.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)

In [8]:
# Top-3 lookalikes for the first 20 customers in customer_features.
# NOTE(review): rows are ordered by CustomerID (groupby sorts its keys), so
# these are C0001..C0020 only if each of those customers has at least one
# transaction -- confirm against the data.
lookalikes = {}
for customer in customer_features['CustomerID'].iloc[:20]:
    # Remove the customer's own entry by label before ranking. Relying on
    # "self is always first after sorting" is unsafe: another customer can tie
    # at (or, through floating-point rounding, exceed) the self-similarity,
    # which would list the customer as its own lookalike and silently drop a
    # genuine match.
    top_3 = similarity_df[customer].drop(labels=customer).nlargest(3)
    # Store as a list of (lookalike CustomerID, similarity score) pairs,
    # highest score first.
    lookalikes[customer] = list(zip(top_3.index, top_3.values))

In [9]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one row per
# (customer, lookalike) pair, preserving the order of the mapping.
lookalike_data = [
    [customer_id, match_id, match_score]
    for customer_id, matches in lookalikes.items()
    for match_id, match_score in matches
]

# Tabular form of the pairs, ready to be written out as CSV.
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

In [10]:
# Write the lookalike table to CSV (without the DataFrame index) and report
# the file name that was actually written, so the message cannot drift from
# the real output path.
LOOKALIKE_CSV_PATH = 'Lavanya_Kohli_Lookalike.csv'
lookalike_df.to_csv(LOOKALIKE_CSV_PATH, index=False)
print(f"Lookalike model is generated and saved as '{LOOKALIKE_CSV_PATH}'")

Lookalike model is generated and saved as 'Lookalike.csv'
