In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import csv
import json

# Read the three raw input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)




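The `SignupDate` field in the customer dataset is converted to datetime format for potential temporal analysis. The transactions are then joined with the customer table (on `CustomerID`) and the product table (on `ProductID`) using inner joins, producing a single dataset with one row per transaction.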

In [None]:
# Parse the signup dates into datetime values.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

# Attach customer attributes to every transaction. The inner join keeps only
# transactions whose CustomerID also appears in the customer table.
transactions_customers = transactions.merge(customers, how="inner", on="CustomerID")

# Attach product attributes as well, again keeping only matching ProductIDs.
# NOTE(review): later code reads 'Price_y', which suggests both the transaction
# and product tables carry a 'Price' column -- confirm against the CSVs.
full_data = transactions_customers.merge(products, how="inner", on="ProductID")

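A feature profile is first built for each customer: total quantity, average price and average quantity per product category, plus total spending. The numerical features are then standardized with StandardScaler (zero mean, unit variance per feature). This puts all features on a comparable scale, so that features with larger magnitudes do not dominate the similarity calculation.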

In [None]:
def create_customer_features(df):
    """Build one feature row per customer from the merged transaction data.

    For every customer the profile contains, per product category, the total
    quantity bought, the mean ``Price_y`` and the mean quantity per
    transaction, plus the customer's overall ``TotalValue`` sum.

    Each per-category feature family gets its own column prefix
    (``TotalQty_``, ``AvgPrice_``, ``AvgQty_``). Without the prefixes all
    three families would be labelled with the bare category names, which
    yields duplicate column labels in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``CustomerID``, ``Category``, ``Quantity``,
        ``TotalValue`` and ``Price_y``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CustomerID`` with uniquely named feature columns.
        Customer/category combinations without any transaction are 0.
    """
    by_customer_category = df.groupby(['CustomerID', 'Category'])

    # Total units bought per category.
    category_counts = (
        by_customer_category['Quantity'].sum()
        .unstack(fill_value=0)
        .add_prefix('TotalQty_')
    )

    # Overall amount spent by the customer (a Series named 'TotalValue').
    total_spending = df.groupby('CustomerID')['TotalValue'].sum()

    # Mean unit price paid per category.
    average_price_per_category = (
        by_customer_category['Price_y'].mean()
        .unstack(fill_value=0)
        .add_prefix('AvgPrice_')
    )

    # Mean units per transaction in each category.
    average_quantities = (
        by_customer_category['Quantity'].mean()
        .unstack(fill_value=0)
        .add_prefix('AvgQty_')
    )

    customer_profiles = pd.concat(
        [category_counts, total_spending, average_price_per_category, average_quantities],
        axis=1,
    )
    # Any gap left by aligning the pieces on CustomerID counts as "no activity".
    return customer_profiles.fillna(0)

# Build the per-customer feature table from the merged transaction data.
customer_profiles = create_customer_features(full_data)
print("Customer Profiles Sample:", customer_profiles.head(), sep="\n")

# Standardize every feature column. The scaler's output is wrapped back into a
# DataFrame so the customer index and column labels stay attached.
scaler = StandardScaler()
scaled_customer_profiles = scaler.fit_transform(customer_profiles)
scaled_customer_profiles_df = pd.DataFrame(
    data=scaled_customer_profiles,
    columns=customer_profiles.columns,
    index=customer_profiles.index,
)
print("\nScaled customer profiles sample", scaled_customer_profiles_df.head(), sep="\n")



In [None]:
def get_lookalikes(scaled_profiles_df, num_lookalikes=3):
    """Find the most similar other customers for every customer.

    Similarity is the cosine similarity between the rows of
    ``scaled_profiles_df``.

    Parameters
    ----------
    scaled_profiles_df : pandas.DataFrame
        One row of numeric features per customer; the index holds the
        customer identifiers.
    num_lookalikes : int, default 3
        Maximum number of lookalikes to return per customer.

    Returns
    -------
    collections.defaultdict
        Maps each customer identifier to a list of
        ``(lookalike_id, similarity_score)`` tuples ordered from most to
        least similar. The customer itself is never part of its own list,
        and the list is shorter than ``num_lookalikes`` when fewer other
        customers exist.
    """
    lookalikes_dict = defaultdict(list)
    # No customers means nothing to compare; return the empty mapping.
    if len(scaled_profiles_df) == 0:
        return lookalikes_dict

    similarity_matrix = cosine_similarity(scaled_profiles_df)
    for i, customer_id in enumerate(scaled_profiles_df.index):
        similarities = similarity_matrix[i]

        # Rank all customers from most to least similar, then drop this
        # customer's own row by position. Simply skipping the first ranked
        # entry is not safe: with tied scores (identical profiles, rounding)
        # or an all-zero profile the customer is not guaranteed to rank first.
        ranked_indices = np.argsort(similarities)[::-1]
        most_similar_indices = ranked_indices[ranked_indices != i][:num_lookalikes]
        most_similar_scores = similarities[most_similar_indices]

        lookalike_ids = [scaled_profiles_df.index[j] for j in most_similar_indices]
        lookalikes_dict[customer_id] = list(zip(lookalike_ids, most_similar_scores))
    return lookalikes_dict

# Compute lookalikes for every customer with the default list length.
lookalikes = get_lookalikes(scaled_customer_profiles_df)

all_recommendations = list(lookalikes.items())
print("\nGenerated Lookalikes (sample):")
print(all_recommendations[:5])

# Keep only the first 20 customers, in the mapping's iteration order.
first_20_lookalikes = dict(all_recommendations[:20])

print("\nFirst 20 lookalikes with similarity scores:")
print(first_20_lookalikes)

# Write the recommendations to CSV: one row per customer, with the list of
# (lookalike id, score) pairs serialized as a JSON string.
with open('Lookalike.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["CustomerID", "Lookalikes"])
    writer.writerows(
        [cust_id, json.dumps(matches)]
        for cust_id, matches in first_20_lookalikes.items()
    )