In [1]:
# Lookalike Model Development for eCommerce Dataset

# Importing Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Loading the Datasets
customers = pd.read_csv('/content/drive/MyDrive/Customers.csv')
products = pd.read_csv('/content/drive/MyDrive/Products.csv')
transactions = pd.read_csv('/content/drive/MyDrive/Transactions.csv')

# Data Preprocessing
# Merge datasets to create a comprehensive dataset
transactions_products = transactions.merge(products, on='ProductID')
full_data = transactions_products.merge(customers, on='CustomerID')

# Encoding categorical data (e.g., 'Category') in full_data before aggregation
if 'Category' in full_data.columns:
    full_data = pd.get_dummies(full_data, columns=['Category'], drop_first=True)

# Aggregating transaction data for customers
customer_profiles = full_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'ProductName': lambda x: x.mode()[0] if not x.mode().empty else None  # Most common product
}).reset_index()

# Adding average price
if 'Price_x' in full_data.columns:
    customer_profiles['Price'] = full_data.groupby('CustomerID')['Price_x'].mean().values
elif 'Price_y' in full_data.columns:
    customer_profiles['Price'] = full_data.groupby('CustomerID')['Price_y'].mean().values

# Scaling numerical features
scaler = StandardScaler()
numerical_features = ['TotalValue', 'Quantity']
if 'Price' in customer_profiles.columns:
    numerical_features.append('Price')
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])

# Calculating Similarity Scores
# Compute cosine similarity between all customers
customer_features = customer_profiles.drop(columns=['CustomerID', 'ProductName'], errors='ignore')
similarity_matrix = cosine_similarity(customer_features)

# Creating Lookalike Recommendations
# Extract top 3 similar customers for each customer
lookalike_dict = {}
customer_ids = customer_profiles['CustomerID'].values
for idx, customer_id in enumerate(customer_ids):
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:4]  # Exclude self (score=1.0)
    lookalike_dict[customer_id] = [(customer_ids[i], score) for i, score in sorted_scores]

# Generating Lookalike.csv
lookalike_data = []
for cust_id, recommendations in lookalike_dict.items():
    for rec_id, score in recommendations:
        lookalike_data.append({'CustomerID': cust_id, 'LookalikeID': rec_id, 'Score': score})

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display Top 3 Lookalikes for First 20 Customers
first_20_customers = customer_ids[:20]
for cust_id in first_20_customers:
    print(f"CustomerID: {cust_id}")
    print(lookalike_dict[cust_id])
    print("\n")

print("Lookalike Model completed and Lookalike.csv generated.")


CustomerID: C0001
[('C0103', 0.9975729385618538), ('C0092', 0.9968787968825864), ('C0135', 0.9927364238882178)]


CustomerID: C0002
[('C0029', 0.9998543931340029), ('C0077', 0.9961038168882547), ('C0157', 0.9954784900159904)]


CustomerID: C0003
[('C0111', 0.9984874468302141), ('C0190', 0.9966561574371822), ('C0038', 0.9901332836738033)]


CustomerID: C0004
[('C0165', 0.9983897071764074), ('C0162', 0.9980867096016259), ('C0075', 0.996932345616167)]


CustomerID: C0005
[('C0167', 0.9999721868436701), ('C0020', 0.99971426883456), ('C0128', 0.9987615592886807)]


CustomerID: C0006
[('C0168', 0.9976122332196319), ('C0196', 0.9950250564515252), ('C0187', 0.9947524750205508)]


CustomerID: C0007
[('C0125', 0.9998486580402707), ('C0089', 0.99834375759003), ('C0085', 0.9960335186380587)]


CustomerID: C0008
[('C0084', 0.9960866913262758), ('C0113', 0.9958170325568012), ('C0017', 0.993173208985394)]


CustomerID: C0009
[('C0130', 0.9999651017117013), ('C0128', 0.9985963548763069), ('C0192', 0.9