#### Business Problem Understanding:
- Build a model that takes user's information as input and recommends 3 simillar customers based on their **profile** and **transaction** history.

In [20]:
# Load Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

In [6]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [7]:
# Merge transactions with product data to get product category information
transactions = transactions.merge(products[['ProductID', 'Category']], on='ProductID', how='left')

In [8]:
# Create customer profile features
customer_profiles = transactions.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    purchase_count=('TransactionID', 'count'),
    avg_purchase_value=('TotalValue', 'mean'),
    diversity_of_categories=('Category', lambda x: len(x.unique())),
    most_frequent_category=('Category', lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'),
    latest_transaction=('TransactionDate', lambda x: max(pd.to_datetime(x)))
).reset_index()

In [9]:
# Add recency feature (days since last transaction)
customer_profiles['recency_days'] = (pd.to_datetime('today') - customer_profiles['latest_transaction']).dt.days

In [10]:
# Merge with customer demographics
customer_profiles = customer_profiles.merge(customers[['CustomerID', 'Region', 'SignupDate']], on='CustomerID', how='left')

In [11]:
# Calculate customer age in days
customer_profiles['SignupDate'] = pd.to_datetime(customer_profiles['SignupDate'])
customer_profiles['customer_age'] = (pd.to_datetime('today') - customer_profiles['SignupDate']).dt.days

In [12]:
# Drop unnecessary columns
customer_profiles.drop(columns=['SignupDate', 'latest_transaction'], inplace=True)

In [13]:
# Ensure categorical columns are strings
customer_profiles['Region'] = customer_profiles['Region'].astype(str)
customer_profiles['most_frequent_category'] = customer_profiles['most_frequent_category'].astype(str)

In [17]:
# Define Preprocessor for Categorical and Numeric Data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
encoder = ColumnTransformer(
    transformers=[
        ('region', OneHotEncoder(sparse_output=False), ['Region']),
        ('most_frequent_category', OneHotEncoder(sparse_output=False), ['most_frequent_category'])
    ],
    remainder='passthrough'
)

# Continuous features to scale
scaler = StandardScaler()
continuous_columns = ['total_spend', 'purchase_count', 'avg_purchase_value', 
                      'diversity_of_categories', 'recency_days', 'customer_age']

In [18]:
# Transform data
transformed_data = encoder.fit_transform(customer_profiles.drop(columns=['CustomerID']))
encoded_columns = encoder.get_feature_names_out()
scaled_continuous_data = scaler.fit_transform(customer_profiles[continuous_columns])

In [21]:
# Combine transformed data
final_data = np.hstack([transformed_data, scaled_continuous_data])

In [22]:
# Similarity Calculation using Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(final_data)

In [23]:
# Get top 3 similar customers for each customer (for C0001 - C0020)
top_3_lookalikes = {}

for i in range(20):  # First 20 customers (C0001 to C0020)
    customer_id = customer_profiles.loc[i, 'CustomerID']
    similarity_scores = cosine_sim[i]
    
    # Get top 3 most similar customers (excluding self)
    similar_customers_indices = np.argsort(similarity_scores)[::-1][1:4]
    similar_customers = [
        (customer_profiles.loc[j, 'CustomerID'], round(similarity_scores[j], 4))
        for j in similar_customers_indices
    ]
    top_3_lookalikes[customer_id] = similar_customers

In [24]:
# Save the map to Lookalike.csv
lookalike_map = []

for customer_id, similar_list in top_3_lookalikes.items():
    lookalike_map.append({
        'cust_id': customer_id,
        'lookalikes': similar_list
    })

lookalike_df = pd.DataFrame(lookalike_map)

In [25]:
lookalike_df.head()

Unnamed: 0,cust_id,lookalikes
0,C0001,"[(C0174, 0.9999), (C0106, 0.9999), (C0088, 0.9..."
1,C0002,"[(C0029, 0.9999), (C0025, 0.998), (C0121, 0.99..."
2,C0003,"[(C0052, 0.9999), (C0177, 0.9995), (C0031, 0.9..."
3,C0004,"[(C0104, 1.0), (C0165, 0.9999), (C0188, 0.9998)]"
4,C0005,"[(C0159, 0.9998), (C0176, 0.9992), (C0132, 0.9..."


In [26]:
# Save the output to Lookalike.csv
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike Model has been generated and saved to Lookalike.csv")

Lookalike Model has been generated and saved to Lookalike.csv
