In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
# Load the datasets
# Paths are defined once here so that changing a location only requires
# editing a single line; the read_csv calls below go through these names.
customers_path = '/assets/Customers.csv'
products_path = '/assets/Products.csv'
transactions_path = '/assets/Transactions.csv'

# Reading the datasets
customers_df = pd.read_csv(customers_path)
products_df = pd.read_csv(products_path)
transactions_df = pd.read_csv(transactions_path)

In [None]:
# Merge Datasets
def merge_datasets(transactions=None, customers=None, products=None):
    """Join transactions with customer and product attributes.

    Parameters
    ----------
    transactions, customers, products : pandas.DataFrame, optional
        Frames to join. Any argument left as ``None`` falls back to the
        notebook-level ``transactions_df`` / ``customers_df`` /
        ``products_df`` respectively, so calling ``merge_datasets()`` with
        no arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        ``transactions`` merged with ``customers`` on ``'CustomerID'`` and
        then with ``products`` on ``'ProductID'``. Both merges use pandas'
        default join type.
    """
    # Fall back to the notebook globals only when a frame is not supplied;
    # this keeps the zero-argument call working while allowing the function
    # to be exercised on arbitrary frames.
    if transactions is None:
        transactions = transactions_df
    if customers is None:
        customers = customers_df
    if products is None:
        products = products_df

    merged_df = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')
    return merged_df

# Feature Engineering
def create_customer_features(merged_df, customers=None):
    """Build one feature row per customer from the merged transaction data.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Transaction-level frame. The columns read here are ``'CustomerID'``,
        ``'TransactionID'``, ``'TotalValue'`` and ``'Category'``.
    customers : pandas.DataFrame, optional
        Frame providing ``'CustomerID'`` and ``'Region'``. When ``None`` the
        notebook-level ``customers_df`` is used, which matches the previous
        behaviour.

    Returns
    -------
    pandas.DataFrame
        ``'CustomerID'`` first, followed by ``total_spent``,
        ``total_transactions``, ``avg_transaction_value``, one column per
        category holding the summed ``TotalValue`` (0 where the customer has
        no purchases in that category), and dummy columns for ``Region``
        with the first level dropped.

    Notes
    -----
    Rows are produced by grouping ``merged_df``; a customer with no rows in
    ``merged_df`` therefore does not appear in the result.
    """
    if customers is None:
        customers = customers_df

    # Aggregate transaction data for each customer
    customer_features = merged_df.groupby('CustomerID').agg(
        total_spent=('TotalValue', 'sum'),
        total_transactions=('TransactionID', 'count'),
        avg_transaction_value=('TotalValue', 'mean')
    ).reset_index()

    # Add product category preferences: spend per category, 0 when absent
    category_pivot = pd.pivot_table(
        merged_df,
        index='CustomerID',
        columns='Category',
        values='TotalValue',
        aggfunc='sum',
        fill_value=0
    )
    # category_pivot carries CustomerID as its index; merge resolves the
    # 'on' key against that index level.
    customer_features = customer_features.merge(category_pivot, on='CustomerID', how='left')

    # Add region information
    customer_features = customer_features.merge(customers[['CustomerID', 'Region']], on='CustomerID', how='left')

    # Encode region as dummy variables (first level dropped as the baseline)
    customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

    return customer_features

In [None]:
# Similarity Computation
def compute_similarity(customer_features):
    """Return the pairwise cosine similarity between feature rows.

    The first column of ``customer_features`` is skipped (the caller in this
    notebook places the customer identifier there); every remaining column
    is standardized with ``StandardScaler`` before the cosine similarity of
    each pair of rows is computed.

    Returns a square matrix with one row and one column per input row.
    """
    # Everything after the leading identifier column is treated as a feature.
    feature_columns = customer_features.iloc[:, 1:]

    # Put all features on a comparable scale before measuring angles.
    standardized = StandardScaler().fit_transform(feature_columns)

    return cosine_similarity(standardized)

# Generate Lookalike Recommendations
def generate_lookalikes(similarity_matrix, customer_ids, top_n=3):
    """Pick the ``top_n`` most similar other customers for every customer.

    Parameters
    ----------
    similarity_matrix : 2-D indexable (e.g. list of lists or ndarray)
        ``similarity_matrix[i][j]`` is the similarity between the customers
        at positions ``i`` and ``j`` of ``customer_ids``.
    customer_ids : sequence
        Customer identifiers, positionally aligned with the matrix rows.
    top_n : int, default 3
        Maximum number of lookalikes per customer. Values below zero are
        treated as zero.

    Returns
    -------
    dict
        Maps each customer id to a list of ``(other_customer_id, score)``
        tuples ordered by descending score, with scores rounded to two
        decimals. A customer never appears in its own list.
    """
    limit = max(top_n, 0)
    lookalike_map = {}

    for idx, cust_id in enumerate(customer_ids):
        # Drop the self-match by position rather than assuming it sorts
        # first: another customer with an equal (or, through floating-point
        # noise, marginally higher) score could otherwise take the first
        # slot and leave the customer listed as its own lookalike.
        candidates = [
            (other_idx, score)
            for other_idx, score in enumerate(similarity_matrix[idx])
            if other_idx != idx
        ]

        # Stable sort: ties keep their original positional order.
        candidates.sort(key=lambda pair: pair[1], reverse=True)

        lookalike_map[cust_id] = [
            (customer_ids[other_idx], round(score, 2))
            for other_idx, score in candidates[:limit]
        ]

    return lookalike_map

In [None]:
# Save Lookalikes to CSV
def save_lookalikes(lookalike_map, output_path='Lookalike.csv'):
    """Write the lookalike mapping to CSV in long format.

    Parameters
    ----------
    lookalike_map : dict
        Maps a customer id to an iterable of ``(similar_customer_id, score)``
        pairs.
    output_path : str, default 'Lookalike.csv'
        Destination handed to ``DataFrame.to_csv``.

    The file has one row per (customer, lookalike) pair with the columns
    ``CustomerID``, ``SimilarCustomerID`` and ``SimilarityScore``. The header
    is written even when ``lookalike_map`` yields no rows.
    """
    columns = ['CustomerID', 'SimilarCustomerID', 'SimilarityScore']

    lookalike_list = [
        {
            'CustomerID': cust_id,
            'SimilarCustomerID': similar_cust,
            'SimilarityScore': score
        }
        for cust_id, lookalikes in lookalike_map.items()
        for similar_cust, score in lookalikes
    ]

    # Pinning the columns keeps the schema (and therefore the CSV header)
    # stable when there are no records to infer it from.
    lookalike_df = pd.DataFrame(lookalike_list, columns=columns)
    lookalike_df.to_csv(output_path, index=False)

In [None]:
# Run the Model
if __name__ == "__main__":
    # Merge the datasets
    merged_df = merge_datasets()

    # Create customer features
    customer_features = create_customer_features(merged_df)

    # Compute similarity matrix
    similarity_matrix = compute_similarity(customer_features)

    # Generate lookalikes for every customer that has a feature row
    first_20_customers = customers_df['CustomerID'].iloc[:20].tolist()
    lookalike_map = generate_lookalikes(similarity_matrix, customer_features['CustomerID'].tolist())

    # Feature rows come from grouping the merged transactions, so a customer
    # listed in customers_df but without any transaction has no entry in
    # lookalike_map. Skip those instead of failing with a KeyError, and
    # report them so the gap in the output is visible.
    skipped_customers = [cust_id for cust_id in first_20_customers if cust_id not in lookalike_map]
    if skipped_customers:
        print(f"No feature data for customers (skipped): {skipped_customers}")

    # Save lookalikes for first 20 customers
    filtered_lookalikes = {
        cust_id: lookalike_map[cust_id]
        for cust_id in first_20_customers
        if cust_id in lookalike_map
    }
    save_lookalikes(filtered_lookalikes)

    print("Lookalike model completed. Results saved to Lookalike.csv.")