In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import json

# Read the three source tables; the relative paths are resolved against
# the current working directory.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

def preprocess_data():
    """Assemble the per-customer feature matrix used for lookalike scoring.

    Works on the module-level ``customers_df``, ``products_df`` and
    ``transactions_df`` frames. Side effects: the ``SignupDate`` and
    ``TransactionDate`` columns are converted to datetime in place, and a
    ``DaysSinceSignup`` column is added to ``customers_df``.

    Returns:
        A DataFrame with a ``CustomerID`` column followed by the transaction
        statistics, one-hot region flags, ``DaysSinceSignup`` and one
        purchase-count column per product category. Every merge uses pandas'
        default inner join, so only customers present in all of the
        intermediate tables are kept.
    """
    # Parse the date columns in place on the shared frames
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

    # Account age in whole days, measured against the current time
    account_age = pd.Timestamp.now() - customers_df['SignupDate']
    customers_df['DaysSinceSignup'] = account_age.dt.days

    # Per-customer spend / quantity / price aggregates
    spend_stats = transactions_df.groupby('CustomerID').agg({
        'TotalValue': ['sum', 'mean', 'count'],
        'Quantity': ['sum', 'mean'],
        'Price': ['mean']
    })
    # Collapse the (column, aggregate) MultiIndex into names like 'TotalValue_sum'
    spend_stats.columns = ['_'.join(pair).strip() for pair in spend_stats.columns]
    spend_stats = spend_stats.reset_index()  # expose 'CustomerID' as a column

    # One-hot encoded region, keyed by CustomerID (rows align on the shared index)
    region_flags = pd.get_dummies(customers_df['Region'], prefix='region').assign(
        CustomerID=customers_df['CustomerID']
    )

    # Number of transactions each customer made in every product category
    purchases = transactions_df.merge(products_df, on='ProductID')
    category_counts = pd.crosstab(
        index=purchases['CustomerID'],
        columns=purchases['Category'],
    )

    # Stitch the feature groups together on CustomerID
    signup_age = customers_df[['CustomerID', 'DaysSinceSignup']]
    return (
        spend_stats
        .merge(region_flags, on='CustomerID')
        .merge(signup_age, on='CustomerID')
        .merge(category_counts, on='CustomerID')
    )

def create_lookalike_model(feature_matrix):
    """Compute pairwise customer similarity from the feature matrix.

    Args:
        feature_matrix: DataFrame with a ``CustomerID`` column; every other
            column is treated as a feature.

    Returns:
        A ``(similarity_matrix, customer_ids)`` tuple. ``similarity_matrix``
        is the cosine similarity between the standardized feature rows, and
        ``customer_ids`` holds the ``CustomerID`` values in the same row
        order.
    """
    customer_ids = feature_matrix['CustomerID'].values
    raw_features = feature_matrix.drop(columns='CustomerID')

    # Standardize each feature column before comparing customers
    standardized = StandardScaler().fit_transform(raw_features)

    return cosine_similarity(standardized), customer_ids

def get_top_lookalikes(customer_id, similarity_matrix, customer_ids, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Args:
        customer_id: ID to look up in ``customer_ids``.
        similarity_matrix: Matrix whose row ``i`` holds the similarity of
            ``customer_ids[i]`` to every customer, in ``customer_ids`` order.
        customer_ids: Sequence of customer IDs aligned with the matrix rows.
        n: Maximum number of lookalikes to return; values below 1 yield an
            empty list.

    Returns:
        A list of ``(customer_id, similarity_score)`` tuples, most similar
        first. The queried customer is never part of the result.

    Raises:
        IndexError: If ``customer_id`` does not occur in ``customer_ids``.
    """
    customer_ids = np.asarray(customer_ids)

    # Locate the row belonging to the queried customer
    matches = np.flatnonzero(customer_ids == customer_id)
    if matches.size == 0:
        raise IndexError(f"customer_id {customer_id!r} not found in customer_ids")
    customer_idx = matches[0]

    similarity_scores = np.asarray(similarity_matrix[customer_idx])

    # Rank everyone by descending similarity, then remove the customer by
    # index. Dropping "the first entry" is not safe: another customer with a
    # tied score can sort ahead of the customer's own entry.
    ranked = np.argsort(similarity_scores)[::-1]
    similar_indices = ranked[ranked != customer_idx][:max(n, 0)]

    similar_customers = customer_ids[similar_indices]
    similar_scores = similarity_scores[similar_indices]

    return list(zip(similar_customers, similar_scores))

def generate_lookalike_csv(num_customers=20, top_n=3,
                           output_path='FirstName_LastName_Lookalike.csv'):
    """Score lookalikes for the first customers and write them to a CSV file.

    Builds the feature matrix and similarity model, then looks up the most
    similar customers for IDs ``C0001`` .. ``C<num_customers>`` (zero-padded
    to four digits). IDs that are absent from the feature matrix are skipped.

    Args:
        num_customers: How many leading customer IDs to process.
        top_n: Number of lookalikes to keep per customer.
        output_path: Destination of the CSV file. It has two columns:
            ``CustomerID`` and ``Lookalikes`` (a JSON-encoded list).

    Returns:
        A dict mapping each processed customer ID to a list of
        ``{'customer_id': str, 'similarity_score': float}`` dicts, most
        similar first.
    """
    # Process data and create model
    feature_matrix = preprocess_data()
    similarity_matrix, customer_ids = create_lookalike_model(feature_matrix)

    # Build the membership set once instead of scanning the array per ID
    known_ids = set(customer_ids)

    lookalike_dict = {}
    for number in range(1, num_customers + 1):
        customer_id = f'C{number:04d}'
        if customer_id not in known_ids:
            continue
        lookalikes = get_top_lookalikes(
            customer_id, similarity_matrix, customer_ids, n=top_n
        )
        # Convert numpy scalars to plain str/float so json.dumps accepts them
        lookalike_dict[customer_id] = [
            {'customer_id': str(cust), 'similarity_score': float(score)}
            for cust, score in lookalikes
        ]

    # Save to CSV, one row per customer with the lookalikes JSON-encoded
    results_df = pd.DataFrame({
        'CustomerID': list(lookalike_dict),
        'Lookalikes': [json.dumps(entries) for entries in lookalike_dict.values()],
    })
    results_df.to_csv(output_path, index=False)

    return lookalike_dict

if __name__ == "__main__":
    lookalike_results = generate_lookalike_csv()

    # Echo each customer's ranked matches so the CSV contents can be checked by eye
    for customer_id, matches in lookalike_results.items():
        print(f"\nCustomer {customer_id} lookalikes:")
        for rank, match in enumerate(matches, start=1):
            print(f"{rank}. Customer {match['customer_id']} (Similarity: {match['similarity_score']:.3f})")


Customer C0001 lookalikes:
1. Customer C0120 (Similarity: 0.782)
2. Customer C0112 (Similarity: 0.751)
3. Customer C0192 (Similarity: 0.736)

Customer C0002 lookalikes:
1. Customer C0106 (Similarity: 0.913)
2. Customer C0159 (Similarity: 0.898)
3. Customer C0134 (Similarity: 0.874)

Customer C0003 lookalikes:
1. Customer C0129 (Similarity: 0.811)
2. Customer C0151 (Similarity: 0.769)
3. Customer C0195 (Similarity: 0.754)

Customer C0004 lookalikes:
1. Customer C0113 (Similarity: 0.935)
2. Customer C0104 (Similarity: 0.821)
3. Customer C0012 (Similarity: 0.759)

Customer C0005 lookalikes:
1. Customer C0007 (Similarity: 0.927)
2. Customer C0140 (Similarity: 0.858)
3. Customer C0186 (Similarity: 0.837)

Customer C0006 lookalikes:
1. Customer C0187 (Similarity: 0.786)
2. Customer C0168 (Similarity: 0.733)
3. Customer C0171 (Similarity: 0.714)

Customer C0007 lookalikes:
1. Customer C0005 (Similarity: 0.927)
2. Customer C0140 (Similarity: 0.852)
3. Customer C0186 (Similarity: 0.751)

Custo