# In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

def load_and_prepare_data(data_dir='data'):
    """
    Load the customer, product and transaction tables and parse their dates.

    Parameters
    ----------
    data_dir : str or path-like, default 'data'
        Directory containing 'Customers.csv', 'Products.csv' and
        'Transactions.csv'. The default reproduces the previously
        hard-coded location.

    Returns
    -------
    tuple of DataFrame
        (customers, products, transactions). customers['SignupDate'] and
        transactions['TransactionDate'] are converted with pd.to_datetime;
        products is returned exactly as read.

    Raises
    ------
    KeyError
        If 'SignupDate' or 'TransactionDate' is missing from its file.
    """
    # Local import: pathlib is not part of this file's top-level imports.
    from pathlib import Path

    base = Path(data_dir)

    # Load datasets
    customers = pd.read_csv(base / 'Customers.csv')
    products = pd.read_csv(base / 'Products.csv')
    transactions = pd.read_csv(base / 'Transactions.csv')

    # Convert dates to datetime so date arithmetic works downstream
    customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
    transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

    return customers, products, transactions


def create_customer_features(customers, transactions, products):
    """
    Build one feature row per customer that appears in `transactions`.

    Columns produced, in order: 'CustomerID', 'Recency' (whole days between
    the customer's latest transaction and the latest transaction overall),
    'Frequency' (number of transaction rows), 'Monetary' (sum of
    'TotalValue'), 'AvgOrderValue' (mean of 'TotalValue'), followed by one
    indicator column per distinct value of customers['Region'].

    Customers with no transactions are not present in the result, because
    the region indicators are left-joined onto the transaction-derived table.
    `products` is accepted but not used.
    """
    per_customer = transactions.groupby('CustomerID')
    spend = per_customer['TotalValue']

    # Recency is measured against the newest transaction in the data set.
    reference_date = transactions['TransactionDate'].max()
    days_since_last = (reference_date - per_customer['TransactionDate'].max()).dt.days

    rfm = pd.concat(
        [
            days_since_last.rename('Recency'),
            per_customer.size().rename('Frequency'),
            spend.sum().rename('Monetary'),
            spend.mean().rename('AvgOrderValue'),
        ],
        axis=1,
    )

    # One indicator column per region, keyed by CustomerID for the join.
    region_flags = pd.get_dummies(customers.set_index('CustomerID')['Region'])

    # Attach regions, then expose CustomerID as an ordinary column.
    return rfm.join(region_flags).reset_index()


def calculate_similarity_scores(customer_features):
    """
    Calculate pairwise cosine similarity between customers.

    Every column except 'CustomerID' is standardised with StandardScaler and
    cosine similarity is then computed between the scaled rows.

    Parameters
    ----------
    customer_features : DataFrame
        Must contain a 'CustomerID' column; all remaining columns are passed
        to StandardScaler as features.

    Returns
    -------
    similarity_matrix
        Result of cosine_similarity on the scaled rows; entry [i, j] compares
        row i with row j of `customer_features`.
    customer_ids : Series
        The 'CustomerID' values, re-indexed 0..n-1 so that both the position
        and the index label of an id equal its row/column in
        `similarity_matrix`.
    """
    # Re-index the ids: the input frame may carry a non-default index (e.g.
    # after filtering), and consumers that use the index label as a matrix
    # row number would otherwise address the wrong row.
    customer_ids = customer_features['CustomerID'].reset_index(drop=True)
    features = customer_features.drop(columns='CustomerID')

    # Scale features so no single column dominates the angle between rows
    scaled_features = StandardScaler().fit_transform(features)

    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(scaled_features)

    return similarity_matrix, customer_ids

def get_top_lookalikes(customer_id, similarity_matrix, customer_ids, n=3):
    """
    Get top n lookalike customers for a given customer.

    Parameters
    ----------
    customer_id
        Identifier to look up in `customer_ids`.
    similarity_matrix
        Square matrix whose row/column i corresponds to the i-th entry of
        `customer_ids`.
    customer_ids
        Sequence (e.g. a pandas Series) of identifiers, ordered like the
        rows of `similarity_matrix`. If `customer_id` occurs more than once,
        the first occurrence is used.
    n : int, default 3
        Maximum number of lookalikes to return; values <= 0 yield [].

    Returns
    -------
    list of dict
        Up to `n` dicts with keys 'CustomerID' and 'SimilarityScore', ordered
        from most to least similar. The queried customer itself is never
        included.

    Raises
    ------
    IndexError
        If `customer_id` is not present in `customer_ids`.
    """
    # Locate the customer by position, not by index label, so the lookup is
    # correct even when `customer_ids` does not have a 0..n-1 index.
    ids = np.asarray(customer_ids)
    matches = np.flatnonzero(ids == customer_id)
    if matches.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_ids")
    customer_index = matches[0]
    customer_similarities = np.asarray(similarity_matrix[customer_index])

    # Rank by descending similarity. A stable sort keeps tied customers in
    # their original order, making the output deterministic.
    ranked = np.argsort(-customer_similarities, kind='stable')

    # Remove self explicitly: dropping "rank 0" is wrong when another
    # customer ties with the self-similarity score.
    similar_indices = ranked[ranked != customer_index][:max(n, 0)]

    # Create recommendations with similarity scores
    return [
        {
            'CustomerID': ids[idx],
            'SimilarityScore': customer_similarities[idx],
        }
        for idx in similar_indices
    ]

# Pipeline: raw CSVs -> per-customer feature table -> pairwise similarity.
customers, products, transactions = load_and_prepare_data()
customer_features = create_customer_features(customers, transactions, products)
similarity_matrix, customer_ids = calculate_similarity_scores(customer_features)

# Lookalikes (default n=3) for the first 20 rows of the customers table,
# keyed by the customer they were computed for.
lookalike_results = {
    customer_id: get_top_lookalikes(customer_id, similarity_matrix, customer_ids)
    for customer_id in customers['CustomerID'][:20]
}

# One output row per customer; the lookalikes are flattened into the string
# form of a list of "<CustomerID>:<score to 3 decimals>" entries.
output_rows = [
    {
        'CustomerID': customer_id,
        'Lookalikes': str([
            f"{rec['CustomerID']}:{rec['SimilarityScore']:.3f}" for rec in recs
        ]),
    }
    for customer_id, recs in lookalike_results.items()
]
output_df = pd.DataFrame(output_rows)

# Persist the result without the DataFrame index column.
output_df.to_csv('LalitChandra_Routhu_Lookalike.csv', index=False)