In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Load the three input tables from CSV files; the paths are relative, so they
# are resolved against the current working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [None]:
# Convert the date columns of the module-level frames to pandas datetimes.
# NOTE: main() re-reads the CSV files into its own local frames, so these
# conversions do not affect what main() computes.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import logging

# Configure the root logger: emit INFO and above, and prefix every record
# with its timestamp and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


# Function to normalize features
def normalize_features(customer_features):
    """Standardize the numeric columns of a feature table.

    Columns whose dtype is ``float64`` or ``int64`` are scaled with
    ``StandardScaler``; every other column is carried over unchanged.

    Args:
        customer_features: DataFrame of features, one row per entity.

    Returns:
        A new DataFrame that keeps the index of ``customer_features``. The
        scaled numeric columns come first, followed by the untouched
        non-numeric columns. When there is nothing to scale (no numeric
        columns, or no rows) an unmodified copy of the input is returned.
        The input frame itself is never modified.
    """
    logging.info("=" * 15 + " Normalizing feature data " + "=" * 15)

    # Select numeric columns
    numerical_cols = customer_features.select_dtypes(include=['float64', 'int64']).columns

    # StandardScaler rejects input with zero features or zero samples, so
    # short-circuit instead of raising on an otherwise harmless frame.
    if len(numerical_cols) == 0 or len(customer_features) == 0:
        return customer_features.copy()

    # Normalize only numeric columns. Passing the original index keeps the
    # result aligned with the caller's frame instead of resetting it to 0..n-1.
    scaler = StandardScaler()
    normalized_features = pd.DataFrame(
        scaler.fit_transform(customer_features[numerical_cols]),
        columns=numerical_cols,
        index=customer_features.index,
    )

    # Retain non-numeric columns without modification. ``.values`` copies by
    # position, which stays correct even if the index contains duplicates.
    for col in customer_features.columns:
        if col not in numerical_cols:
            normalized_features[col] = customer_features[col].values

    return normalized_features


# Function to calculate similarity and find lookalikes
def calculate_lookalikes(customers, transactions, products, top_n=3):
    """Find the most similar customers for every customer with transactions.

    Transactions are joined with customer and product details, aggregated to
    one feature row per ``CustomerID`` (total quantity, total value, mean
    product price), standardized with ``normalize_features`` and compared
    with cosine similarity.

    Args:
        customers: DataFrame with a ``CustomerID`` column.
        transactions: DataFrame with ``CustomerID``, ``ProductID``,
            ``Quantity`` and ``TotalValue`` columns.
        products: DataFrame with ``ProductID`` and ``Price`` columns.
        top_n: Number of lookalikes to return per customer (default 3).

    Returns:
        Dict mapping each customer ID to a list of at most ``top_n`` dicts
        ``{'CustomerID': <similar id>, 'Score': <similarity rounded to 3
        decimals>}``, ordered by descending score. Only customers that
        appear in ``transactions`` are included. An empty dict is returned
        when there are no transactions.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    logging.info("=" * 15 + " Calculating Lookalikes " + "=" * 15)

    # Merge datasets to include transaction and product details
    merged_data = pd.merge(transactions, customers, on='CustomerID', how='left')
    merged_data = pd.merge(merged_data, products, on='ProductID', how='left')

    # pandas only adds the '_x'/'_y' suffixes when both sides of the merge
    # carry a 'Price' column; fall back to the plain name when only the
    # products table has one.
    price_col = 'Price_y' if 'Price_y' in merged_data.columns else 'Price'

    # Aggregate data to get customer-level features
    customer_features = merged_data.groupby('CustomerID').agg({
        'Quantity': 'sum',  # Total quantity purchased
        'TotalValue': 'sum',  # Total value of transactions
        price_col: 'mean',  # Average product price
    }).reset_index()

    # Plain Python list of IDs: cheap positional lookup, and integer IDs are
    # not upcast to float as they would be when read from a mixed-dtype row.
    customer_ids = customer_features['CustomerID'].tolist()
    if not customer_ids:
        return {}

    # Normalize the features (exclude CustomerID)
    customer_features_normalized = normalize_features(customer_features.drop(columns=['CustomerID']))

    # Compute similarity using cosine similarity (exclude CustomerID column)
    similarity_matrix = cosine_similarity(customer_features_normalized)

    # Find the top_n similar customers for each customer
    lookalikes = {}
    for idx, customer_id in enumerate(customer_ids):
        scores = similarity_matrix[idx]
        # Stable descending order: ties keep ascending positional order.
        ranked = np.argsort(-scores, kind='stable').tolist()
        # Drop the customer by position rather than assuming it ranks first:
        # a tie with an identical customer, or an all-zero feature row, can
        # put someone else at the head of the ranking.
        neighbours = [other for other in ranked if other != idx][:top_n]

        lookalikes[customer_id] = [
            {'CustomerID': customer_ids[other], 'Score': round(scores[other], 3)}
            for other in neighbours
        ]

    return lookalikes


# Function to generate Lookalike.csv
def generate_lookalike_csv(lookalikes, output_file="Lookalike.csv"):
    """Write the lookalike mapping to a CSV file, one row per pair.

    Args:
        lookalikes: Dict mapping a customer ID to a list of dicts that each
            have a ``'CustomerID'`` (the similar customer) and a ``'Score'``
            key.
        output_file: Path of the CSV file to write.

    The file has the columns ``CustomerID``, ``SimilarCustomerID`` and
    ``Score``. The header row is written even when ``lookalikes`` is empty.
    """
    logging.info("=" * 15 + " Generating Lookalike.csv " + "=" * 15)

    columns = ['CustomerID', 'SimilarCustomerID', 'Score']
    lookalike_data = [
        {
            'CustomerID': customer_id,
            'SimilarCustomerID': similar['CustomerID'],
            'Score': similar['Score'],
        }
        for customer_id, similar_customers in lookalikes.items()
        for similar in similar_customers
    ]

    # Pin the column list so an empty mapping still yields a header row
    # instead of a file with no columns.
    lookalike_df = pd.DataFrame(lookalike_data, columns=columns)
    lookalike_df.to_csv(output_file, index=False)
    logging.info(f"Lookalike.csv generated successfully at {output_file}")


# Main function
def main():
    """Run the whole pipeline: read the CSV inputs, compute lookalikes, write the result.

    Reads ``Customers.csv``, ``Products.csv`` and ``Transactions.csv`` from the
    current working directory and writes the output through
    ``generate_lookalike_csv`` using its default file name.
    """
    logging.info("=" * 15 + " Loading Data " + "=" * 15)

    # Read the inputs in the same order as before: customers, products, transactions.
    source_files = ('Customers.csv', 'Products.csv', 'Transactions.csv')
    customers, products, transactions = [pd.read_csv(path) for path in source_files]

    # Compute the lookalike mapping and hand it straight to the CSV writer.
    generate_lookalike_csv(calculate_lookalikes(customers, transactions, products))

# Run the pipeline only when this file is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()
