In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from datetime import datetime

def prepare_customer_features(customers_df, products_df, transactions_df, reference_date=None):
    """Build a standardized per-customer feature matrix.

    The features combined for each customer are:

    * ``CustomerLifetime`` - whole days between ``SignupDate`` and the
      reference date.
    * ``TransactionCount``, ``TotalSpend``, ``AvgTransactionValue``,
      ``TotalQuantity``, ``AvgQuantity`` - aggregates of the customer's rows
      in ``transactions_df``.
    * ``Region_*`` - one-hot encoding of ``Region``.
    * One column per product ``Category`` holding the share of the customer's
      purchased quantity that falls in that category (each row sums to 1
      before scaling).

    Missing values (for example customers without transactions) are replaced
    by 0, then every feature column is standardized with ``StandardScaler``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Must contain ``CustomerID``, ``SignupDate`` and ``Region``. It is not
        modified.
    products_df : pandas.DataFrame
        Must contain ``ProductID`` and ``Category``.
    transactions_df : pandas.DataFrame
        Must contain ``CustomerID``, ``ProductID``, ``TransactionID``,
        ``TotalValue`` and ``Quantity``.
    reference_date : datetime-like, optional
        Date against which customer lifetime is measured. Defaults to
        ``datetime.now()``; pass a fixed value for reproducible features.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``customers_df``: the ``CustomerID`` column followed
        by the scaled feature columns.
    """
    # Derive values into local objects instead of assigning columns on
    # customers_df, so the caller's frame is left untouched.
    signup_dates = pd.to_datetime(customers_df['SignupDate'])
    current_date = datetime.now() if reference_date is None else pd.to_datetime(reference_date)
    customer_lifetime = (current_date - signup_dates).dt.days

    # Both frames get a fresh 0..n-1 index: the column merge below resets the
    # index anyway, and the region dummies are joined positionally afterwards,
    # so they must share that index regardless of customers_df's own index.
    customer_base = (customers_df[['CustomerID']]
                     .reset_index(drop=True)
                     .assign(CustomerLifetime=customer_lifetime.to_numpy()))
    region_dummies = (pd.get_dummies(customers_df['Region'], prefix='Region')
                      .reset_index(drop=True))

    customer_transactions = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean'],
        'Quantity': ['sum', 'mean']
    }).reset_index()
    # Flatten the two-level column index produced by the multi-function agg.
    customer_transactions.columns = ['CustomerID', 'TransactionCount', 'TotalSpend',
                                      'AvgTransactionValue', 'TotalQuantity', 'AvgQuantity']

    # Inner merge: transactions whose ProductID is absent from products_df do
    # not contribute to the category preferences.
    merged_trans = transactions_df.merge(products_df[['ProductID', 'Category']], on='ProductID')
    category_preferences = pd.crosstab(merged_trans['CustomerID'],
                                        merged_trans['Category'],
                                        values=merged_trans['Quantity'],
                                        aggfunc='sum',
                                        normalize='index').fillna(0)

    # customer_transactions has one row per CustomerID, so the left merge keeps
    # exactly one output row per customer, in order, with a 0..n-1 index.
    feature_matrix = (customer_base
                      .merge(customer_transactions, on='CustomerID', how='left')
                      .merge(region_dummies, left_index=True, right_index=True)
                      .merge(category_preferences, left_on='CustomerID', right_index=True, how='left'))
    # Customers without transactions (and rows with a missing SignupDate) get 0.
    feature_matrix = feature_matrix.fillna(0)

    scaler = StandardScaler()
    feature_cols = feature_matrix.columns.difference(['CustomerID'])
    feature_matrix[feature_cols] = scaler.fit_transform(feature_matrix[feature_cols])

    return feature_matrix

def find_similar_customers(customers_df, feature_matrix, customer_id, n_recommendations=3):
    """Return the customers most similar to ``customer_id``.

    Similarity is ``1 / (1 + d)`` where ``d`` is the Euclidean distance between
    the two customers' rows in ``feature_matrix`` (all columns except
    ``CustomerID``), so identical feature vectors score 1.0.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Must contain ``CustomerID`` and ``CustomerName``; used to attach names.
    feature_matrix : pandas.DataFrame
        A ``CustomerID`` column plus numeric feature columns.
    customer_id
        Identifier of the customer to find neighbours for.
    n_recommendations : int, default 3
        Maximum number of similar customers to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID``, ``CustomerName``, ``SimilarityScore``, ordered
        from most to least similar. The frame is empty when ``feature_matrix``
        holds no other customer. Candidates whose ID is missing from
        ``customers_df`` are dropped by the inner merge that adds the name.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in ``feature_matrix``.
    """
    feature_cols = feature_matrix.columns.difference(['CustomerID'])
    is_target = feature_matrix['CustomerID'] == customer_id
    if not is_target.any():
        raise IndexError(f"CustomerID {customer_id!r} not found in feature_matrix")

    target_vector = feature_matrix.loc[is_target, feature_cols].iloc[0].to_numpy(dtype=float)
    others = feature_matrix.loc[~is_target]

    # One vectorized pass over all candidates. Besides being faster than
    # iterrows(), this keeps the CustomerID dtype intact (iterrows() upcasts
    # integer IDs to float when every column is numeric).
    deltas = others[feature_cols].to_numpy(dtype=float) - target_vector
    distances = np.sqrt((deltas ** 2).sum(axis=1))

    # Building from a slice of feature_matrix guarantees both columns exist
    # even when there are no candidates, so sort_values cannot fail.
    recommendations = (others[['CustomerID']]
                       .assign(SimilarityScore=1.0 / (1.0 + distances))
                       .sort_values('SimilarityScore', ascending=False, kind='stable')
                       .head(n_recommendations))
    recommendations = recommendations.merge(customers_df[['CustomerID', 'CustomerName']], on='CustomerID')

    return recommendations[['CustomerID', 'CustomerName', 'SimilarityScore']]


In [3]:
import pandas as pd

# Read the data
customers_df = pd.read_csv('data/Customers.csv')
products_df = pd.read_csv('data/Products.csv')
transactions_df = pd.read_csv('data/Transactions.csv')

# Report size: how many customers to look up (taken from the top of
# Customers.csv) and how many neighbours to list for each of them.
N_TARGET_CUSTOMERS = 20
N_SIMILAR = 3

# NOTE: despite its name, `model` is the scaled feature matrix returned by
# prepare_customer_features, not a fitted estimator.
model = prepare_customer_features(customers_df, products_df, transactions_df)

similarity_results = []

for customer_id in customers_df['CustomerID'][:N_TARGET_CUSTOMERS]:
    similar_customers = find_similar_customers(customers_df, model, customer_id=customer_id,
                                               n_recommendations=N_SIMILAR)
    similar_ids = similar_customers['CustomerID'].tolist()[:N_SIMILAR]
    # Pad short results so every row has the same width and the DataFrame
    # constructor below cannot fail on a column-count mismatch.
    similar_ids += [None] * (N_SIMILAR - len(similar_ids))
    similarity_results.append([customer_id] + similar_ids)

# Column names are derived from N_SIMILAR so they cannot drift from the row width.
similar_id_columns = [f'SimilarID{rank}' for rank in range(1, N_SIMILAR + 1)]
similarity_df = pd.DataFrame(similarity_results, columns=['CustomerID'] + similar_id_columns)

similarity_df.to_excel('similar_customers.xlsx', index=False)
print("Similar customers saved to 'similar_customers.xlsx'")


Similar customers saved to 'similar_customers.xlsx'
