In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three input tables; the relative paths resolve against the
# notebook's working directory.
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

In [19]:
def create_customer_features(customers_df=None, transactions_df=None):
    """Build one feature row per customer from profile and transaction data.

    Parameters
    ----------
    customers_df : pandas.DataFrame, optional
        Customer profiles. The columns ``CustomerID``, ``SignupDate`` and
        ``Region`` are read here; any other columns are carried through
        unchanged. Defaults to the notebook-level ``customers`` frame.
    transactions_df : pandas.DataFrame, optional
        Transactions with the columns ``CustomerID``, ``TransactionID``,
        ``TotalValue`` and ``Quantity``. Defaults to the notebook-level
        ``transactions`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``customers_df`` (left join), with:

        * ``TransactionID_count``, ``TotalValue_sum``, ``TotalValue_mean``,
          ``Quantity_sum``, ``Quantity_mean`` -- per-customer aggregates,
          0 for customers without any transaction;
        * ``SignupDate`` converted to datetime;
        * ``days_since_signup`` -- whole days between the customer's signup
          and the most recent signup date in the frame (so the newest
          customer gets 0);
        * one ``Region_<value>`` indicator column per region, replacing the
          original ``Region`` column.
    """
    # Fall back to the frames loaded earlier in the notebook so existing
    # zero-argument calls keep working.
    if customers_df is None:
        customers_df = customers
    if transactions_df is None:
        transactions_df = transactions

    # Per-customer transaction aggregates
    transaction_stats = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean'],
        'Quantity': ['sum', 'mean']
    })

    # Flatten the two-level column index into names such as 'TotalValue_sum'
    transaction_stats.columns = ['_'.join(col) for col in transaction_stats.columns]
    transaction_stats = transaction_stats.reset_index()

    # Left join keeps customers that never transacted (their aggregates are NaN
    # at this point and are zero-filled below).
    features = customers_df.merge(transaction_stats, on='CustomerID', how='left')

    # Signup recency relative to the latest signup present in the data
    signup_dates = pd.to_datetime(features['SignupDate'])
    features['SignupDate'] = signup_dates
    features['days_since_signup'] = (signup_dates.max() - signup_dates).dt.days

    # One-hot encode region
    features = pd.get_dummies(features, columns=['Region'])

    # Zero-fill numeric features only: a blanket fillna(0) would also write the
    # integer 0 into missing names or signup dates.
    numeric_columns = features.select_dtypes(include='number').columns
    features[numeric_columns] = features[numeric_columns].fillna(0)

    return features


In [20]:
def find_lookalikes(customer_features, target_customers, n_recommendations=3):
    """Return the most similar customers for each requested customer.

    Similarity is the cosine similarity between standardized feature rows.
    Every column of ``customer_features`` except ``CustomerID``,
    ``CustomerName`` and ``SignupDate`` is used as a feature.

    Parameters
    ----------
    customer_features : pandas.DataFrame
        One row per customer; must contain ``CustomerID``, ``CustomerName``
        and ``SignupDate`` plus the feature columns. Any row index is
        accepted -- rows are addressed by position internally.
    target_customers : iterable
        ``CustomerID`` values to find lookalikes for.
    n_recommendations : int, default 3
        Maximum number of lookalikes per target.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID``, ``SimilarCustomerID``, ``SimilarityScore``;
        up to ``n_recommendations`` rows per target, most similar first.
        The columns are present even when there are no rows.

    Raises
    ------
    IndexError
        If a target ID does not occur in ``customer_features['CustomerID']``.
    """
    # Prepare features for similarity calculation
    feature_matrix = customer_features.drop(['CustomerID', 'CustomerName', 'SignupDate'], axis=1)

    # Standardize so that no single feature dominates the cosine similarity
    scaled_features = StandardScaler().fit_transform(feature_matrix)
    similarity_matrix = cosine_similarity(scaled_features)

    # The similarity matrix is positional, so look customers up by position
    # rather than by index label (the two only coincide for a default index).
    customer_ids = customer_features['CustomerID']
    id_values = customer_ids.to_numpy()

    recommendations = []
    for target_id in target_customers:
        positions = np.flatnonzero((customer_ids == target_id).to_numpy())
        if positions.size == 0:
            # Same exception type as the bare `.index[0]` lookup used to raise,
            # but with a message that names the offending ID.
            raise IndexError(f"CustomerID {target_id!r} not found in customer_features")
        target_pos = positions[0]

        # Rank everyone by descending similarity, then drop the target itself
        # explicitly: with tied scores it is not guaranteed to sort first.
        ranked = similarity_matrix[target_pos].argsort()[::-1]
        neighbours = ranked[ranked != target_pos][:n_recommendations]

        for pos in neighbours:
            recommendations.append({
                'CustomerID': target_id,
                'SimilarCustomerID': id_values[pos],
                'SimilarityScore': similarity_matrix[target_pos, pos]
            })

    return pd.DataFrame(
        recommendations,
        columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
    )

In [22]:
# Build the per-customer feature table. Called without arguments, so the
# function works from the `customers` and `transactions` frames loaded above.
customer_features = create_customer_features()

In [24]:
# Generate recommendations for customers C0001-C0020
target_customers = [f'C{i:04d}' for i in range(1, 21)]

# Fail fast, naming the offending IDs, instead of letting the lookup inside
# find_lookalikes die with an opaque out-of-bounds IndexError.
known_customer_ids = set(customer_features['CustomerID'])
missing_targets = [cid for cid in target_customers if cid not in known_customer_ids]
assert not missing_targets, f"Target customers missing from customer_features: {missing_targets}"

recommendations = find_lookalikes(customer_features, target_customers)

In [27]:
# Save recommendations as CSV in the working directory; index=False keeps the
# pandas row index out of the file.
# NOTE(review): 'FirstName_LastName' in the filename looks like an unfilled
# template placeholder -- confirm the intended output name.
recommendations.to_csv('FirstName_LastName_Lookalike.csv', index=False)

In [28]:
# Preview the first ten recommendation rows as plain text, under a heading line
print("Sample recommendations:", recommendations.head(10), sep="\n")

Sample recommendations:
  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0174         0.963480
1      C0001             C0152         0.951797
2      C0001             C0107         0.947931
3      C0002             C0106         0.921003
4      C0002             C0159         0.909760
5      C0002             C0005         0.906467
6      C0003             C0129         0.866113
7      C0003             C0190         0.860165
8      C0003             C0039         0.779408
9      C0004             C0113         0.982045
