In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# Load the datasets
customers_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")
transactions_df = pd.read_csv("Transactions.csv")

# Preprocess the transactions data
# Aggregate transaction data for each customer
agg_transactions = transactions_df.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'count'),
    num_products=('ProductID', 'nunique'),
    most_recent_transaction=('TransactionDate', lambda x: (datetime.now() - pd.to_datetime(x).max()).days)
).reset_index()

# Merge with customer data
customer_data = pd.merge(customers_df, agg_transactions, on='CustomerID', how='left')

# Feature engineering: Adding customer profile features
# 1. Region as categorical variable (One-hot encoding)
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# 2. Time since signup as a numerical feature
customer_data['signup_year'] = pd.to_datetime(customer_data['SignupDate']).dt.year
customer_data['years_since_signup'] = datetime.now().year - customer_data['signup_year']

# Select relevant columns for similarity calculation
features = ['total_spent', 'num_transactions', 'num_products', 'most_recent_transaction', 'years_since_signup']
profile_data = customer_data[features]

# Check for missing values and handle them
print(profile_data.isnull().sum())  # Check missing values for each column

# If there are missing values, fill them with the column's mean or drop rows with NaN values
profile_data.fillna(profile_data.mean(), inplace=True)  # Filling NaN with mean

# Alternatively, you can drop rows with NaN values:
# profile_data.dropna(inplace=True)

# Standardize the features
scaler = StandardScaler()
profile_data_scaled = scaler.fit_transform(profile_data)

# Calculate Cosine Similarity between all customers
similarity_matrix = cosine_similarity(profile_data_scaled)

# Function to get top 3 lookalikes
def get_top_lookalikes(customer_id, num_recommendations=3):
    customer_index = customer_data[customer_data['CustomerID'] == customer_id].index[0]
    similarity_scores = similarity_matrix[customer_index]

    # Sort the similarity scores in descending order, excluding the customer itself
    similar_customers = np.argsort(similarity_scores)[::-1][1:num_recommendations+1]
    similar_scores = similarity_scores[similar_customers]

    # Map customer_id to their similar customers with scores
    similar_customer_ids = customer_data['CustomerID'].iloc[similar_customers].values
    lookalikes = [(customer_id, similar_customer_ids[i], similar_scores[i]) for i in range(len(similar_customer_ids))]

    return lookalikes

# Prepare the Lookalike.csv with recommendations for customers C0001 to C0020
lookalike_recommendations = []
for cust_id in customer_data['CustomerID'][:20]:
    lookalike_recommendations.extend(get_top_lookalikes(cust_id))

# Create DataFrame for the recommendations
lookalike_df = pd.DataFrame(lookalike_recommendations, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Save to Lookalike.csv
lookalike_df.to_csv('Lookalike.csv', index=False)

# Show the first few recommendations
print(lookalike_df.head())


total_spent                1
num_transactions           1
num_products               1
most_recent_transaction    1
years_since_signup         0
dtype: int64
  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0152         0.989977
1      C0001               C0180         0.976599
2      C0001               C0106         0.960658
3      C0002               C0010         0.991217
4      C0002               C0029         0.988858


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  profile_data.fillna(profile_data.mean(), inplace=True)  # Filling NaN with mean
