In [1]:
#Step 1: Import Libraries and Load Data

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three input tables (paths are relative to the working directory)
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Show the first rows of each table as a quick sanity check
for table in (customers, products, transactions):
    print(table.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [2]:
#Step 2: Data Preparation

# Merge Transactions with Products.
# A Category column left over from an earlier run of this cell is dropped
# first: merging a second time would otherwise create Category_x / Category_y
# and the aggregation below would fail on the missing 'Category' key.
transactions = (
    transactions
    .drop(columns=['Category'], errors='ignore')
    .merge(products[['ProductID', 'Category']], on='ProductID', how='left')
)


def _most_common_category(categories):
    """Return the most frequent non-null category, or NaN when there is none."""
    counts = categories.value_counts()  # NaN entries are excluded by default
    return counts.index[0] if len(counts) else float('nan')


# Aggregate transaction features per customer
customer_features = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Price': 'mean',
    'Category': _most_common_category  # Most common category
}).reset_index()

# Merge with Customers dataset.
# Feature columns from a previous run are removed first so that re-running the
# cell replaces them instead of producing TotalValue_x / TotalValue_y, etc.
numeric_feature_columns = ['TotalValue', 'Quantity', 'Price']
feature_columns = numeric_feature_columns + ['Category']
customers = (
    customers
    .drop(columns=feature_columns, errors='ignore')
    .merge(customer_features, on='CustomerID', how='left')
)

# Customers without transactions: zero activity for the numeric features and an
# explicit 'Unknown' label for the category. Only the derived columns are
# filled, so missing profile fields (e.g. Region) are not overwritten with 0.
customers[numeric_feature_columns] = customers[numeric_feature_columns].fillna(0)
customers['Category'] = customers['Category'].fillna('Unknown')

print(customers.head())


  CustomerID        CustomerName         Region  SignupDate  TotalValue  \
0      C0001    Lawrence Carroll  South America  2022-07-10     3354.52   
1      C0002      Elizabeth Lutz           Asia  2022-02-13     1862.74   
2      C0003      Michael Rivera  South America  2024-03-07     2725.38   
3      C0004  Kathleen Rodriguez  South America  2022-10-09     5354.88   
4      C0005         Laura Weber           Asia  2022-08-15     2034.24   

   Quantity       Price     Category  
0      12.0  278.334000  Electronics  
1      10.0  208.920000   Home Decor  
2      14.0  195.707500   Home Decor  
3      23.0  240.636250        Books  
4       7.0  291.603333  Electronics  


In [3]:
#Step 3: Create Similarity Matrix

# Numeric columns that feed the similarity computation
numerical_features = ['TotalValue', 'Quantity', 'Price']
feature_matrix = customers[numerical_features]

# Standardize the columns before comparing customers
scaler = StandardScaler()
customers_scaled = scaler.fit_transform(feature_matrix)

# Pairwise cosine similarity between every pair of customers
similarity_matrix = cosine_similarity(customers_scaled)

# Label both axes with the customer IDs so rows/columns can be looked up by ID
customer_ids = customers['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.007041 -0.631946 -0.859581  0.856884  0.535879   
C0002      -0.007041  1.000000  0.778325 -0.488415  0.509339 -0.845597   
C0003      -0.631946  0.778325  1.000000  0.165898 -0.142181 -0.992820   
C0004      -0.859581 -0.488415  0.165898  1.000000 -0.989955 -0.052640   
C0005       0.856884  0.509339 -0.142181 -0.989955  1.000000  0.024587   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.984218 -0.945042  0.823996 -0.382075  ...  0.931258  0.819770   
C0002      -0.114098 -0.243549  0.560490  0.926798  ...  0.339062  0.562188   
C0003      -0.699051  0.412339 -0.081705  0.956619  ... -0.316225 -0.075445   
C0004      -0.774871  0.963187 -0.985980 -0.128577  ... -0.956532 -0.976055   
C0005  

In [5]:
#Step 4: Find Top 3 Lookalikes


# Define a function to get the top N similar customers
def get_top_lookalikes(customer_id, similarity_df, top_n=3):
    """
    Get top N similar customers for a given customer ID.

    The customer's own entry is removed by label before ranking. Skipping the
    first sorted element instead is unsafe: when another customer ties with
    the self-similarity (identical feature vectors) or exceeds it through
    floating-point noise, the customer itself can end up in the result while a
    genuine lookalike is dropped.

    Parameters:
        customer_id (str): The ID of the customer to find similar customers for.
        similarity_df (DataFrame): Pairwise similarity DataFrame whose column
            labels (and index labels) are customer IDs.
        top_n (int): Number of similar customers to return.

    Returns:
        List of tuples containing similar customer IDs and similarity scores,
        ordered from most to least similar. Shorter than top_n when fewer
        other customers exist.

    Raises:
        KeyError: If customer_id is not a column of similarity_df.
        ValueError: If top_n is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Similarities of every customer to `customer_id`, without the customer itself
    scores = similarity_df[customer_id].drop(labels=customer_id, errors='ignore')

    # A stable sort keeps tied customers in their original order, so results
    # are reproducible across runs.
    similar_customers = scores.sort_values(ascending=False, kind='mergesort').head(top_n)
    return list(zip(similar_customers.index, similar_customers.values))

# Map each of the first 20 customers to its list of (lookalike ID, score) pairs
lookalikes = {
    customer_id: get_top_lookalikes(customer_id, similarity_df)
    for customer_id in customers['CustomerID'][:20]
}

# Flatten the mapping into one [customer, lookalike, rounded score] row per match
lookalike_list = [
    [cust_id, similar_cust_id, round(score, 4)]
    for cust_id, similar_list in lookalikes.items()
    for similar_cust_id, score in similar_list
]

# Persist the rows to Lookalike.csv
lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=['cust_id', 'similar_cust_id', 'similarity_score'],
)
lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv has been generated successfully!")



Lookalike.csv has been generated successfully!


In [6]:
#Step 5: Evaluate the Model


# Evaluation: Compare features between customers and their top lookalikes
def evaluate_lookalikes(lookalike_df, customers, features):
    """
    Evaluate the lookalike model by comparing key features of customers and their lookalikes.

    For every (cust_id, similar_cust_id) pair, the absolute difference of each
    listed feature is computed and averaged across the features. The features
    are compared on their raw scale, so large-valued columns dominate the average.

    Parameters:
        lookalike_df (DataFrame): Lookalike mapping with cust_id, similar_cust_id, and similarity_score.
        customers (DataFrame): Customer dataset with a CustomerID column and the feature columns.
        features (list): List of numeric feature columns to compare.

    Returns:
        DataFrame: One row per pair with columns cust_id, similar_cust_id,
        similarity_score and avg_feature_difference (rounded to 4 decimals).
        The columns are present even when lookalike_df is empty.

    Raises:
        ValueError: If customers contains duplicate CustomerID values, or if
            lookalike_df references an ID that is not in customers.
    """
    result_columns = ['cust_id', 'similar_cust_id', 'similarity_score', 'avg_feature_difference']

    # Index the features by customer ID once instead of scanning the whole
    # table twice for every pair.
    feature_table = customers.set_index('CustomerID')[features]
    if not feature_table.index.is_unique:
        raise ValueError("customers contains duplicate CustomerID values")

    source_ids = lookalike_df['cust_id'].to_numpy()
    similar_ids = lookalike_df['similar_cust_id'].to_numpy()

    # Fail with a clear message rather than an opaque shape-mismatch error
    unknown_ids = (set(source_ids) | set(similar_ids)) - set(feature_table.index)
    if unknown_ids:
        raise ValueError(
            "Customer IDs not found in customers: {}".format(sorted(unknown_ids, key=str))
        )

    # Row i of each array holds the feature values for pair i
    source_values = feature_table.reindex(source_ids).to_numpy(dtype=float)
    similar_values = feature_table.reindex(similar_ids).to_numpy(dtype=float)

    # Mean absolute difference across the features, per pair
    avg_difference = abs(source_values - similar_values).mean(axis=1).round(4)

    return pd.DataFrame(
        {
            'cust_id': source_ids,
            'similar_cust_id': similar_ids,
            'similarity_score': lookalike_df['similarity_score'].to_numpy(),
            'avg_feature_difference': avg_difference,
        },
        columns=result_columns,
    )

# Transaction-derived columns on which each customer is compared with its lookalikes
comparison_features = ['TotalValue', 'Quantity', 'Price']
evaluation_results = evaluate_lookalikes(
    lookalike_df=lookalike_df,
    customers=customers,
    features=comparison_features,
)

# Preview the first rows of the evaluation
print("Evaluation Summary:")
print(evaluation_results.head())

# Write the full evaluation table to disk
evaluation_results.to_csv("Lookalike_Evaluation.csv", index=False)
print("Lookalike_Evaluation.csv has been generated successfully!")


Evaluation Summary:
  cust_id similar_cust_id  similarity_score  avg_feature_difference
0   C0001           C0135            0.9989                 81.0120
1   C0001           C0092            0.9985                120.8995
2   C0001           C0085            0.9957                277.7687
3   C0002           C0029            0.9999                 25.9908
4   C0002           C0077            0.9959                355.5644
Lookalike_Evaluation.csv has been generated successfully!
