In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the datasets
# Source files are read with pandas' default CSV options.
# NOTE(review): the absolute /content paths look environment-specific — confirm
# before running this notebook elsewhere.
customers_path = '/content/Customers.csv'
transactions_path = '/content/Transactions.csv'

customers = pd.read_csv(customers_path)
transactions = pd.read_csv(transactions_path)

In [None]:
# Step 1: Merge the datasets to form a unified customer profile
# Left join: every transaction row is kept and the matching customer columns
# are attached to it; a transaction whose CustomerID is missing from
# `customers` keeps NaN in those columns.
# validate='many_to_one' makes pandas raise MergeError when `customers` holds
# the same CustomerID more than once. Without the check such duplicates would
# silently repeat transaction rows and inflate the per-customer sums and
# counts aggregated in Step 2.
merged_data = transactions.merge(
    customers,
    on='CustomerID',
    how='left',
    validate='many_to_one',
)

In [None]:
# Step 2: Aggregate the data to create customer profiles
# Collapse the merged transactions to one row per CustomerID. Named
# aggregation keeps each output column under its source column's name, and
# as_index=False leaves CustomerID as a regular column.
customer_profiles = merged_data.groupby('CustomerID', as_index=False).agg(
    TotalValue=('TotalValue', 'sum'),           # Total spending
    Quantity=('Quantity', 'sum'),               # Total items purchased
    TransactionID=('TransactionID', 'count'),   # Number of transactions
    Region=('Region', 'first'),                 # Region of the customer
)

In [None]:
# Step 3: One-hot encode categorical features (e.g., Region)
# The listed columns are replaced by indicator columns; all other columns are
# carried over unchanged. Note that this rebinds `customer_profiles`, so the
# cell expects the un-encoded frame produced by Step 2 as its input.
categorical_features = ['Region']
customer_profiles = pd.get_dummies(data=customer_profiles, columns=categorical_features)

In [None]:
# Step 4: Normalize the numerical features for similarity calculations
# Only the three aggregate columns are rescaled; the remaining columns of
# `customer_profiles` are left untouched.
numerical_features = ['TotalValue', 'Quantity', 'TransactionID']

# Learn the per-column range first, then apply it to the same columns.
scaler = MinMaxScaler()
scaler.fit(customer_profiles[numerical_features])
scaled_values = scaler.transform(customer_profiles[numerical_features])
customer_profiles[numerical_features] = scaled_values


In [None]:
# Step 5: Compute cosine similarity matrix
# Keep the identifiers aside and use every remaining column as a feature, so
# row i / column j of the result compares customer i with customer j.
customer_ids = customer_profiles.loc[:, 'CustomerID']
feature_matrix = customer_profiles.drop('CustomerID', axis=1)

# Pairwise similarity of every row of the feature matrix against every row.
similarity_matrix = cosine_similarity(feature_matrix)

In [None]:
# Step 6: Create a lookalike map for the first 20 customers (C0001 - C0020)
# The slice below is positional: it takes the first 20 entries of
# `customer_ids`.
# NOTE(review): that equals C0001-C0020 only if each of those customers has a
# row in `customer_profiles` — confirm against the data.
lookalike_map = {}
for idx, customer_id in enumerate(customer_ids[:20]):  # First 20 customers
    # Pair every OTHER row position with its similarity to this customer.
    # Self is removed by position instead of assuming it sorts first: a tie at
    # the top score with an earlier row, or floating-point round-off, can rank
    # another customer ahead of self, and slicing [1:4] would then report the
    # customer as its own lookalike while dropping a genuine match.
    similarities = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Keep the 3 best scores; sorted() is stable, so ties keep row order.
    top_similarities = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:3]
    # Map customer_id to top 3 similar customers with their scores.
    # .iloc makes the positional lookup explicit, and float() stores a plain
    # Python float so the list text later written to CSV does not depend on
    # how the installed NumPy version prints its scalar types.
    lookalike_map[customer_id] = [
        (customer_ids.iloc[sim_idx], round(float(sim_score), 4))
        for sim_idx, sim_score in top_similarities
    ]

In [None]:
# Step 7: Save the lookalike map to a CSV file
# One record per customer: the ID plus its list of (lookalike ID, score) pairs.
lookalike_records = [
    dict(CustomerID=cust_id, Lookalikes=lookalikes)
    for cust_id, lookalikes in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_records)

# Save to CSV file (row index omitted from the output)
lookalike_df.to_csv('/content/lookalike.csv', index=False)

# Display the lookalike map (first rows only)
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0107, 0.9998), (C0137, 0.9997), (C0174, 0.9..."
1,C0002,"[(C0142, 0.9991), (C0177, 0.998), (C0178, 0.997)]"
2,C0003,"[(C0133, 0.9966), (C0190, 0.9966), (C0174, 0.9..."
3,C0004,"[(C0113, 0.9994), (C0102, 0.9979), (C0012, 0.9..."
4,C0005,"[(C0186, 0.9997), (C0159, 0.9996), (C0007, 0.9..."
