In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the three raw tables (paths are relative to the working directory).
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')
customers = pd.read_csv('Customers.csv')

# Attach customer and product attributes to every transaction. Left joins keep
# every transaction row even when its CustomerID / ProductID has no match; the
# joined-in columns are NaN for such rows.
merged = pd.merge(transactions, customers, on="CustomerID", how="left")
merged = pd.merge(merged, products, on="ProductID", how="left")


def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    counts = values.value_counts()
    # value_counts() drops NaN, so a group whose values are all missing yields
    # an empty result; indexing it with [0] would raise IndexError.
    return counts.index[0] if len(counts) else np.nan


# Feature engineering: collapse transaction-level rows to one row per customer.
customer_features = merged.groupby('CustomerID').agg({
    'TotalValue': 'sum',          # Total spending
    'Quantity': 'sum',            # Total quantity purchased
    'ProductID': 'nunique',       # Number of unique products purchased
    'Category': _most_frequent,   # Most purchased category (by transaction count)
    'Region': 'first'             # First non-null region seen for the customer
}).reset_index()

# One-hot encode the categorical columns (Region and Category).
# Every level keeps its own indicator column: dropping the first level (as is
# done for regression to avoid collinearity) would encode it as all zeros, so
# two customers sharing that level would get no similarity credit while
# customers sharing any other level would. dtype=float keeps the feature matrix
# uniformly numeric regardless of the pandas version's default dummy dtype.
customer_features_encoded = pd.get_dummies(
    customer_features,
    columns=['Region', 'Category'],
    dtype=float,
)

# Standardize the numerical features so no single one dominates the cosine
# similarity purely because of its scale.
scaler = StandardScaler()
numerical_columns = ['TotalValue', 'Quantity', 'ProductID']
customer_features_encoded[numerical_columns] = scaler.fit_transform(
    customer_features_encoded[numerical_columns]
)

# Compute the customer-by-customer cosine similarity matrix. Row/column i of
# similarity_matrix corresponds to customer_ids.iloc[i].
customer_ids = customer_features_encoded['CustomerID']
feature_matrix = customer_features_encoded.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(feature_matrix)

# Function to find top 3 lookalikes for a given customer ID
def find_top_lookalikes(customer_index, similarity_matrix, customer_ids, top_n=3):
    """Return the ``top_n`` customers most similar to the one at ``customer_index``.

    Args:
        customer_index: Positional row index of the target customer in
            ``similarity_matrix`` (and in ``customer_ids``).
        similarity_matrix: 2-D array-like where entry ``[i, j]`` is the
            similarity between the customers at positions ``i`` and ``j``.
        customer_ids: pandas Series of customer IDs, positionally aligned
            with the rows of ``similarity_matrix``.
        top_n: Maximum number of lookalikes to return (default 3).

    Returns:
        A list of ``(customer_id, similarity_score)`` tuples ordered by
        descending similarity. The target customer is never included, and
        fewer than ``top_n`` entries are returned when there are not enough
        other customers.
    """
    customer_similarities = np.asarray(similarity_matrix[customer_index])
    n_customers = customer_similarities.shape[0]
    # Normalize a negative index so it can be compared against sorted positions.
    self_position = customer_index % n_customers if n_customers else customer_index

    # A stable sort keeps tied scores in positional order, so results are
    # deterministic across runs and platforms.
    ranked = np.argsort(-customer_similarities, kind="stable")
    # Exclude the target by position rather than assuming it sorts first:
    # another customer can tie with (or, through rounding, exceed) the
    # self-similarity, and an all-zero feature row can have self-similarity 0.
    similar_indices = ranked[ranked != self_position][:max(top_n, 0)]

    similar_customers = customer_ids.iloc[similar_indices].tolist()
    similarity_scores = customer_similarities[similar_indices].tolist()
    return list(zip(similar_customers, similarity_scores))

# Generate lookalikes for the first 20 customers (by position in customer_ids).
# The enumerate position is used as the row index into similarity_matrix, so
# this assumes customer_ids is positionally aligned with that matrix.
lookalike_results = {
    customer_id: find_top_lookalikes(position, similarity_matrix, customer_ids)
    for position, customer_id in enumerate(customer_ids.iloc[:20])
}

# Flatten to one row per (customer, lookalike) pair for the "Lookalike.csv" format.
lookalike_data = [
    [cust_id, similar_cust_id, score]
    for cust_id, similar_customers in lookalike_results.items()
    for similar_cust_id, score in similar_customers
]

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'])

# Save to CSV. The path is defined once so the status message below always
# reports the file that was actually written.
lookalike_output_path = 'Harshita_Daga_Lookalike.csv'
lookalike_df.to_csv(lookalike_output_path, index=False)

print(f"Lookalike model completed. Results saved to {lookalike_output_path}.")


Lookalike model completed. Results saved to Lookalike.csv.
