<a href="https://colab.research.google.com/github/kavyakapoor200/Ecommerce-insight-toolkit/blob/main/Kavya_Kapoor_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
# Load the merged dataset created earlier in the project.
import pandas as pd
merged_data = pd.read_csv("Merged_Data.csv")


def _most_common(values):
    """Return the most frequent non-null value of a Series, or pd.NA if there is none."""
    modes = values.mode()
    # mode() is empty when every value is missing; indexing it with [0] would raise.
    return modes.iloc[0] if not modes.empty else pd.NA


# Build one row of aggregated features per customer for the lookalike model.
# Named aggregation produces the final column names directly, so no separate
# in-place rename step is needed.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),              # total spending
        TransactionCount=('TransactionID', 'count'),   # number of transactions
        UniqueProducts=('ProductID', 'nunique'),       # number of distinct products bought
        FavoriteCategory=('Category', _most_common),   # most frequently purchased category
    )
    .reset_index()  # turn CustomerID back into a regular column
)

# Attach the profile feature (Region) to every customer.
# NOTE(review): `customers` is not defined anywhere in this notebook; it must
# already exist in the kernel (presumably the customer profile table) or this
# line raises NameError on a fresh Restart & Run All - confirm and load it here.
customer_features = customer_features.merge(customers[['CustomerID', 'Region']], on='CustomerID')

print(customer_features.head())

  CustomerID  TotalSpend  TransactionCount  UniqueProducts FavoriteCategory  \
0      C0001     3354.52                 5               5      Electronics   
1      C0002     1862.74                 4               4         Clothing   
2      C0003     2725.38                 4               4       Home Decor   
3      C0004     5354.88                 8               8            Books   
4      C0005     2034.24                 3               3      Electronics   

          Region  
0  South America  
1           Asia  
2  South America  
3  South America  
4           Asia  


In [66]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# How many customers to produce lookalikes for, and how many lookalikes each.
N_LOOKALIKE_CUSTOMERS = 20
TOP_K_LOOKALIKES = 3

# One-hot encode the categorical columns (each distinct Region / FavoriteCategory
# value becomes its own indicator column) because the similarity computation
# needs purely numeric input. The result goes into a NEW frame so that
# `customer_features` is left untouched and this cell can be re-run safely;
# overwriting it would make a second run fail with KeyError on the consumed columns.
customer_features_encoded = pd.get_dummies(customer_features, columns=['Region', 'FavoriteCategory'])

# Standardise every feature column (numeric and one-hot alike).
# CustomerID is an identifier, not a feature, so it is excluded.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features_encoded.drop(columns=['CustomerID']))

# Pairwise cosine similarity between customers, labelled by CustomerID on both axes.
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features_encoded['CustomerID'],
    columns=customer_features_encoded['CustomerID'],
)
print(similarity_df.head())  # preview of the similarity scores

# Top lookalikes for the first N_LOOKALIKE_CUSTOMERS customers.
lookalikes = {}
for customer in customer_features_encoded['CustomerID'][:N_LOOKALIKE_CUSTOMERS]:
    # Remove the customer by label instead of skipping position 0 of the sorted
    # scores: when another customer ties at similarity 1.0 (or rounding pushes
    # the self-score just below it), position 0 is not guaranteed to be self.
    similar_customers = similarity_df[customer].drop(customer).nlargest(TOP_K_LOOKALIKES)
    lookalikes[customer] = similar_customers.index.tolist()

print(lookalikes)

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.285590  0.255542  0.197848  0.334010  0.250969   
C0002      -0.285590  1.000000 -0.191848 -0.464241  0.450607 -0.334847   
C0003       0.255542 -0.191848  1.000000  0.020218 -0.144352  0.276483   
C0004       0.197848 -0.464241  0.020218  1.000000 -0.549339  0.611017   
C0005       0.334010  0.450607 -0.144352 -0.549339  1.000000 -0.266054   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.337738 -0.199337 -0.266436 -0.297421  ...  0.269622  0.945783   
C0002       0.433603 -0.340226  0.472209  0.444493  ... -0.326076 -0.160530   
C0003      -0.159235  0.088885 -0.120111 -0.198306  ...  0.268215  0.321034   
C0004      -0.530930  0.284727 -0.591258 -0.487534  ...  0.710227 -0.019591   
C0005  

In [67]:
import json

# How many customers to export lookalikes for, and how many lookalikes each.
N_LOOKALIKE_CUSTOMERS = 20
TOP_K_LOOKALIKES = 3

# Build the lookalike map: CustomerID -> [(similar CustomerID, similarity score), ...]
lookalike_map = {}
for customer in customer_features['CustomerID'][:N_LOOKALIKE_CUSTOMERS]:
    # Drop the customer by label rather than skipping position 0 of the sorted
    # scores; with ties at 1.0 the self-entry is not guaranteed to sort first.
    top_matches = similarity_df[customer].drop(customer).nlargest(TOP_K_LOOKALIKES)
    # The selected Series already carries the scores, so no second lookup into
    # similarity_df is needed. Cast to built-in float so the tuples stringify as
    # plain numbers in the CSV (NumPy >= 2 scalars render as `np.float64(...)`).
    lookalike_map[customer] = [
        (similar_cust, float(score)) for similar_cust, score in top_matches.items()
    ]

# Save the map as JSON (tuples are written as two-element lists).
with open('Lookalike_Map.json', 'w') as f:
    json.dump(lookalike_map, f, indent=4)

print("Lookalike Map generated successfully!")

# Convert to DataFrame and save.
# NOTE(review): this writes one column per customer with tuple-valued cells;
# confirm that this layout is what consumers of Lookalike.csv expect.
lookalike_df = pd.DataFrame(lookalike_map)
lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike CSV generated successfully!")

Lookalike Map generated successfully!
Lookalike CSV generated successfully!
