In [10]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd

#Import the three datasets, merge them, and store the result in `data`

In [3]:
# Read the three raw tables from the current working directory.
transactions = pd.read_csv('Transactions.csv')
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")

# Build one wide table with a row per transaction: first attach the customer
# columns, then the product columns. Both joins are left joins, so a
# transaction row is kept even when its CustomerID / ProductID has no match
# (the joined columns are then NaN for that row).
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID", how="left"),
    products,
    on="ProductID",
    how="left",
)


# Feature Engineering<br>
#Here we compute four features for each customer:<br>
#1. TotalValue - the total spending of the customer<br>
#2. Quantity - the total quantity purchased by the customer<br>
#3. Region - the most common region for the customer<br>
#4. Category - the category the customer purchased most often<br>

In [4]:
def _most_common(values):
    """Return the most frequent non-null entry of ``values``.

    ``Series.mode()`` drops NaN by default and returns an empty Series when
    nothing is left, so indexing its first element directly would raise for a
    customer whose joined Region/Category is entirely missing. In that case
    NaN is returned instead, which keeps the aggregation running.
    When several values tie, the first one in ``mode()``'s sorted result wins.
    """
    modes = values.mode()
    if modes.empty:
        return float("nan")
    return modes.iloc[0]


# One row per customer:
#   TotalValue - summed over all of the customer's transactions
#   Quantity   - summed over all of the customer's transactions
#   Region     - most frequent region on the customer's rows
#   Category   - most frequent product category on the customer's rows
customer_features = (
    data.groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'Region': _most_common,
        'Category': _most_common,
    })
    .reset_index()  # turn CustomerID back into an ordinary column
)



# Encoding the categorical values in the dataframe

In [5]:
# One-hot encode the two categorical features.
# Every level keeps its own indicator column (no drop_first): these columns
# feed a similarity measure, not a regression, and dropping a level would make
# customers that share the dropped Region/Category look as if they had nothing
# in common on that feature while all other levels still count as a match.
# dtype=float keeps the feature matrix purely numeric instead of mixing
# boolean indicator columns with the float columns.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region', 'Category'],
    dtype=float,
)

# Using StandardScaler to standardise the numerical features

In [6]:
# Columns that carry magnitudes (as opposed to the one-hot indicator columns).
numerical_cols = ['TotalValue', 'Quantity']

# Standardise those columns: learn the per-column statistics first, then
# write the transformed values back over the raw ones.
scaler = StandardScaler()
scaler.fit(customer_features[numerical_cols])
customer_features[numerical_cols] = scaler.transform(customer_features[numerical_cols])

# This is the main part: here we calculate the cosine similarity between the customers

In [7]:
# Keep the IDs aside so that row/column i of the similarity matrix can be
# mapped back to a customer.
customer_ids = customer_features['CustomerID']

# Everything except the ID column is a feature; convert to a plain array.
features_matrix = customer_features.drop(columns='CustomerID').to_numpy()

# Pairwise cosine similarity between all customers (square matrix, one
# row and one column per customer, in the same order as customer_ids).
similarity_matrix = cosine_similarity(features_matrix)

# After calculating the cosine similarity, we take the three most similar customers for each of C0001 to C0020

In [9]:
# Tunables for this cell.
N_TARGET_CUSTOMERS = 20        # how many customers (from the top of customer_ids) to score
TOP_K = 3                      # lookalikes kept per customer
OUTPUT_PATH = "Lookalike.csv"  # file the result is written to

# The first N rows of customer_ids; presumably C0001..C0020 given the sorted
# groupby output - verify against the data.
target_ids = customer_ids.iloc[:N_TARGET_CUSTOMERS]

# Maps each target customer ID to a list of (lookalike ID, similarity score).
lookalikes = {}
for i, customer_id in enumerate(target_ids):
    # Rank every *other* customer by similarity, highest first. The customer
    # itself is excluded by index rather than by assuming it sorts first:
    # an identical feature vector or floating-point rounding can put another
    # customer ahead of (or level with) the self-similarity entry.
    ranked = sorted(
        ((j, score) for j, score in enumerate(similarity_matrix[i]) if j != i),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # .iloc: j is a position in the matrix, not an index label.
    # float(): store plain Python floats so the text written to the CSV does
    # not depend on how the installed NumPy version prints its scalar types.
    lookalikes[customer_id] = [
        (customer_ids.iloc[j], float(score)) for j, score in ranked[:TOP_K]
    ]

lookalike_df = pd.DataFrame({
    "CustomerID": target_ids,
    "Lookalikes": [lookalikes[cid] for cid in target_ids],
})
lookalike_df.to_csv(OUTPUT_PATH, index=False)

# Report the path that was actually written and the number of customers scored.
print(
    f"Lookalike Model completed for the first {len(target_ids)} customers! "
    f"Results saved in {OUTPUT_PATH}"
)


Lookalike Model completed for the first 20 customers! Results saved in Lookalike_first_20.csv


<-------------------------------------------------------END--------------------------------------------------->