# Task 2: Lookalike Model

In [3]:
# Import the libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the datasets: read the three input CSV files into DataFrames,
# in the order customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [7]:
# Merging datasets
# Attach product details to every transaction. A left join keeps every
# transaction row, even when its ProductID has no match in `products`.
# validate='many_to_one' makes pandas raise MergeError if `products` contains
# a duplicated ProductID; without the check such duplicates would silently
# duplicate transaction rows and inflate the per-customer TotalValue sums.
transactions = transactions.merge(
    products, on='ProductID', how='left', validate='many_to_one'
)

In [9]:
# Aggregate transaction data by CustomerID
def _join_categories(categories):
    """Collapse one customer's product categories into a space-separated string.

    Each category becomes exactly one token: whitespace inside a category name
    is replaced by '_' so that a later split on ' ' cannot break a multi-word
    category into several pieces. Missing categories (NaN, e.g. from a
    transaction whose ProductID had no match in the left merge) are skipped
    instead of making ' '.join raise TypeError.
    """
    tokens = ('_'.join(str(category).split()) for category in categories.dropna())
    # Drop empty tokens so the result never contains doubled separators.
    return ' '.join(token for token in tokens if token)


customer_transactions = transactions.groupby('CustomerID').agg({
    'Category': _join_categories,  # Space-separated category tokens per customer
    'TotalValue': 'sum'  # Total spending by customer
}).reset_index()

In [11]:
# Left-join the per-customer aggregates onto the customer table so every
# customer is kept; customers without any aggregated transactions get an
# empty 'Category' string and a 'TotalValue' of 0.
customer_data = pd.merge(
    customers,
    customer_transactions,
    on='CustomerID',
    how='left',
).fillna(value={'Category': '', 'TotalValue': 0})

In [13]:
# Feature engineering:
#   * one indicator column per space-separated token found in 'Category'
#   * the customer's total spend ('TotalValue')
product_categories_encoded = customer_data['Category'].str.get_dummies(sep=' ')
features = pd.concat(
    objs=[customer_data.loc[:, ['TotalValue']], product_categories_encoded],
    axis='columns',
)


In [15]:
# Standardize every feature column with a freshly fitted StandardScaler
# (fit on `features`, then transform the same data).
normalized_features = StandardScaler().fit(features).transform(features)

In [17]:
# Pairwise cosine similarity between all rows of the standardized features;
# entry [i][j] is the similarity between row i and row j.
similarity_matrix = cosine_similarity(X=normalized_features)

In [19]:
# Generate recommendations:
#   * loop over the first NUM_TARGET_CUSTOMERS customers (fewer if the data is smaller)
#   * rank every *other* customer by similarity score in descending order
#   * keep the TOP_N best matches, with scores rounded to 4 decimals
NUM_TARGET_CUSTOMERS = 20
TOP_N = 3

customer_ids = customer_data['CustomerID']
# Positional list of IDs: row i of similarity_matrix corresponds to the i-th
# row of customer_data, so look IDs up by position rather than by index label.
_ids_by_position = customer_ids.tolist()

lookalike_map = {}
for idx in range(min(NUM_TARGET_CUSTOMERS, len(_ids_by_position))):
    # Exclude the customer itself explicitly. Slicing off the first sorted
    # entry is not reliable: when another customer ties for the top score
    # (or rounding pushes self-similarity slightly below it), the first entry
    # may be someone else and the customer would be listed as its own lookalike.
    candidates = [
        (other, score)
        for other, score in enumerate(similarity_matrix[idx])
        if other != idx
    ]
    # sorted() is stable, so tied scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_map[_ids_by_position[idx]] = [
        # float(...) yields a plain Python float so the list stringifies as
        # e.g. 0.9994 rather than a NumPy scalar repr when written to CSV.
        (_ids_by_position[other], round(float(score), 4))
        for other, score in candidates[:TOP_N]
    ]


In [21]:
# Save the results to a CSV file: one row per customer, with the list of
# (lookalike CustomerID, score) tuples stored in the 'Lookalikes' column.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map.keys()),
    'Lookalikes': list(lookalike_map.values())
})

# The file name previously ended with a stray space ('...Lookalike.csv '),
# which produced a file whose extension is not '.csv'; write to the intended name.
lookalike_df.to_csv('Harsh_Jolania_Lookalike.csv', index=False)

print("Optimized Lookalike.csv has been generated successfully.")

Optimized Lookalike.csv has been generated successfully.


In [23]:
# Preview the first five rows of the lookalike table as plain text.
print(lookalike_df.head(n=5))

  CustomerID                                         Lookalikes
0      C0001   [(C0152, 1.0), (C0127, 0.9994), (C0174, 0.9916)]
1      C0002  [(C0159, 0.9992), (C0062, 0.9992), (C0144, 0.9...
2      C0003  [(C0106, 0.9987), (C0166, 0.9973), (C0129, 0.9...
3      C0004  [(C0148, 0.9998), (C0012, 0.9996), (C0018, 0.9...
4      C0005  [(C0199, 0.9999), (C0197, 0.9998), (C0140, 0.9...


In [25]:
# Print the plain-text rendering of the whole lookalike table.
print(str(lookalike_df))

   CustomerID                                         Lookalikes
0       C0001   [(C0152, 1.0), (C0127, 0.9994), (C0174, 0.9916)]
1       C0002  [(C0159, 0.9992), (C0062, 0.9992), (C0144, 0.9...
2       C0003  [(C0106, 0.9987), (C0166, 0.9973), (C0129, 0.9...
3       C0004  [(C0148, 0.9998), (C0012, 0.9996), (C0018, 0.9...
4       C0005  [(C0199, 0.9999), (C0197, 0.9998), (C0140, 0.9...
5       C0006      [(C0079, 1.0), (C0124, 1.0), (C0187, 0.9952)]
6       C0007  [(C0069, 0.9981), (C0005, 0.9942), (C0199, 0.9...
7       C0008  [(C0039, 0.9999), (C0090, 0.9989), (C0067, 0.9...
8       C0009     [(C0198, 1.0), (C0072, 0.937), (C0092, 0.936)]
9       C0010  [(C0132, 0.9997), (C0061, 0.9971), (C0142, 0.9...
10      C0011  [(C0107, 0.9997), (C0183, 0.9996), (C0016, 0.9...
11      C0012  [(C0004, 0.9996), (C0148, 0.9988), (C0018, 0.9...
12      C0013  [(C0101, 0.9968), (C0087, 0.9953), (C0165, 0.9...
13      C0014  [(C0151, 0.9999), (C0097, 0.9997), (C0060, 0.9...
14      C0015  [(C0123, 0