In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

In [3]:
#Merge datasets
merged = transactions.merge(products, on="ProductID", how="left").merge(customers, on="CustomerID", how="left")

In [6]:
print(merged.columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName', 'Category',
       'Price_y', 'CustomerName', 'Region', 'SignupDate'],
      dtype='object')


In [7]:

# Feature Engineering
print("Feature Engineering: Creating customer-product interaction matrix and aggregating features...")
customer_product_matrix = pd.pivot_table(
    merged, 
    values="TotalValue", 
    index="CustomerID", 
    columns="ProductID", 
    aggfunc="sum", 
    fill_value=0
)

Feature Engineering: Creating customer-product interaction matrix and aggregating features...


In [8]:
 #Add customer demographic features to the matrix
customer_features = customers.set_index("CustomerID").drop(columns=["CustomerName"])
combined_features = customer_product_matrix.join(customer_features)


In [10]:
print(combined_features.dtypes)


P001          float64
P002          float64
P003          float64
P004          float64
P005          float64
               ...   
P098          float64
P099          float64
P100          float64
Region         object
SignupDate     object
Length: 102, dtype: object


In [11]:
combined_features = pd.get_dummies(combined_features, drop_first=True)


In [12]:
combined_features = combined_features.fillna(0)  # or use another imputation method


In [14]:
#Normalize data
scaler = StandardScaler()
normalized_features = scaler.fit_transform(combined_features)


In [15]:
# Compute cosine similarity
print("Calculating cosine similarity between customers...")
similarity_matrix = cosine_similarity(normalized_features)
similarity_df = pd.DataFrame(similarity_matrix, index=combined_features.index, columns=combined_features.index)

Calculating cosine similarity between customers...


In [16]:
# Get top 3 similar customers for each customer
lookalike_data = {}
for customer_id in similarity_df.index[:20]:  # First 20 customers
    similar_customers = similarity_df[customer_id].nlargest(4).iloc[1:]  # Top 3 (excluding self)
    lookalike_data[customer_id] = [(sim_id, round(score, 2)) for sim_id, score in similar_customers.items()]


In [17]:
# Create Lookalike.csv
print("Saving Lookalike.csv...")
lookalike_df = pd.DataFrame({
    "CustomerID": lookalike_data.keys(),
    "Similar_Customers_and_Scores": [lookalike_data[cust_id] for cust_id in lookalike_data.keys()]
})
lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv saved successfully!")

Saving Lookalike.csv...
Lookalike.csv saved successfully!


In [18]:
print("\n--- Lookalike Mapping for First 20 Customers ---")
print(lookalike_df)


--- Lookalike Mapping for First 20 Customers ---
   CustomerID                   Similar_Customers_and_Scores
0       C0001  [(C0104, 0.11), (C0194, 0.09), (C0065, 0.08)]
1       C0002  [(C0166, 0.51), (C0091, 0.21), (C0030, 0.19)]
2       C0003  [(C0134, 0.17), (C0181, 0.16), (C0144, 0.12)]
3       C0004  [(C0175, 0.19), (C0105, 0.15), (C0070, 0.14)]
4       C0005  [(C0096, 0.14), (C0023, 0.13), (C0055, 0.12)]
5       C0006   [(C0040, 0.13), (C0126, 0.1), (C0171, 0.08)]
6       C0007  [(C0053, 0.39), (C0118, 0.31), (C0079, 0.26)]
7       C0008  [(C0091, 0.14), (C0165, 0.13), (C0028, 0.13)]
8       C0009  [(C0162, 0.08), (C0072, 0.06), (C0140, 0.05)]
9       C0010  [(C0143, 0.16), (C0094, 0.15), (C0092, 0.14)]
10      C0011  [(C0135, 0.14), (C0120, 0.14), (C0173, 0.11)]
11      C0012  [(C0164, 0.15), (C0173, 0.15), (C0158, 0.13)]
12      C0013   [(C0169, 0.2), (C0099, 0.14), (C0092, 0.13)]
13      C0014  [(C0159, 0.08), (C0075, 0.05), (C0128, 0.05)]
14      C0015  [(C0059, 0.15), (C00