In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Load data
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [21]:
# Merge datasets for a comprehensive view
merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")


In [23]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 109.4+ KB


In [24]:
# Aggregate transaction data by customer
customer_features = merged_data.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum",
    "Price_x": "mean",
    "Category": lambda x: ",".join(x),
    "Region": "first"
}).reset_index()

In [25]:
# Encode categorical features
encoder = LabelEncoder()
customer_features["Region"] = encoder.fit_transform(customer_features["Region"])
customer_features["Category"] = encoder.fit_transform(customer_features["Category"])


In [36]:
# Normalize numeric features
numeric_features = ["TotalValue", "Quantity", "Price_x"]
customer_features[numeric_features] = customer_features[numeric_features].apply(lambda x: (x - x.mean()) / x.std())
print(customer_features[numeric_features])

     TotalValue  Quantity   Price_x
0     -0.061546 -0.121726  0.094432
1     -0.875535 -0.446873 -0.901742
2     -0.404836  0.203421 -1.091357
3      1.029949  1.666584 -0.446576
4     -0.781956 -0.934594  0.284863
..          ...       ...       ...
194    0.826968 -0.121726  2.084347
195   -0.839572 -0.609447 -0.641459
196   -1.383486 -1.584888 -0.459940
197   -0.811945 -0.609447 -0.303440
198    0.704589  0.528568  0.355222

[199 rows x 3 columns]


In [35]:
# Compute similarity
feature_matrix = customer_features.drop(columns=["CustomerID"])
similarity_matrix = cosine_similarity(feature_matrix)
print(feature_matrix)
print(similarity_matrix)

     TotalValue  Quantity   Price_x  Category  Region
0     -0.061546 -0.121726  0.094432       155       3
1     -0.875535 -0.446873 -0.901742       170       0
2     -0.404836  0.203421 -1.091357        69       3
3      1.029949  1.666584 -0.446576       110       3
4     -0.781956 -0.934594  0.284863       116       0
..          ...       ...       ...       ...     ...
194    0.826968 -0.121726  2.084347       160       1
195   -0.839572 -0.609447 -0.641459       151       1
196   -1.383486 -1.584888 -0.459940        59       1
197   -0.811945 -0.609447 -0.303440       123       1
198    0.704589  0.528568  0.355222        83       0

[199 rows x 5 columns]
[[1.         0.99978227 0.99955317 ... 0.99935692 0.99990436 0.99974096]
 [0.99978227 1.         0.99898546 ... 0.99939301 0.99995917 0.9998206 ]
 [0.99955317 0.99898546 1.         ... 0.99901894 0.99925603 0.99874596]
 ...
 [0.99935692 0.99939301 0.99901894 ... 1.         0.99956567 0.99872269]
 [0.99990436 0.99995917 0.99925

In [29]:
# Generate Lookalike Recommendations
lookalike_results = {}
for idx, customer_id in enumerate(customer_features["CustomerID"]):
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similar_customers = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:4]  # Top 3 similar customers
    lookalike_results[customer_id] = [(customer_features.iloc[i[0]]["CustomerID"], i[1]) for i in similar_customers]


In [33]:
# Prepare Lookalike.csv
lookalike_list = []
for cust_id, recommendations in lookalike_results.items():
    for rec in recommendations:
        lookalike_list.append({"CustomerID": cust_id, "SimilarCustomerID": rec[0], "Score": rec[1]})
print(lookalike_list)

[{'CustomerID': 'C0001', 'SimilarCustomerID': 'C0152', 'Score': 0.9999971019111189}, {'CustomerID': 'C0001', 'SimilarCustomerID': 'C0181', 'Score': 0.9999931087277139}, {'CustomerID': 'C0001', 'SimilarCustomerID': 'C0133', 'Score': 0.9999930182385431}, {'CustomerID': 'C0002', 'SimilarCustomerID': 'C0027', 'Score': 0.9999988685147337}, {'CustomerID': 'C0002', 'SimilarCustomerID': 'C0178', 'Score': 0.9999955317324155}, {'CustomerID': 'C0002', 'SimilarCustomerID': 'C0177', 'Score': 0.9999890630040126}, {'CustomerID': 'C0003', 'SimilarCustomerID': 'C0034', 'Score': 0.999909781189737}, {'CustomerID': 'C0003', 'SimilarCustomerID': 'C0030', 'Score': 0.9999091999410602}, {'CustomerID': 'C0003', 'SimilarCustomerID': 'C0031', 'Score': 0.9999039909102194}, {'CustomerID': 'C0004', 'SimilarCustomerID': 'C0102', 'Score': 0.9999616098352663}, {'CustomerID': 'C0004', 'SimilarCustomerID': 'C0113', 'Score': 0.9999600104531012}, {'CustomerID': 'C0004', 'SimilarCustomerID': 'C0163', 'Score': 0.99994798315

In [32]:
lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model completed and Lookalike.csv saved!")


Lookalike model completed and Lookalike.csv saved!
