In [20]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [9]:
# Load the three source tables from CSV files in the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [10]:
# Parse the date columns into pandas datetime values (in place on each frame)
for frame, date_column in (
    (customers, "SignupDate"),
    (transactions, "TransactionDate"),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [21]:
# Enrich every transaction with its customer and product attributes.
# Left joins keep all transactions even when a lookup key has no match.
transactions_merged = (
    transactions
    .merge(customers, how="left", on="CustomerID")
    .merge(products, how="left", on="ProductID")
)

In [12]:
# Select customers C0001 through C0020 as the lookalike targets
first_20_ids = [f"C{number:04d}" for number in range(1, 21)]
first_20_customers = customers.loc[customers["CustomerID"].isin(first_20_ids)]

In [13]:
# Build one row of purchase-behaviour features per customer
# from the merged transaction table.
per_customer = transactions_merged.groupby("CustomerID")
customer_features = per_customer.agg(
    # Sum of TotalValue over the customer's transactions
    total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
    # Number of transaction rows for the customer
    total_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
    # Mean Quantity per transaction
    avg_quantity=pd.NamedAgg(column="Quantity", aggfunc="mean"),
    # Count of distinct ProductID values bought
    num_unique_products=pd.NamedAgg(column="ProductID", aggfunc="nunique"),
).reset_index()

In [14]:
# Merge with demographic data (Region encoded as dummy variables).
# dtype=float keeps the indicator columns numeric on every pandas version
# (the default is bool on pandas >= 2.0 and uint8 before that).
region_dummies = pd.get_dummies(
    customers[["CustomerID", "Region"]], columns=["Region"], dtype=float
)
# This cell reassigns customer_features from itself, so drop any indicator
# columns left by a previous run first; otherwise re-running the cell would
# merge them again as suffixed duplicates (Region_*_x / Region_*_y) and
# double-weight region in the similarity computation.
customer_features = customer_features.drop(
    columns=region_dummies.columns.drop("CustomerID"), errors="ignore"
).merge(region_dummies, on="CustomerID", how="left")

In [15]:
# Standardize every feature column; CustomerID is an identifier, not a feature
is_feature = customer_features.columns != "CustomerID"
feature_cols = customer_features.columns[is_feature].tolist()
scaler = StandardScaler()
features_scaled = scaler.fit_transform(customer_features.loc[:, feature_cols])

In [16]:
# Pairwise cosine similarity between all customers' scaled feature vectors;
# row/column order follows the row order of features_scaled.
similarity_matrix = cosine_similarity(X=features_scaled)

In [17]:
# Create a lookalike map for the first 20 customers:
# {target CustomerID: [(other CustomerID, cosine similarity), ...]}, best match first.
TOP_N = 3  # number of lookalikes kept per target customer

# Loop-invariant lookups, computed once: the row order of customer_features
# indexes similarity_matrix, and a set gives constant-time membership checks.
all_ids = customer_features["CustomerID"].tolist()
target_ids = set(first_20_customers["CustomerID"])

lookalike_map = {}
for i, customer_id in enumerate(all_ids):
    if customer_id not in target_ids:
        continue
    similarities = similarity_matrix[i]
    # Rank every other customer by similarity, highest first; position i is
    # skipped so a customer is never listed as its own lookalike.
    ranked = sorted(
        (
            (all_ids[j], similarities[j])
            for j in range(len(similarities))
            if j != i
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    lookalike_map[customer_id] = ranked[:TOP_N]

# A target that has no row in customer_features gets no entry above;
# report such customers instead of dropping them silently.
missing_targets = sorted(target_ids.difference(lookalike_map))
if missing_targets:
    print(f"No features available for target customers: {missing_targets}")

In [18]:
# Print each target customer followed by its ranked lookalikes and scores
for target_id, matches in lookalike_map.items():
    header = f"Customer {target_id} lookalikes:"
    rows = [
        f"  - {match_id}: {match_score:.4f}"
        for match_id, match_score in matches
    ]
    print(header, *rows, sep="\n")

Customer C0001 lookalikes:
  - C0107: 0.9968
  - C0174: 0.9907
  - C0048: 0.9891
Customer C0002 lookalikes:
  - C0142: 0.9788
  - C0177: 0.9714
  - C0186: 0.9659
Customer C0003 lookalikes:
  - C0091: 0.9085
  - C0120: 0.9047
  - C0129: 0.9029
Customer C0004 lookalikes:
  - C0113: 0.9842
  - C0104: 0.9688
  - C0102: 0.9648
Customer C0005 lookalikes:
  - C0186: 0.9986
  - C0140: 0.9759
  - C0159: 0.9745
Customer C0006 lookalikes:
  - C0168: 0.9807
  - C0171: 0.9276
  - C0133: 0.9156
Customer C0007 lookalikes:
  - C0159: 0.9946
  - C0115: 0.9929
  - C0092: 0.9735
Customer C0008 lookalikes:
  - C0098: 0.9389
  - C0139: 0.9358
  - C0049: 0.9229
Customer C0009 lookalikes:
  - C0198: 0.9588
  - C0119: 0.9387
  - C0103: 0.9114
Customer C0010 lookalikes:
  - C0166: 0.9739
  - C0197: 0.9690
  - C0073: 0.9548
Customer C0011 lookalikes:
  - C0107: 0.9805
  - C0048: 0.9804
  - C0001: 0.9746
Customer C0012 lookalikes:
  - C0153: 0.9712
  - C0013: 0.9706
  - C0102: 0.9703
Customer C0013 lookalikes:
 

In [19]:
# Save the lookalike map to a CSV file: one row per target customer, with the
# ranked (CustomerID, score) pairs serialized as text in the Lookalikes column.
# Scores are cast to built-in floats first: str() of a list uses each element's
# repr, and NumPy >= 2.0 renders scalars as "np.float64(...)", which would
# otherwise end up inside the file.
lookalike_df = pd.DataFrame([
    {
        "CustomerID": cust_id,
        "Lookalikes": str(
            [(other_id, float(score)) for other_id, score in lookalikes]
        ),
    }
    for cust_id, lookalikes in lookalike_map.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)
print("\nLookalike map saved to 'Lookalike.csv'")


Lookalike map saved to 'Lookalike.csv'
