# Task 2: Lookalike Model

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the three source tables; TransactionDate is parsed into datetimes on load
customers_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")
transactions_df = pd.read_csv(
    "Transactions.csv",
    parse_dates=['TransactionDate'],
)

# Preview the first rows of every table, one after another
print(
    customers_df.head(),
    products_df.head(),
    transactions_df.head(),
    sep="\n",
)

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00

In [3]:
# Attach product attributes to every transaction (joined on ProductID)
transactions_products = transactions_df.merge(products_df, on='ProductID')

# Then attach the purchasing customer's attributes (joined on CustomerID)
combined_df = transactions_products.merge(customers_df, on='CustomerID')

# Preview the fully joined table
combined_df.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,HomeSense Wall Art,Home Decor,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,ActiveWear Rug,Home Decor,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,BookWorld Bluetooth Speaker,Electronics,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04


In [4]:
# Build one row per customer: spend totals, transaction count, average ticket
# size and the most frequent product category, then one-hot encode the category.
customer_features = (
    combined_df
    .groupby('CustomerID')
    .agg(
        TotalSpending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        TransactionCount=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        AvgTransactionValue=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
        # mode() returns its values sorted, so [0] picks the first one on ties
        FavoriteCategory=pd.NamedAgg(
            column='Category',
            aggfunc=lambda categories: categories.mode()[0],
        ),
    )
    .reset_index()
    .pipe(pd.get_dummies, columns=['FavoriteCategory'])
)

# Rescale every feature column (everything except the ID) for the similarity step
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns='CustomerID'))

# Preview the per-customer feature table
customer_features.head()


Unnamed: 0,CustomerID,TotalSpending,TransactionCount,AvgTransactionValue,FavoriteCategory_Books,FavoriteCategory_Clothing,FavoriteCategory_Electronics,FavoriteCategory_Home Decor
0,C0001,3354.52,5,670.904,0,0,1,0
1,C0002,1862.74,4,465.685,0,1,0,0
2,C0003,2725.38,4,681.345,0,0,0,1
3,C0004,5354.88,8,669.36,1,0,0,0
4,C0005,2034.24,3,678.08,0,0,1,0


In [5]:
# Pairwise cosine similarity between all rows of the scaled feature matrix
similarity_matrix = cosine_similarity(scaled_features)

# Label both axes with customer IDs so scores can be looked up by ID
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)
similarity_df.head()


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.237587,0.297523,0.386236,0.982007,0.369685,0.976909,0.353,0.147746,0.2261,...,0.306514,0.98899,0.314127,0.3412,0.349808,0.39364,0.981464,0.164769,0.986803,0.38035
C0002,0.237587,1.0,0.216776,0.285054,0.190187,0.265179,0.218185,0.268164,0.985878,0.999607,...,0.225586,0.186913,0.227983,0.253851,0.256892,0.281017,0.184019,0.980417,0.182163,0.942483
C0003,0.297523,0.216776,1.0,0.34129,0.25126,0.353112,0.293457,0.893175,0.133818,0.205083,...,0.279046,0.233008,0.29527,0.300805,0.984457,0.953428,0.242022,0.160532,0.225886,0.355463
C0004,0.386236,0.285054,0.34129,1.0,0.286862,0.929972,0.324148,0.47865,0.179485,0.274117,...,0.970782,0.300127,0.947901,0.993483,0.427416,0.427121,0.278593,0.173448,0.293529,0.436849
C0005,0.982007,0.190187,0.25126,0.286862,1.0,0.326908,0.993513,0.2366,0.116403,0.178616,...,0.244009,0.98981,0.268421,0.252182,0.273617,0.352832,0.999717,0.152108,0.987846,0.32078


In [6]:
# Function to get the top-N most similar customers (3 by default)
def get_top_similar(customers, similarity_df, top_n=3):
    """Return the `top_n` most similar *other* customers for each requested customer.

    Parameters
    ----------
    customers : iterable
        Customer IDs to look up; each must be a column label of `similarity_df`.
    similarity_df : pandas.DataFrame
        Pairwise similarity scores whose index and columns are customer IDs.
    top_n : int, default 3
        Maximum number of lookalikes per customer. Fewer are returned when the
        matrix holds fewer other customers; values below 1 yield an empty list.

    Returns
    -------
    dict
        Maps each customer ID to a list of `(other_customer_id, score)` tuples,
        ordered from highest to lowest score.

    Raises
    ------
    KeyError
        If a requested customer is not a column of `similarity_df`.
    """
    recommendations = {}
    for customer in customers:
        # Remove the customer's own row by label. Skipping position 0 after sorting
        # is wrong when another customer ties with the self-similarity score: the
        # customer could then be listed as their own lookalike.
        scores = similarity_df[customer].drop(labels=customer, errors="ignore")
        # Stable sort keeps tied scores in their original index order.
        ranked = scores.sort_values(ascending=False, kind="mergesort")
        recommendations[customer] = list(ranked.head(max(top_n, 0)).items())
    return recommendations

# Get recommendations for customers C0001 - C0020.
# Select them by ID rather than by row position: the merges that feed
# customer_features use the default inner join, so a customer without any
# transaction has no feature row and the first 20 rows are not guaranteed to be
# exactly C0001-C0020.
top_20_customers = customer_features.loc[
    customer_features['CustomerID'].isin([f"C{n:04d}" for n in range(1, 21)]),
    'CustomerID',
]
recommendations = get_top_similar(top_20_customers, similarity_df)

# Convert recommendations into a DataFrame: one row per customer, with the list of
# (lookalike_id, score) tuples in the second column. Explicit column names keep
# the header present even if no customer was selected.
recommendations_df = pd.DataFrame(
    list(recommendations.items()),
    columns=["CustomerID", "Lookalikes"],
)

# Save the recommendations to a CSV file (the tuple list is written as its text form)
recommendations_df.to_csv("Lookalike.csv", index=False)

# Display the recommendations DataFrame
recommendations_df.head()


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0072, 0.9987768952588381), (C0190, 0.998657..."
1,C0002,"[(C0029, 0.9998906529958622), (C0010, 0.999606..."
2,C0003,"[(C0178, 0.9999961360887427), (C0052, 0.999174..."
3,C0004,"[(C0021, 0.9999844684456922), (C0101, 0.999860..."
4,C0005,"[(C0112, 0.9998592708958555), (C0197, 0.999717..."
