## Importing all the dependencies

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

## Load data


In [8]:
# Read the three input tables (customers, products, transactions) from CSV
# files located in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


## Preprocess dates


In [9]:
# Convert the date columns to pandas datetimes so that date arithmetic
# (tenure, recency) can be done on them later.
for frame, date_col in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_col] = pd.to_datetime(frame[date_col])

## Reference date, tenure, region encoding, and product categories

In [10]:

# Reference date for every time-based feature: the latest transaction in the data.
max_transaction_date = transactions['TransactionDate'].max()

# Tenure: whole days between signup and the reference date (this is negative
# for a customer whose signup date is later than the last recorded transaction).
customers['Tenure'] = (max_transaction_date - customers['SignupDate']).dt.days

# One-hot encode Region.
# Guarded so the cell can be re-run: after the first run 'Region' has been
# replaced by 'Region_*' columns and an unguarded call would raise KeyError.
if 'Region' in customers.columns:
    customers = pd.get_dummies(customers, columns=['Region'], prefix='Region')
elif not any(str(col).startswith('Region_') for col in customers.columns):
    # Neither the raw column nor its encoded form exists: fail loudly, as an
    # unguarded get_dummies call would.
    raise KeyError("customers has neither a 'Region' column nor 'Region_*' columns")

# Attach each transaction's product Category.
# Guarded so a re-run does not merge twice, which would create
# 'Category_x'/'Category_y' and break the later groupby on 'Category'.
if 'Category' not in transactions.columns:
    transactions = pd.merge(
        transactions,
        products[['ProductID', 'Category']],
        on='ProductID',
        how='left',
    )

# Every distinct customer ID, including customers without any transaction.
all_customers = customers['CustomerID'].unique()

##  RFM Features

In [11]:


# Per-customer aggregates over the transaction log:
#   Frequency        - number of transactions
#   Monetary         - sum of TotalValue
#   Last_Transaction - most recent TransactionDate
rfm = (
    transactions
    .groupby('CustomerID')
    .agg(
        Frequency=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        Monetary=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        Last_Transaction=pd.NamedAgg(column='TransactionDate', aggfunc='max'),
    )
    .reset_index()
)

# Recency: whole days between a customer's last transaction and the reference date.
days_since_last_purchase = (max_transaction_date - rfm['Last_Transaction']).dt.days
rfm = rfm.assign(Recency=days_since_last_purchase)[
    ['CustomerID', 'Frequency', 'Monetary', 'Recency']
]

# Left-join onto the full customer list so customers without transactions are
# kept; they get zero Frequency/Monetary and a sentinel Recency of 9999 days.
rfm_defaults = {'Frequency': 0, 'Monetary': 0, 'Recency': 9999}
rfm_all = pd.DataFrame({'CustomerID': all_customers})
rfm_all = rfm_all.merge(rfm, on='CustomerID', how='left')
rfm_all = rfm_all.fillna(rfm_defaults)


## Build features, compute similarity, and save lookalikes

In [12]:

# Category preferences: one row per customer and one column per product
# category; each cell is the number of that customer's transactions in it.
category_sizes = transactions.groupby(['CustomerID', 'Category']).size()
category_counts = category_sizes.unstack(fill_value=0).reset_index()

# Align the category counts with the full customer list; customers that have
# no transactions get 0 in every category column.
categories_all = (
    pd.DataFrame({'CustomerID': all_customers})
    .merge(category_counts, on='CustomerID', how='left')
    .fillna(0)
)

# Mean unit price and mean quantity per customer.
avg_features = (
    transactions
    .groupby('CustomerID')
    .agg(
        Avg_Price=pd.NamedAgg(column='Price', aggfunc='mean'),
        Avg_Quantity=pd.NamedAgg(column='Quantity', aggfunc='mean'),
    )
    .reset_index()
    .fillna(0)
)

# Combine the customer attributes with all engineered features, one join at a
# time; any value still missing after the joins is replaced with 0.
final_features = customers.merge(rfm_all, on='CustomerID')
final_features = final_features.merge(categories_all, on='CustomerID')
final_features = final_features.merge(avg_features, on='CustomerID', how='left')
final_features = final_features.fillna(0)

# Drop columns that are not model features. Reassigning (rather than using
# inplace=True) keeps the step chainable and avoids mutating the frame in place.
final_features = final_features.drop(columns=['CustomerName', 'SignupDate'])

# The category-count columns are named after the raw Category values produced
# by the unstack that builds category_counts; they do not carry a 'Category_'
# prefix. Selecting them with startswith('Category_') matches nothing and
# leaves the counts unscaled, so take the names from category_counts instead.
category_cols = [col for col in category_counts.columns if col != 'CustomerID']

# Standardise every numeric feature so that no single feature dominates the
# cosine similarity purely because of its scale. The Region_* indicator
# columns are intentionally left as they are.
numerical_cols = (
    ['Tenure', 'Frequency', 'Monetary', 'Recency']
    + category_cols
    + ['Avg_Price', 'Avg_Quantity']
)
scaler = StandardScaler()
final_features[numerical_cols] = scaler.fit_transform(final_features[numerical_cols])

# Pairwise cosine similarity between customers.
# Row/column i of the matrix corresponds to customer_ids[i].
features = final_features.drop(columns=['CustomerID'])
customer_ids = final_features['CustomerID'].tolist()
similarity_matrix = cosine_similarity(features)

# How many customers (taken from the top of the customer list) to find
# lookalikes for, and how many lookalikes to keep for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

# Maps a target CustomerID to a list of [lookalike CustomerID, similarity score].
lookalike_map = {}
target_customers = customer_ids[:N_TARGET_CUSTOMERS]

# target_customers is a prefix of customer_ids, so the enumerate position is
# the customer's row in similarity_matrix; this avoids a linear
# customer_ids.index(...) search on every iteration.
for idx, cust_id in enumerate(target_customers):
    similarities = similarity_matrix[idx]
    # Indices of all customers ordered from most to least similar.
    sorted_indices = np.argsort(similarities)[::-1]
    # Skip the customer itself, then keep the TOP_K best matches.
    top_indices = [i for i in sorted_indices if customer_ids[i] != cust_id][:TOP_K]
    lookalike_map[cust_id] = [
        [customer_ids[i], float(similarities[i])] for i in top_indices
    ]

# Build one output row per target customer. The "Lookalikes" cell is the string
# form of a list of [CustomerID, score] pairs, with scores rounded to 4 decimals.
lookalike_rows = [
    {
        'CustomerID': target_id,
        'Lookalikes': str([[match_id, round(score, 4)] for match_id, score in matches]),
    }
    for target_id, matches in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])

# Write the result without the index column.
# NOTE(review): the output file name is spelled 'Lookalikee.csv' (double "e");
# confirm whether 'Lookalike.csv' was intended before anything consumes it.
lookalike_df.to_csv('Lookalikee.csv', index=False)