In [9]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np

In [10]:
# Input CSV locations, given relative to the notebook's working directory.
customers_path, products_path, transactions_path = (
    './data/Customers.csv',
    './data/Products.csv',
    './data/Transactions.csv',
)

# Read each file into its own DataFrame, in the same order as the paths above.
customers, products, transactions = map(
    pd.read_csv, (customers_path, products_path, transactions_path)
)

In [11]:
# Attach product and customer attributes to every transaction row. Both joins
# are left joins keyed on the transaction side, so no transaction is dropped.
data = (
    transactions
    .merge(products, how='left', on='ProductID')
    .merge(customers, how='left', on='CustomerID')
)

# Add snake_case working columns: a parsed datetime plus plain copies of the
# value and id columns that the export step reads.
data = data.assign(
    transaction_date=pd.to_datetime(data['TransactionDate']),
    total_value=data['TotalValue'],
    transaction_id=data['TransactionID'],
)

In [12]:
# Customer x product matrix of purchased quantities; pairs with no purchase are 0.
customer_product_matrix = data.pivot_table(
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    # pivot_table aggregates with the mean by default, which would average
    # repeat purchases of the same product instead of totalling them.
    aggfunc='sum',
    fill_value=0,
)

# Pairwise cosine similarity between customers' purchase vectors, labelled by
# CustomerID on both axes.
collaborative_similarity = cosine_similarity(customer_product_matrix)
collab_similarity_df = pd.DataFrame(
    collaborative_similarity,
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index,
)

In [13]:
# Build a free-text profile per customer: "<Region> <SignupDate>".
signup_as_text = customers['SignupDate'].astype(str)
customers['profile'] = customers['Region'] + ' ' + signup_as_text

# Bag-of-words counts over the profile strings.
# NOTE(review): with CountVectorizer's default tokenizer the date string is
# presumably split into separate tokens (e.g. year / month / day) — confirm
# that this is the intended notion of "similar signup date".
vectorizer = CountVectorizer()
profile_matrix = vectorizer.fit_transform(customers['profile'])

# Pairwise cosine similarity between profiles, labelled by CustomerID on both axes.
content_similarity = cosine_similarity(profile_matrix)
customer_id_labels = customers['CustomerID']
content_similarity_df = pd.DataFrame(
    content_similarity,
    index=customer_id_labels,
    columns=customer_id_labels,
)

In [14]:
# Blend weights for the two similarity signals.
collab_weight = 0.7
content_weight = 0.3

# The two matrices are not guaranteed to cover the same customers: the
# collaborative one only contains CustomerIDs that appear in the transaction
# data, the content one only those in the customers table. Adding them directly
# aligns on labels and yields NaN for every customer missing from either side,
# so first put both on the union of IDs and treat a missing signal as 0.
all_customer_ids = collab_similarity_df.index.union(content_similarity_df.index)
collab_aligned = collab_similarity_df.reindex(
    index=all_customer_ids, columns=all_customer_ids, fill_value=0.0
)
content_aligned = content_similarity_df.reindex(
    index=all_customer_ids, columns=all_customer_ids, fill_value=0.0
)

ensemble_similarity = collab_weight * collab_aligned + content_weight * content_aligned

In [15]:
def get_top_n_lookalikes(customer_id, similarity_df, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : label
        Column label in ``similarity_df`` identifying the query customer.
    similarity_df : pandas.DataFrame
        Similarity matrix; column ``customer_id`` is read and its index labels
        are used as the candidate customer ids.
    n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (label, score) tuples
        Sorted by descending score. Fewer than ``n`` entries are returned when
        there are fewer than ``n`` other customers.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    # Remove the customer's own entry by label rather than by position: the
    # self-score is not guaranteed to sort first (ties with other customers, or
    # a self-score below 1), and a positional skip would then return the
    # customer as their own lookalike.
    candidate_scores = similarity_df[customer_id].drop(labels=customer_id, errors='ignore')
    # A stable sort keeps tied candidates in their original index order.
    top_scores = candidate_scores.sort_values(ascending=False, kind='mergesort').head(n)
    return list(zip(top_scores.index, top_scores.values))

# Map each of the first 20 customers in the ensemble matrix's index to its
# list of (lookalike id, score) pairs, using the function's default n.
lookalike_results = {
    customer_id: get_top_n_lookalikes(customer_id, ensemble_similarity)
    for customer_id in ensemble_similarity.index[:20]
}

In [16]:
# Flatten the lookalike mapping into one row per (customer, lookalike) pair and
# attach a small transaction summary for the query customer to each row.
lookalike_output = []
summary_columns = ['transaction_date', 'transaction_id', 'total_value']

for cust_id, lookalikes in lookalike_results.items():
    cust_transactions = data.loc[data['CustomerID'] == cust_id, summary_columns].drop_duplicates()

    # The summary depends only on cust_id, so compute it once per customer
    # instead of once per lookalike row.
    if cust_transactions.empty:
        latest_date, max_transaction_id, total_spent = None, None, 0
    else:
        latest_date = cust_transactions['transaction_date'].max()
        # NOTE(review): this is the maximum transaction_id value, not
        # necessarily the id of the transaction on latest_date — confirm intent.
        max_transaction_id = cust_transactions['transaction_id'].max()
        total_spent = cust_transactions['total_value'].sum()

    for similar_cust_id, score in lookalikes:
        lookalike_output.append({
            'cust_id': cust_id,
            'similar_cust_id': similar_cust_id,
            'score': score,
            'transaction_date': latest_date,
            'transaction_id': max_transaction_id,
            'total_value': total_spent,
        })

lookalike_df = pd.DataFrame(lookalike_output)
lookalike_df.to_csv('Jins_Varghese_Lookalike.csv', index=False)

print("Lookalike Model completed. Results saved to Jins_Varghese_Lookalike.csv")


Lookalike Model completed. Results saved to Jins_Varghese_Lookalike.csv
