Libraries and Dataset

In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')


Data Preprocessing

In [6]:
# Calculating total spending per customer
customer_spending = transactions.groupby('CustomerID')['TotalValue'].sum().reset_index()
customer_spending.rename(columns={'TotalValue': 'TotalSpending'}, inplace=True)

# Calculating average spending per customer
avg_spending = transactions.groupby('CustomerID')['TotalValue'].mean().reset_index()
avg_spending.rename(columns={'TotalValue': 'AvgSpending'}, inplace=True)

# Calculating the number of transactions per customer
transaction_count = transactions.groupby('CustomerID')['TransactionID'].count().reset_index()
transaction_count.rename(columns={'TransactionID': 'TransactionCount'}, inplace=True)

# Merge features with customer data
customer_features = customers.merge(customer_spending, on='CustomerID', how='left')
customer_features = customer_features.merge(avg_spending, on='CustomerID', how='left')
customer_features = customer_features.merge(transaction_count, on='CustomerID', how='left')

# Handle missing data
customer_features.fillna(0, inplace=True)

# Data Preprocessing: Drop non-numeric columns and ensure numeric data
feature_matrix = customer_features.set_index('CustomerID').drop(['CustomerName', 'SignupDate'], axis=1)

# Ensure all columns are numeric, converting non-numeric to NaN and then filling NaN with 0
feature_matrix = feature_matrix.apply(pd.to_numeric, errors='coerce')
feature_matrix.fillna(0, inplace=True)



Similarity calculation

In [7]:
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=feature_matrix.index, columns=feature_matrix.index)

Lookalike Recommendations

In [8]:
# Lookalike Recommendations for Customers C0001 - C0020
lookalike_recommendations = {}

for customer_id in feature_matrix.index[:20]:
    # similarity scores for the customer
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]  # Exclude self
    recommendations = [(other_customer, score) for other_customer, score in similar_customers.items()]
    lookalike_recommendations[customer_id] = recommendations

get CSV

In [9]:
# Convert recommendations into the required format and save as CSV
lookalike_data = []

for customer_id, recommendations in lookalike_recommendations.items():
    for other_customer, score in recommendations:
        lookalike_data.append({
            'CustomerID': customer_id,
            'LookalikeCustomerID': other_customer,
            'SimilarityScore': score
        })

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Ketan_Paryani_Lookalike.csv', index=False)

Display the first few rows of recommendations

In [10]:

print(lookalike_df.head())

  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0137              1.0
1      C0001               C0152              1.0
2      C0001               C0056              1.0
3      C0002               C0029              1.0
4      C0002               C0199              1.0
