In [4]:
import pandas as pd 

# Load the three source tables; the relative paths are resolved against the
# notebook's working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [5]:
# Attach the customer profile and the product details to every transaction.
# Left joins keep each transaction row even when its CustomerID or ProductID
# has no match in the lookup table.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

# Inspect the merged dataset. Both the transaction table and the product table
# carry a `Price` column, so the result holds it twice as Price_x / Price_y.
print("\nMerged Data Sample:")
print(merged_data.head())
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
merged_data.info()


Merged Data Sample:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiv

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import csv

# Build one row per customer for the similarity calculation: summed
# TotalValue, summed Quantity and the count of distinct ProductIDs.
purchase_aggregations = {
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'ProductID': 'nunique',
}

customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(purchase_aggregations)
    .reset_index()
    # Bring the customer profile columns alongside the aggregated features
    .merge(customers, on='CustomerID', how='left')
)

In [8]:
# Standardize the numerical features. The scaled values overwrite the raw
# aggregates in `customer_features`.
numeric_columns = ['TotalValue', 'Quantity', 'ProductID']
scaler = StandardScaler()
customer_features[numeric_columns] = scaler.fit_transform(customer_features[numeric_columns])

In [9]:
# Pairwise cosine similarity between customers, labelled by CustomerID on
# both axes so a customer's scores can be looked up by ID.
customer_ids = customer_features['CustomerID']
feature_matrix = customer_features[['TotalValue', 'Quantity', 'ProductID']]
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [10]:
# Generate top 3 similar customers for each target customer.
# Result: {target CustomerID: [three most similar CustomerIDs, best first]}
lookalike_results = {}
for customer in customer_features['CustomerID'].iloc[:20]:  # First 20 customers
    # Remove the target's own entry by label. Sorting and skipping the first
    # row is only correct when the customer's own score is strictly the
    # largest; a tie (or rounding) can put another customer first, which would
    # list the target as its own lookalike and drop a genuine match.
    other_scores = similarity_df[customer].drop(labels=customer)
    # nlargest returns the highest scores in descending order; ties keep the
    # order in which they appear in the column.
    lookalike_results[customer] = other_scores.nlargest(3).index.tolist()

In [13]:
# Save recommendations to Lookalike.csv: a header row, then one row per target
# customer. csv.writer stringifies non-string fields, so each list of
# lookalike IDs is stored as its Python list repr in the second column.
output_path = 'KaparotuVenkataSurya_Tharani_Lookalike.csv'
with open(output_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'SimilarCustomers'])
    writer.writerows(
        [customer_id, lookalikes]
        for customer_id, lookalikes in lookalike_results.items()
    )
    print("lookalike file has been created")

lookalike file has been created
