In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three source tables from the working directory
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Parse the date columns in place so they carry a datetime dtype
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

# Attach customer attributes, then product attributes, to every transaction.
# Both merges use the default join, so only transactions with a matching
# customer and product are kept.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(transactions_with_customers, products, on='ProductID')

# Show the resulting columns; any column name shared by both sides of a merge
# (other than the join key) is suffixed with _x / _y by pandas.
print("Columns in merged_data:", merged_data.columns)

# Feature Engineering: collapse the transaction rows into one row per customer
def _most_frequent(values):
    """Return the first value reported by ``mode()`` for *values*."""
    return values.mode()[0]


customer_profiles = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),      # total spend
        Quantity=('Quantity', 'sum'),          # total units purchased
        Region=('Region', 'first'),            # region taken from the first row
        Category=('Category', _most_frequent),  # most frequent product category
    )
    .reset_index()
)

# Derive 'Price' as spend per unit (TotalValue / Quantity) when the profile
# table does not already have it and both inputs exist in merged_data.
price_is_missing = 'Price' not in customer_profiles.columns
inputs_available = {'TotalValue', 'Quantity'}.issubset(merged_data.columns)
if price_is_missing and inputs_available:
    customer_profiles['Price'] = customer_profiles['TotalValue'].div(
        customer_profiles['Quantity']
    )

# One-hot encode the categorical columns.
# NOTE(review): drop_first=True leaves the first level of each variable as an
# all-zero row, which may skew similarity scores for those customers - confirm
# this is intended.
categorical_columns = ['Region', 'Category']
customer_profiles = pd.get_dummies(
    customer_profiles,
    columns=categorical_columns,
    drop_first=True,
)

# Standardise the numeric features (zero mean, unit variance per column)
numerical_columns = ['TotalValue', 'Quantity', 'Price']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_profiles[numerical_columns])
customer_profiles[numerical_columns] = scaled_values

# Similarity Calculation: keep the IDs aside and use every other column as a
# feature
customer_ids = customer_profiles['CustomerID']
feature_columns = [
    column for column in customer_profiles.columns if column != 'CustomerID'
]
customer_profiles_matrix = customer_profiles[feature_columns]

# Pairwise cosine similarity between all customer feature rows, wrapped in a
# DataFrame labelled by CustomerID on both axes for lookup by ID
similarity_matrix = cosine_similarity(customer_profiles_matrix)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Generate Lookalike Recommendations
# Maps each target CustomerID to a list of up to three
# {'cust_id': ..., 'score': ...} dicts, ordered from most to least similar.
lookalike_map = {}

# Positional slice: the first 20 IDs in customer_ids order.
# NOTE(review): these are C0001..C0020 only if every one of those customers is
# present in customer_ids and the IDs are in ascending order - confirm.
for customer_id in customer_ids.iloc[:20]:
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: with a tie at the top score the customer could otherwise end up
    # in its own lookalike list while a genuine neighbour is skipped.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .nlargest(3)  # Top 3 lookalikes
    )
    lookalike_map[customer_id] = [
        # float() stores a plain Python number; NumPy scalars nested in a list
        # can be written to the CSV as "np.float64(...)" on newer NumPy.
        {'cust_id': similar_customer, 'score': round(float(score), 3)}
        for similar_customer, score in similar_customers.items()
    ]

# Convert lookalike_map to DataFrame (one row per target customer)
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map.keys()),
    'Lookalikes': list(lookalike_map.values()),
})

# Save Lookalike.csv; the Lookalikes column is written as the string form of
# each list of dicts
lookalike_df.to_csv('Lookalike.csv', index=False)

# Print a preview of the lookalike recommendations
print(lookalike_df.head())


Columns in merged_data: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
  CustomerID                                         Lookalikes
0      C0001  [{'cust_id': 'C0184', 'score': 0.961}, {'cust_...
1      C0002  [{'cust_id': 'C0088', 'score': 0.986}, {'cust_...
2      C0003  [{'cust_id': 'C0076', 'score': 0.947}, {'cust_...
3      C0004  [{'cust_id': 'C0165', 'score': 0.984}, {'cust_...
4      C0005  [{'cust_id': 'C0186', 'score': 0.946}, {'cust_...
