In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Merge datasets
df = transactions.merge(customers, on='CustomerID', how='inner')
df = df.merge(products, on='ProductID', how='inner')

# Ensure correct column selection for the merge
if 'Price_x' in df.columns and 'Price_y' in df.columns:
    df = df.rename(columns={'Price_y': 'Price'}).drop(columns=['Price_x'])

# Feature engineering
df['TotalSpent'] = df['Quantity'] * df['Price']

# Aggregate customer-level features
customer_features = df.groupby('CustomerID').agg({
    'TotalSpent': 'sum',
    'Quantity': 'sum',
    'TransactionID': 'count',
    'Category': lambda x: x.mode().iloc[0] if not x.mode().empty else 'Unknown',
    'Region': 'first'  # Assuming Region is the same for each customer
}).reset_index()

# One-hot encode categorical variables
customer_features = pd.get_dummies(customer_features, columns=['Category', 'Region'])

# Normalize numerical features
scaler = StandardScaler()
numerical_cols = ['TotalSpent', 'Quantity', 'TransactionID']
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

# Define function to find lookalike customers
def get_lookalikes(customer_id, n=3):
    # Extract the vector for the input customer
    customer_vector = customer_features[customer_features['CustomerID'] == customer_id].iloc[:, 1:].values
    # Compute cosine similarity with all customers
    similarities = cosine_similarity(customer_vector, customer_features.iloc[:, 1:].values)
    # Sort and get top N most similar customers
    similar_indices = np.argsort(similarities[0])[::-1][1:n+1]  # Exclude self (index 0)
    similar_customers = customer_features.iloc[similar_indices]
    similar_customers['Similarity'] = similarities[0][similar_indices]
    return similar_customers[['CustomerID', 'Similarity']]

# Create lookalike dictionary for the first 20 customers
lookalikes = {}
for cust_id in customers['CustomerID'][:20]:
    lookalikes[cust_id] = get_lookalikes(cust_id).to_dict('records')

# Save lookalikes to a CSV file
lookalike_df = pd.DataFrame([(k, str(v)) for k, v in lookalikes.items()], 
                            columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike analysis completed and saved to 'Lookalike.csv'.")



Lookalike analysis completed and saved to 'Lookalike.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['Similarity'] = similarities[0][similar_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['Similarity'] = similarities[0][similar_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['Similarity'] = similarities[0][similar_indices]
A va