In [11]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler



In [12]:
# Read the raw customer and product tables from the working directory.
customers = pd.read_csv(filepath_or_buffer="Customers.csv")
products = pd.read_csv(filepath_or_buffer="Products.csv")

In [16]:
# Locations of the input CSV files, relative to the notebook's working directory.
customers_path, products_path = 'Customers.csv', 'Products.csv'

In [17]:
# Load the customer and product tables from the configured paths.
try:
    customers_data = pd.read_csv(customers_path)
    products_data = pd.read_csv(products_path)
except FileNotFoundError as e:
    print(f"Error: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down (losing all state), whereas an exception simply stops
    # execution at this cell and shows the traceback.
    raise

In [18]:
# Placeholder purchase matrix (Customer x Product): one row per customer,
# with a 0/1 flag for each of the five product columns.
_product_columns = ('P001', 'P002', 'P003', 'P004', 'P005')
_purchase_rows = (
    ('C0001', 1, 0, 1, 0, 1),
    ('C0002', 0, 1, 1, 0, 0),
    ('C0003', 1, 0, 1, 1, 0),
    ('C0004', 0, 1, 0, 1, 1),
    ('C0005', 1, 0, 0, 0, 1),
)

# Transpose the row-wise records into the column-oriented mapping that the
# DataFrame is built from.
dummy_transactions = {
    column: list(values)
    for column, values in zip(('CustomerID',) + _product_columns, zip(*_purchase_rows))
}
transaction_data = pd.DataFrame(dummy_transactions)

In [19]:
# Merge customer and transaction data.
# The inner join keeps only customers whose CustomerID appears in both tables.
try:
    if 'CustomerID' not in customers_data.columns or 'CustomerID' not in transaction_data.columns:
        raise KeyError("'CustomerID' column is missing in one of the datasets.")
    customer_transactions = pd.merge(customers_data, transaction_data, on='CustomerID', how='inner')
except KeyError as e:
    print(f"Error: {e}. Ensure that 'CustomerID' exists in both datasets.")
    # Re-raise rather than exit(): exit() would shut the notebook kernel down,
    # while an exception stops the run at this cell with a traceback.
    raise

In [21]:
# Preprocessing
# One-hot encode the categorical Region column into one indicator column per
# region value.
try:
    if 'Region' not in customer_transactions.columns:
        raise KeyError("'Region' column is missing in the dataset.")
    encoder = OneHotEncoder(sparse_output=False)
    region_encoded = encoder.fit_transform(customer_transactions[['Region']])
    region_df = pd.DataFrame(
        region_encoded,
        columns=encoder.get_feature_names_out(['Region']),
        # Carry over the source index so a later column-wise concat aligns
        # row-for-row even if customer_transactions is not indexed 0..n-1.
        index=customer_transactions.index,
    )
except KeyError as e:
    print(f"Error: {e}. Ensure that 'Region' exists in the dataset.")
    # Re-raise rather than exit(): exit() would shut the notebook kernel down,
    # while an exception stops the run at this cell with a traceback.
    raise


In [22]:
# Combine the one-hot region features with the per-product purchase flags.
try:
    # Pick the product columns by name. The previous positional slice
    # (iloc[:, 4:]) only selected them when the customer table had exactly
    # four columns; any other layout would silently pull in the wrong columns.
    product_columns = [col for col in transaction_data.columns if col != 'CustomerID']
    combined_data = pd.concat([region_df, customer_transactions[product_columns]], axis=1)

    # The error message promises a check for infinite values as well, so test
    # for both NaN and +/-inf.
    has_nan = combined_data.isnull().any().any()
    has_inf = combined_data.isin([float('inf'), float('-inf')]).any().any()
    if has_nan or has_inf:
        raise ValueError("Combined data contains NaN or infinite values.")
except ValueError as e:
    print(f"Error: {e}")
    # Re-raise rather than exit(): exit() would shut the notebook kernel down,
    # while an exception stops the run at this cell with a traceback.
    raise

In [23]:
# Normalize the data: standardise every feature column with StandardScaler
# before computing similarities.
try:
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(combined_data)
except Exception as e:
    print(f"Error during normalization: {e}")
    # Re-raise rather than exit(): exit() would shut the notebook kernel down
    # and hide the traceback; the exception stops the run at this cell.
    raise

In [24]:
# Compute the pairwise cosine similarity between all customer feature rows.
# Row/column i of the result corresponds to row i of normalized_data.
try:
    similarity_matrix = cosine_similarity(normalized_data)
except Exception as e:
    print(f"Error during similarity calculation: {e}")
    # Re-raise rather than exit(): exit() would shut the notebook kernel down
    # and hide the traceback; the exception stops the run at this cell.
    raise

In [25]:
# Find top 3 lookalike customers for each customer.
# Result: {CustomerID: [(lookalike CustomerID, similarity rounded to 2 dp), ...]}
lookalike_results = {}
customer_ids = customer_transactions['CustomerID'].tolist()

for idx, customer_id in enumerate(customer_ids):
    # Exclude the customer's own entry by position. Slicing off the first
    # sorted element is not safe: another customer with an identical (or,
    # after rounding error, higher) score can sort ahead of the self entry,
    # which would then be reported as its own lookalike.
    # float() converts NumPy scalars to plain Python floats so the stringified
    # result reads "0.87" rather than "np.float64(0.87)" on NumPy >= 2.
    candidates = [
        (other_idx, float(score))
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Stable sort: ties keep their original customer order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_results[customer_id] = [
        (customer_ids[other_idx], round(score, 2))
        for other_idx, score in candidates[:3]
    ]

In [27]:
# Write one row per customer to Lookalike.csv: the customer's ID and the
# string form of its list of (lookalike ID, score) pairs.
try:
    source_ids = list(lookalike_results)
    serialized_matches = list(map(str, lookalike_results.values()))
    lookalike_df = pd.DataFrame({'CustomerID': source_ids, 'Lookalikes': serialized_matches})
    lookalike_df.to_csv('Lookalike.csv', index=False)
    print("Lookalike Model executed successfully. Results saved to Lookalike.csv.")
except Exception as e:
    print(f"Error saving results: {e}")

Lookalike Model executed successfully. Results saved to Lookalike.csv.
