In [21]:
import pandas as pd
import numpy as np

In [22]:
# Read the three raw tables (customers, products, transactions) straight from
# the hosted CSV files. Each read is a separate statement so that a failed
# download leaves the earlier frames available for inspection.
df_customers = pd.read_csv(
    "https://raw.githubusercontent.com/i183x/zeo/refs/heads/main/Customers.csv"
)
df_products = pd.read_csv(
    "https://raw.githubusercontent.com/i183x/zeo/refs/heads/main/Products.csv"
)
df_transactions = pd.read_csv(
    "https://raw.githubusercontent.com/i183x/zeo/refs/heads/main/Transactions.csv"
)


In [23]:
# The transactions table carries its own 'Price' column, which the original
# author flagged as a duplicate (presumably of the price in the products
# table), so it is removed before merging.
# errors='ignore' keeps this cell re-runnable: `df_transactions` is reassigned
# here, so on a second run the column is already gone and a plain drop()
# would raise KeyError.
df_transactions = df_transactions.drop(columns='Price', errors='ignore')

# Merge the datasets to have customer, product, and transaction data together.
# Left joins keep every transaction row; a transaction whose CustomerID or
# ProductID has no match simply gets NaN in the joined columns.
df = (
    df_transactions
    .merge(df_customers, on='CustomerID', how='left')
    .merge(df_products, on='ProductID', how='left')
)

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import json

# Preprocessing: Combine relevant features into a single representation for each customer
def preprocess_customer_data(df):
    """Collapse transaction-level rows into one feature row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level table. Must contain the columns ``CustomerID``,
        ``Region``, ``SignupDate``, ``ProductName``, ``TotalValue`` and
        ``TransactionID``. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``CustomerID`` (the first row seen for that
        customer, keeping its original row label and order) with columns
        ``CustomerID``, ``Region``, ``SignupDate``, ``TransactionHistory``,
        ``TotalSpent`` and ``TotalTransactions``.

        * ``TransactionHistory`` - the customer's product names joined with
          single spaces; missing product names are skipped.
        * ``TotalSpent`` / ``TotalTransactions`` - per-customer sum of
          ``TotalValue`` and count of distinct ``TransactionID``, both
          standardized with ``StandardScaler``.
    """
    per_customer = df.groupby('CustomerID')

    # assign() returns a new frame, so the caller's `df` does not silently
    # gain the helper columns (the previous version wrote them into `df`).
    enriched = df.assign(
        # str.join raises TypeError on NaN, which a left merge can leave in
        # ProductName for unmatched products; drop those entries first.
        TransactionHistory=per_customer['ProductName'].transform(
            lambda names: ' '.join(names.dropna().astype(str))
        ),
        TotalSpent=per_customer['TotalValue'].transform('sum'),
        TotalTransactions=per_customer['TransactionID'].transform('nunique'),
    )

    feature_columns = [
        'CustomerID', 'Region', 'SignupDate',
        'TransactionHistory', 'TotalSpent', 'TotalTransactions',
    ]
    # Every row of a customer now carries identical aggregates, so keeping the
    # first row per CustomerID yields exactly one record per customer.
    # The explicit copy makes the result an independent frame, so the
    # column assignment below cannot trigger SettingWithCopyWarning.
    customer_data = (
        enriched[feature_columns]
        .drop_duplicates(subset=['CustomerID'])
        .copy()
    )

    # Normalize TotalSpent and TotalTransactions
    numeric_columns = ['TotalSpent', 'TotalTransactions']
    scaler = StandardScaler()
    customer_data[numeric_columns] = scaler.fit_transform(customer_data[numeric_columns])

    return customer_data

# Function to get top N similar customers for a given customer ID
def get_top_n_similar(customer_id, similarity_matrix, customer_ids, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    ``customer_ids[i]`` labels row/column ``i`` of ``similarity_matrix``.
    The result is a list of ``(other_customer_id, score)`` tuples ordered by
    descending score, never containing ``customer_id`` itself. Entries with
    equal scores keep their positional order. Raises ``ValueError`` when
    ``customer_id`` is not in ``customer_ids``.
    """
    row = similarity_matrix[customer_ids.index(customer_id)]

    # Rank column positions by score, best first. sorted() is stable, so
    # ties are resolved by position.
    ranked_positions = sorted(
        range(len(row)),
        key=lambda pos: row[pos],
        reverse=True,
    )

    # Collect every other customer in ranked order, then trim to n.
    neighbours = []
    for pos in ranked_positions:
        other_id = customer_ids[pos]
        if other_id != customer_id:
            neighbours.append((other_id, row[pos]))
    return neighbours[:n]

# --- Main Execution ---
# Assuming `df` is the input DataFrame with customer data (defined elsewhere)

# Tunable settings for the lookalike export
N_TARGET_CUSTOMERS = 20   # how many customers receive a lookalike list
TOP_N_LOOKALIKES = 3      # how many similar customers are reported for each
OUTPUT_PATH = 'Krishna_Purwar_Lookalike.csv'

# Step 1: Preprocess the customer data
# preprocess_customer_data keeps customers in the order they first appear in
# the transaction table. Sort by CustomerID so that "the first 20 customers"
# below really means the 20 lowest IDs present in the data (C0001, C0002, ...
# assuming zero-padded IDs) rather than an arbitrary 20.
customer_data = preprocess_customer_data(df).sort_values('CustomerID')

# Step 2: Combine all text-based features into one for vectorization
# Both parts are NaN-guarded: a single NaN would make the concatenation NaN,
# which TfidfVectorizer cannot process.
customer_data['CombinedFeatures'] = (
    customer_data['Region'].fillna('') + ' ' +
    customer_data['TransactionHistory'].fillna('')
)

# Step 3: TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # max_features limits the number of terms used
feature_matrix = vectorizer.fit_transform(customer_data['CombinedFeatures'])

# Step 4: Include TotalSpent and TotalTransactions (numerical features) into the feature matrix
numerical_features = customer_data[['TotalSpent', 'TotalTransactions']].to_numpy()
numerical_feature_matrix = numerical_features  # already an ndarray; no extra copy needed

# Step 5: Combine text-based and numerical features (stack them horizontally)
final_feature_matrix = np.hstack([feature_matrix.toarray(), numerical_feature_matrix])

# Step 6: Calculate cosine similarity among all customers
similarity_matrix = cosine_similarity(final_feature_matrix)

# Step 7: Get the top similar customers for the first N_TARGET_CUSTOMERS customers
# customer_ids[i] labels row/column i of similarity_matrix.
customer_ids = customer_data['CustomerID'].tolist()
lookalike_map = {}

for customer_id in customer_ids[:N_TARGET_CUSTOMERS]:
    top_similar = get_top_n_similar(
        customer_id, similarity_matrix, customer_ids, n=TOP_N_LOOKALIKES
    )
    # Convert to the desired format: [{SimilarCustomerID: SimilarityScore}, ...]
    lookalike_map[customer_id] = [{similar_id: score} for similar_id, score in top_similar]

# Step 8: Save the results to the output CSV in the required format
output_data = []

for customer_id, similar_list in lookalike_map.items():
    # Render the neighbours as "[{ID: score}, {ID: score}, ...]"
    lookalikes_str = '[' + ', '.join(
        f'{{{k}: {v}}}' for d in similar_list for k, v in d.items()
    ) + ']'
    output_data.append({
        'cust_id': customer_id,
        'List': lookalikes_str  # List of dictionaries in string format
    })

# Explicit columns keep the CSV header intact even if there are no rows.
lookalike_df = pd.DataFrame(output_data, columns=['cust_id', 'List'])

# Save to CSV
lookalike_df.to_csv(OUTPUT_PATH, index=False)

# Report the file that was actually written (the old message named a different file).
print(f"Lookalike Model executed successfully. Results saved to {OUTPUT_PATH}.")


Lookalike Model executed successfully. Results saved to Lookalike.csv.
