In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##  Import necessary libraries

In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

## Load Dataset

In [18]:
customers=pd.read_csv("/content/drive/MyDrive/Assig/Customers.csv")

In [19]:
products = pd.read_csv('/content/drive/MyDrive/Assig/Products.csv')

In [20]:
transactions = pd.read_csv('/content/drive/MyDrive/Assig/Transactions.csv')

# Data Preprocessing

In [21]:
# Merge 'transactions' with 'products' dataframe to include the product category
transactions = transactions.merge(products[['ProductID', 'Category']], on='ProductID', how='left')

In [22]:
# One-Hot Encoding of Product Categories
category_dummies = pd.get_dummies(transactions['Category'])
transactions = pd.concat([transactions, category_dummies], axis=1)

In [23]:
# Aggregate transaction features for each customer
# Sum of the one-hot encoded product category values by customer to create a feature vector
customer_profiles = transactions.groupby('CustomerID').agg({
    cat: 'sum' for cat in category_dummies.columns
}).reset_index()

In [24]:
# Normalize customer profiles by scaling values
customer_profiles_normalized = customer_profiles.iloc[:, 1:].apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)

# Calculate similarity scores

In [25]:
# Calculate similarity scores (cosine similarity)
cosine_sim = cosine_similarity(customer_profiles_normalized)

In [26]:
# Recommend top 3 lookalikes for each customer (C0001 - C0020)
lookalike_dict = {}

for customer_id in customers['CustomerID'][:20]:  # for customers C0001 to C0020
    customer_index = customers[customers['CustomerID'] == customer_id].index[0]

    # Get similarity scores with other customers
    similarity_scores = list(enumerate(cosine_sim[customer_index]))

    # Sort customers by similarity score (descending) and exclude the customer itself
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get top 3 lookalikes (excluding the first one as it's the customer itself)
    top_lookalikes = sorted_scores[1:4]

    # Store the result in the dictionary
    lookalike_dict[customer_id] = [(customers.iloc[i[0]]['CustomerID'], i[1]) for i in top_lookalikes]


# Save Lookalike Recommendations to CSV

In [27]:
# Create the Lookalike.csv file
lookalike_df = pd.DataFrame(columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

for customer_id, lookalikes in lookalike_dict.items():
    for lookalike, score in lookalikes:
        # Create a temporary DataFrame for the new row
        new_row = pd.DataFrame({'CustomerID': [customer_id], 'LookalikeCustomerID': [lookalike], 'SimilarityScore': [score]})
        # Concatenate the new row with the existing DataFrame
        lookalike_df = pd.concat([lookalike_df, new_row], ignore_index=True)

# Save the lookalikes to a CSV file
lookalike_df.to_csv('Neha_Lookalike.csv', index=False)

print("Lookalike model completed and saved to Lookalike.csv.")

Lookalike model completed and saved to Lookalike.csv.


  lookalike_df = pd.concat([lookalike_df, new_row], ignore_index=True)
