In [1]:
# Load data: read the three source CSV files into DataFrames.
import pandas as pd

customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Data cleaning
# Print the per-column count of missing values for each table.
for table in (customers, products, transactions):
    print(table.isnull().sum())
# Remove fully duplicated rows from each table (rebinding the same names).
customers = customers.drop_duplicates()
products = products.drop_duplicates()
transactions = transactions.drop_duplicates()



CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [4]:
# Combine datasets: attach customer attributes to every transaction via
# CustomerID, then attach product attributes via ProductID.
customer_transactions = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)


In [5]:
# Feature selection: one-hot encode the Region column and append the
# resulting indicator columns (one per region value) to the merged table.
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
region_sparse = encoder.fit_transform(customer_transactions[['Region']])
encoded_region = region_sparse.toarray()
# Column labels come from the categories learned by the encoder.
region_frame = pd.DataFrame(encoded_region, columns=encoder.categories_[0])
customer_transactions = pd.concat([customer_transactions, region_frame], axis=1)


In [6]:
# Feature aggregation: collapse the transaction-level table to one row per
# CustomerID with three summary features.
# NOTE(review): only these three columns are aggregated; the one-hot Region
# columns added earlier are not carried into customer_features - confirm
# that this is intended.
aggregation_spec = {
    'TotalValue': 'sum',     # total spending
    'Category': 'nunique',   # number of distinct categories purchased
    'Quantity': 'mean',      # average quantity per transaction
}
per_customer = customer_transactions.groupby('CustomerID')
customer_features = per_customer.agg(aggregation_spec).reset_index()


In [7]:
# Compute similarity: standardize the per-customer features, then take the
# pairwise cosine similarity between all customers.
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# The identifier column is not a feature, so it is excluded before scaling.
feature_columns = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
normalized_features = scaler.fit_transform(feature_columns)

similarity_matrix = cosine_similarity(normalized_features)


In [13]:
# Keep only the first 20 rows of the per-customer feature table.
first_20_customers = customer_features.head(20)
# Get recommendations for the first 20 customers
import numpy as np  # np.argsort is used below and is not imported by any earlier cell


def get_top_n_similarities_for_subset(similarity_matrix, customer_subset, n=3):
    """Return the ``n`` most similar *other* customers for each customer.

    Args:
        similarity_matrix: 2-D array-like indexed as ``[i][j]``, where ``i``
            and ``j`` are row positions in ``customer_subset``. Row ``i`` is
            expected to hold one score per row of ``customer_subset``.
        customer_subset: DataFrame with a ``CustomerID`` column whose row
            order matches the rows of ``similarity_matrix``.
        n: Maximum number of similar customers to return per customer.

    Returns:
        dict mapping each ``CustomerID`` to a list of
        ``(other CustomerID, similarity)`` tuples ordered from most to least
        similar. A list is shorter than ``n`` when fewer than ``n`` other
        customers are available.
    """
    customer_ids = customer_subset['CustomerID'].to_numpy()
    recommendations = {}
    for i, customer_id in enumerate(customer_ids):
        scores = similarity_matrix[i]
        ranked = np.argsort(scores)[::-1]
        # Exclude the customer by index instead of assuming they rank first:
        # a zero feature vector (self-similarity 0) or a tie with another
        # customer can move the diagonal entry away from position 0.
        similar_indices = [j for j in ranked if j != i][:n]
        recommendations[customer_id] = [
            (customer_ids[j], scores[j]) for j in similar_indices
        ]
    return recommendations

# Build a cosine-similarity matrix from the first 20 rows of the normalized
# features, then look up the closest matches for each of those customers.
# NOTE(review): candidates are limited to these same 20 customers; the full
# similarity_matrix computed earlier is not used here - confirm that matches
# are not meant to be drawn from the whole customer base.
subset_features = normalized_features[:20]
subset_similarity_matrix = cosine_similarity(subset_features)
recommendations = get_top_n_similarities_for_subset(
    similarity_matrix=subset_similarity_matrix,
    customer_subset=first_20_customers,
)



In [15]:
# Save the recommendations to CSV
import csv  # csv.writer is used below and is not imported by any earlier cell

with open('VijayBabu_Karumanchi_Lookalike.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'Recommendations'])
    for customer_id, recs in recommendations.items():
        # Convert scores to built-in floats so the cell holds plain numbers;
        # NumPy >= 2 would otherwise render each score as "np.float64(...)".
        plain_recs = [(similar_id, float(score)) for similar_id, score in recs]
        writer.writerow([customer_id, plain_recs])
