In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the three input tables from CSV files in the working directory,
# in the order: customers, transactions, products.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

In [3]:
# Feature Engineering
def create_features(customers, transactions, products):
    """Build one row of purchase-behaviour features per customer.

    Transactions are joined to ``customers`` on ``CustomerID`` and to
    ``products`` on ``ProductID`` (pandas' default inner join), so only
    transactions that match a row in both tables contribute.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain a ``CustomerID`` column.
    transactions : pandas.DataFrame
        Must contain ``CustomerID``, ``ProductID`` and ``TotalValue`` columns.
    products : pandas.DataFrame
        Must contain ``ProductID`` and ``Category`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``CustomerID``, ``AvgTransactionValue`` (mean of
        ``TotalValue``), ``TotalSpend`` (sum of ``TotalValue``),
        ``PurchaseCount`` (count of non-null ``ProductID``) and
        ``FavoriteCategory`` (most frequent ``Category``; NaN when every
        category for that customer is missing).
    """
    def _most_common(values):
        # mode() ignores missing values, so it is empty when the whole group
        # is missing; indexing it blindly would raise instead of yielding NaN.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else float('nan')

    # Merge data
    data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

    # Aggregate features. Named aggregation binds each output column to its
    # source column and function explicitly, rather than relying on the
    # positional order of a flattened MultiIndex.
    features = data.groupby('CustomerID').agg(
        AvgTransactionValue=('TotalValue', 'mean'),
        TotalSpend=('TotalValue', 'sum'),
        PurchaseCount=('ProductID', 'count'),
        FavoriteCategory=('Category', _most_common),
    ).reset_index()

    return features

# Build the per-customer feature table from the three loaded frames.
features = create_features(customers, transactions, products)

In [4]:
# Preprocessing
numeric_columns = ['AvgTransactionValue', 'TotalSpend', 'PurchaseCount']
categorical_columns = ['FavoriteCategory']

# Numeric branch: fill gaps with the column mean, then standardise.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Numeric columns go through the pipeline above; the category column is
# one-hot encoded.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric_columns),
    ('cat', OneHotEncoder(), categorical_columns),
])

# Transform features
X = preprocessor.fit_transform(features)

In [5]:
# Calculate similarity
# Pairwise cosine similarity between the transformed feature rows;
# row/column i corresponds to the i-th row of `features`.
similarity_matrix = cosine_similarity(X)

# Get top 3 lookalikes for each customer
TOP_N = 3
customer_ids = features['CustomerID'].tolist()  # positional lookup, independent of the frame's index labels

lookalikes = {}
for idx, customer_id in enumerate(customer_ids):
    scores = similarity_matrix[idx]
    # Rank all rows from most to least similar and drop this customer's own
    # row by position. Slicing off "the last argsort entry" is not enough:
    # when another customer ties at the maximum, the self row can sort
    # before it and would be reported as its own lookalike.
    ranked = [i for i in scores.argsort()[::-1] if i != idx][:TOP_N]
    # Store scores as built-in floats so later printing/serialising does not
    # depend on how NumPy scalars are rendered.
    lookalikes[customer_id] = [(customer_ids[i], float(scores[i])) for i in ranked]

In [6]:
# One row per customer; each lookalike list is stored as its string form so
# it fits in a single CSV cell.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes.keys()),
    # Cast every score to a built-in float before stringifying: str() of a
    # list uses each element's repr, and NumPy >= 2 renders its float scalars
    # as "np.float64(...)", which would leak into the exported file.
    'Lookalikes': [
        str([(cust_id, float(score)) for cust_id, score in matches])
        for matches in lookalikes.values()
    ]
})

lookalike_df.to_csv('Jahnvi_sahni_Lookalike.csv', index=False)

In [7]:
# Preview the first 20 rows of the lookalike table as plain text.
print(lookalike_df.head(20))

   CustomerID                                         Lookalikes
0       C0001  [('C0072', 0.946105176936384), ('C0190', 0.941...
1       C0002  [('C0029', 0.9993708548979269), ('C0010', 0.99...
2       C0003  [('C0178', 0.999865879448906), ('C0052', 0.975...
3       C0004  [('C0021', 0.9997310950937549), ('C0101', 0.99...
4       C0005  [('C0112', 0.997617843800819), ('C0197', 0.995...
5       C0006  [('C0117', 0.9968489322097402), ('C0168', 0.97...
6       C0007  [('C0120', 0.9949897643552799), ('C0140', 0.97...
7       C0008  [('C0113', 0.9322254762561094), ('C0124', 0.90...
8       C0009  [('C0077', 0.9997899920904881), ('C0083', 0.99...
9       C0010  [('C0029', 0.99957123050574), ('C0002', 0.9979...
10      C0011  [('C0064', 0.9663847037168202), ('C0137', 0.92...
11      C0012  [('C0104', 0.9718143504908033), ('C0059', 0.94...
12      C0013  [('C0143', 0.9999265905268068), ('C0099', 0.98...
13      C0014  [('C0128', 0.9961184019336934), ('C0151', 0.99...
14      C0015  [('C0132',