In [42]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
customers = pd.read_csv(r"E:\Datasets\Zeotap Task\Customers.csv")
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
products = pd.read_csv(r"E:\Datasets\Zeotap Task\Products.csv")
transactions = pd.read_csv(r"E:\Datasets\Zeotap Task\Transactions.csv")
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

merged_data = transactions.merge(customers, on='CustomerID').merge(products.drop('Price',axis=1), on='ProductID')

In [44]:
latest_date = transactions['TransactionDate'].max()
customers['Tenure'] = (latest_date - customers['SignupDate']).dt.days
customers_encoded = pd.get_dummies(customers, columns=['Region'])
customer_features = customers_encoded[['CustomerID', 'Tenure'] + 
                                      list(customers_encoded.filter(like='Region_'))]

In [45]:
customer_features

Unnamed: 0,CustomerID,Tenure,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,902,False,False,False,True
1,C0002,1049,True,False,False,False
2,C0003,296,False,False,False,True
3,C0004,811,False,False,False,True
4,C0005,866,True,False,False,False
...,...,...,...,...,...,...
195,C0196,935,False,True,False,False
196,C0197,648,False,True,False,False
197,C0198,1035,False,True,False,False
198,C0199,756,False,True,False,False


In [46]:
trans_products = pd.merge(transactions, products, on='ProductID', how='left')
category_dummies = pd.get_dummies(trans_products['Category'], prefix='Category')
trans_products = pd.concat([trans_products['CustomerID'], category_dummies], axis=1)
category_counts = trans_products.groupby('CustomerID').sum().reset_index()


In [47]:
category_counts

Unnamed: 0,CustomerID,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,1,0,3,1
1,C0002,0,2,0,2
2,C0003,0,1,1,2
3,C0004,3,0,2,3
4,C0005,0,0,2,1
...,...,...,...,...,...
194,C0196,1,1,0,2
195,C0197,0,0,2,1
196,C0198,0,1,1,0
197,C0199,0,0,2,2


In [48]:
customer_trans = merged_data.groupby('CustomerID').agg(
    total_transactions=('TransactionID', 'count'),
    total_quantity=('Quantity', 'sum'),
    total_spent=('TotalValue', 'sum'),
    avg_price=('Price', 'mean'),
    last_transaction_date=('TransactionDate', 'max')
).reset_index()


customer_trans['recency'] = (latest_date - customer_trans['last_transaction_date']).dt.days
customer_trans.drop('last_transaction_date', axis=1, inplace=True)
customer_trans = pd.merge(customer_trans, category_counts, on='CustomerID', how='left')


In [49]:
customer_trans

Unnamed: 0,CustomerID,total_transactions,total_quantity,total_spent,avg_price,recency,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,5,12,3354.52,278.334000,55,1,0,3,1
1,C0002,4,10,1862.74,208.920000,25,0,2,0,2
2,C0003,4,14,2725.38,195.707500,125,0,1,1,2
3,C0004,8,23,5354.88,240.636250,4,3,0,2,3
4,C0005,3,7,2034.24,291.603333,54,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...
194,C0196,4,12,4982.88,416.992500,13,1,1,0,2
195,C0197,3,9,1928.65,227.056667,0,0,0,2,1
196,C0198,2,3,931.83,239.705000,84,0,1,1,0
197,C0199,4,9,1979.28,250.610000,63,0,0,2,2


In [50]:
combined_features = pd.merge(customer_features, customer_trans, on='CustomerID', how='left').fillna(0)

In [51]:
scaler = StandardScaler()
feature_cols = combined_features.columns.difference(['CustomerID'])
scaled_features = scaler.fit_transform(combined_features[feature_cols])
scaled_df = pd.DataFrame(scaled_features, columns=feature_cols)
scaled_df['CustomerID'] = combined_features['CustomerID']

In [54]:
scaled_df

Unnamed: 0,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America,Tenure,avg_price,recency,total_quantity,total_spent,total_transactions,CustomerID
0,-0.314627,-1.036192,1.555406,-0.215318,-0.538816,-0.577350,-0.546536,1.545908,1.152884,0.110366,-0.261657,-0.110735,-0.051884,0.000000,C0001
1,-1.213560,0.781689,-1.141830,0.681841,1.855921,-0.577350,-0.546536,-0.646869,1.605593,-0.854626,-0.685506,-0.434049,-0.862714,-0.451294,C0002
2,-1.213560,-0.127252,-0.242751,0.681841,-0.538816,-0.577350,-0.546536,1.545908,-0.713387,-1.038306,0.727326,0.212579,-0.393842,-0.451294,C0003
3,1.483240,-1.036192,0.656327,1.578999,-0.538816,-0.577350,-0.546536,1.545908,0.872636,-0.413708,-0.982201,1.667493,1.035375,1.353881,C0004
4,-1.213560,-1.036192,0.656327,-0.215318,1.855921,-0.577350,-0.546536,-0.646869,1.042017,0.294836,-0.275785,-0.919021,-0.769499,-0.902587,C0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,-0.314627,-0.127252,-1.141830,0.681841,-0.538816,1.732051,-0.546536,-0.646869,1.254513,2.037994,-0.855046,-0.110735,0.833181,-0.451294,C0196
196,-1.213560,-1.036192,0.656327,-0.215318,-0.538816,1.732051,-0.546536,-0.646869,0.370652,-0.602491,-1.038714,-0.595706,-0.826890,-0.902587,C0197
197,-1.213560,-0.127252,-0.242751,-1.112477,-0.538816,1.732051,-0.546536,-0.646869,1.562478,-0.426654,0.148065,-1.565649,-1.368694,-1.353881,C0198
198,-1.213560,-1.036192,0.656327,0.681841,-0.538816,1.732051,-0.546536,-0.646869,0.703255,-0.275053,-0.148630,-0.595706,-0.799371,-0.451294,C0199


In [56]:
features_matrix = scaled_df.drop('CustomerID', axis=1).values
features_matrix.shape

(200, 14)

In [57]:
similarity_matrix = cosine_similarity(features_matrix)

In [59]:
similarity_matrix.shape

(200, 200)

In [60]:
similarity_df = pd.DataFrame(similarity_matrix, index=scaled_df['CustomerID'], 
                             columns=scaled_df['CustomerID'])

In [61]:
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,-0.160357,0.267836,0.466293,0.270910,0.132691,0.255616,-0.060440,-0.173456,-0.354824,...,0.500640,0.680746,-0.199890,-0.206179,0.068049,-0.117568,0.197966,0.080898,0.209774,-0.279308
C0002,-0.160357,1.000000,0.043325,-0.229196,0.607333,-0.302448,0.462176,-0.121776,0.161946,0.353797,...,-0.530905,-0.077291,0.089756,-0.413741,0.001195,0.087457,0.143039,0.337392,0.154781,0.543987
C0003,0.267836,0.043325,1.000000,0.085468,-0.086537,0.328843,-0.114489,-0.031556,-0.043463,-0.028093,...,0.063561,0.368126,-0.264856,-0.169344,0.718734,-0.397271,0.010982,-0.033306,0.066580,-0.324028
C0004,0.466293,-0.229196,0.085468,1.000000,-0.327643,0.183897,-0.349653,0.435091,-0.778297,-0.401999,...,0.354949,-0.109473,-0.116029,0.211769,0.347158,-0.017280,-0.263584,-0.579483,-0.174141,-0.071585
C0005,0.270910,0.607333,-0.086537,-0.327643,1.000000,-0.331802,0.924108,-0.332473,0.236475,-0.128256,...,-0.149189,0.231687,0.208258,-0.411090,-0.349599,0.028663,0.394333,0.425473,0.377619,0.386211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.117568,0.087457,-0.397271,-0.017280,0.028663,0.121957,0.124678,-0.188530,0.299804,0.130914,...,-0.296588,-0.228453,-0.165338,-0.372804,-0.164469,1.000000,0.300070,0.315161,0.387496,0.099525
C0197,0.197966,0.143039,0.010982,-0.263584,0.394333,-0.414603,0.213341,-0.218829,0.634467,0.421392,...,-0.064295,0.036752,-0.336772,-0.310580,-0.178196,0.300070,1.000000,0.756278,0.899245,-0.244656
C0198,0.080898,0.337392,-0.033306,-0.579483,0.425473,-0.254899,0.359890,-0.565738,0.848696,0.640119,...,-0.218880,0.383976,-0.001459,-0.558794,-0.440974,0.315161,0.756278,1.000000,0.710274,-0.251181
C0199,0.209774,0.154781,0.066580,-0.174141,0.377619,-0.463743,0.302068,-0.084247,0.555703,0.276627,...,-0.273951,0.053259,-0.303914,-0.265961,-0.138613,0.387496,0.899245,0.710274,1.000000,-0.351402


In [80]:
target_customers = scaled_df['CustomerID'].head(20).values
lookalike_data = []

for target in target_customers:
    sim_scores = similarity_df.loc[target].drop(target).sort_values(ascending=False)[:3]
    row = [target]
    for cust_id, score in sim_scores.items():
        row.extend([cust_id, round(score, 4)])
    lookalike_data.append(row)


lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'Lookalike1', 'Score1', 
                                                    'Lookalike2', 'Score2', 'Lookalike3', 'Score3'])

In [81]:
lookalike_df

Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0120,0.783,C0118,0.7828,C0096,0.712
1,C0002,C0159,0.9332,C0134,0.9192,C0106,0.919
2,C0003,C0031,0.8339,C0129,0.7994,C0052,0.7682
3,C0004,C0113,0.9383,C0104,0.8277,C0012,0.7758
4,C0005,C0007,0.9241,C0140,0.9029,C0186,0.8373
5,C0006,C0187,0.8904,C0137,0.743,C0085,0.7118
6,C0007,C0005,0.9241,C0140,0.8304,C0186,0.7862
7,C0008,C0059,0.8013,C0098,0.7953,C0194,0.7817
8,C0009,C0061,0.8633,C0198,0.8487,C0167,0.7903
9,C0010,C0061,0.7656,C0062,0.7306,C0132,0.7158


In [82]:
lookalike_df.to_csv('Ajayraj_SinghRathore_Lookalike.csv', index=False)