In [18]:
import pandas as pd

# Load the three raw tables from CSV files in the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Preview the first rows of each table. sep="\n" places the heading on its
# own line directly above the frame, exactly as two separate prints would.
print("Customers Data:", customers.head(), sep="\n")
print("\nProducts Data:", products.head(), sep="\n")
print("\nTransactions Data:", transactions.head(), sep="\n")

Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  10-07-2022
1      C0002      Elizabeth Lutz           Asia  13-02-2022
2      C0003      Michael Rivera  South America  07-03-2024
3      C0004  Kathleen Rodriguez  South America  09-10-2022
4      C0005         Laura Weber           Asia  15-08-2022

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
  TransactionID CustomerID ProductID   TransactionDate  Quantity  TotalValue  \
0        T00001      C0199      P067  25-08-2024 12:38         1      300.68   
1        T00112      C0146      P067  27-05-2024 22:23         1      300.68   


In [19]:
# Join each transaction to its customer, then enrich it with product details.
# Both `transactions` and `products` carry a `Price` column, so pandas keeps
# them apart with its default suffixes: `Price_x` (from the transaction side)
# and `Price_y` (from the product catalogue).
customer_transactions = (
    customers
    .merge(transactions, on='CustomerID')
    .merge(products, on='ProductID')
)

print(customer_transactions.head())

  CustomerID      CustomerName         Region  SignupDate TransactionID  \
0      C0001  Lawrence Carroll  South America  10-07-2022        T00015   
1      C0001  Lawrence Carroll  South America  10-07-2022        T00932   
2      C0001  Lawrence Carroll  South America  10-07-2022        T00085   
3      C0001  Lawrence Carroll  South America  10-07-2022        T00445   
4      C0001  Lawrence Carroll  South America  10-07-2022        T00436   

  ProductID   TransactionDate  Quantity  TotalValue  Price_x  \
0      P054  19-01-2024 03:12         2      114.60    57.30   
1      P022  17-09-2024 09:01         3      412.62   137.54   
2      P096  08-04-2024 00:01         2      614.94   307.47   
3      P083  07-05-2024 03:11         2      911.44   455.72   
4      P029  02-11-2024 17:04         3     1300.92   433.64   

             ProductName     Category  Price_y  
0     SoundWave Cookbook        Books    57.30  
1     HomeSense Wall Art   Home Decor   137.54  
2   SoundWave Hea

In [20]:
# Collapse the transaction-level table to one row per customer:
#   TotalValue -> total spend
#   Quantity   -> mean quantity per transaction
#   Price_y    -> mean catalogue price of the purchased products
#   Category   -> most frequent category (first entry of mode() on ties)
customer_features = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'mean'),
        Price_y=('Price_y', 'mean'),
        Category=('Category', lambda categories: categories.mode()[0]),
    )
    .reset_index()
)

print(customer_features.head())

  CustomerID  TotalValue  Quantity     Price_y     Category
0      C0001     3354.52  2.400000  278.334000  Electronics
1      C0002     1862.74  2.500000  208.920000     Clothing
2      C0003     2725.38  3.500000  195.707500   Home Decor
3      C0004     5354.88  2.875000  240.636250        Books
4      C0005     2034.24  2.333333  291.603333  Electronics


In [22]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric features with MinMaxScaler so that no single column
# dominates the similarity computation purely because of its units.
numeric_columns = ['TotalValue', 'Quantity', 'Price_y']

scaler = MinMaxScaler()
customer_features[numeric_columns] = scaler.fit_transform(
    customer_features[numeric_columns]
)
print(customer_features.head())

  CustomerID  TotalValue  Quantity   Price_y     Category
0      C0001    0.308942  0.466667  0.519414  Electronics
1      C0002    0.168095  0.500000  0.367384     Clothing
2      C0003    0.249541  0.833333  0.338446   Home Decor
3      C0004    0.497806  0.625000  0.436848        Books
4      C0005    0.184287  0.444444  0.548476  Electronics


In [23]:
# One-hot encode the favourite category. drop_first=True omits the first
# category level, which then acts as the implicit baseline (all dummies False).
# NOTE: this rebinds `customer_features` without its `Category` column, so the
# cell expects a frame that still has that column when it runs.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Category'],
    drop_first=True,
)
print(customer_features.head())

  CustomerID  TotalValue  Quantity   Price_y  Category_Clothing  \
0      C0001    0.308942  0.466667  0.519414              False   
1      C0002    0.168095  0.500000  0.367384               True   
2      C0003    0.249541  0.833333  0.338446              False   
3      C0004    0.497806  0.625000  0.436848              False   
4      C0005    0.184287  0.444444  0.548476              False   

   Category_Electronics  Category_Home Decor  
0                  True                False  
1                 False                False  
2                 False                 True  
3                 False                False  
4                  True                False  


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between customers, computed on every feature
# column; the CustomerID identifier (the first column) is left out.
similarity_matrix = cosine_similarity(
    customer_features.drop(columns=['CustomerID'])
)
print(similarity_matrix)

[[1.         0.31830244 0.37288421 ... 0.2359945  0.9938562  0.427406  ]
 [0.31830244 1.         0.35847614 ... 0.95568592 0.28767074 0.96831889]
 [0.37288421 0.35847614 1.         ... 0.20229317 0.33612445 0.46541912]
 ...
 [0.2359945  0.95568592 0.20229317 ... 1.         0.21521258 0.88556006]
 [0.9938562  0.28767074 0.33612445 ... 0.21521258 1.         0.37671551]
 [0.427406   0.96831889 0.46541912 ... 0.88556006 0.37671551 1.        ]]


In [26]:
def get_top_similar_customers(customer_id, similarity_matrix, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Value to look up in ``customer_features['CustomerID']``.
            If it occurs more than once, the first occurrence is used.
        similarity_matrix: 2-D array of pairwise similarities in which row
            (and column) ``i`` corresponds to the ``i``-th row, by position,
            of the module-level ``customer_features`` frame.
        top_n: Number of lookalikes to return (default 3).

    Returns:
        A list of ``(CustomerID, similarity)`` tuples ordered from most to
        least similar. The queried customer is never part of the result.

    Raises:
        IndexError: If ``customer_id`` is not present in ``customer_features``.
    """
    # Resolve the customer's *position* rather than its index label, so the
    # lookup stays aligned with the matrix rows even when the frame does not
    # carry a default 0..n-1 index.
    positions = (customer_features['CustomerID'] == customer_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_features")
    customer_pos = positions[0]

    similarities = similarity_matrix[customer_pos]

    # Rank everyone from most to least similar, then drop the customer
    # explicitly. Relying on "self is always the last argsort entry" breaks
    # when another customer ties with (or, through floating-point rounding,
    # slightly exceeds) the self-similarity.
    ranked = similarities.argsort()[::-1]
    top_indices = [i for i in ranked if i != customer_pos][:top_n]

    customer_ids = customer_features['CustomerID']
    return [(customer_ids.iloc[i], similarities[i]) for i in top_indices]

# Map each of the first 20 customers (by row order) to its closest matches,
# using the helper's default number of lookalikes.
lookalike_results = {
    source_id: get_top_similar_customers(source_id, similarity_matrix)
    for source_id in customer_features['CustomerID'].iloc[:20]
}
print(lookalike_results)

{'C0001': [('C0181', 0.9993909765550407), ('C0055', 0.996288368827149), ('C0035', 0.9954509356368294)], 'C0002': [('C0029', 0.9999803544922413), ('C0062', 0.9989387957445973), ('C0030', 0.9982676525966139)], 'C0003': [('C0136', 0.994591084822027), ('C0110', 0.9873039005963182), ('C0160', 0.9847926355091003)], 'C0004': [('C0017', 0.9996588486600123), ('C0087', 0.9988067874939532), ('C0041', 0.9986806572351384)], 'C0005': [('C0140', 0.9989940830900188), ('C0186', 0.998850936632242), ('C0199', 0.9977707437669561)], 'C0006': [('C0174', 0.9994849097311632), ('C0011', 0.9991525347051278), ('C0117', 0.9984388707207734)], 'C0007': [('C0115', 0.9981914167701215), ('C0050', 0.997830965761554), ('C0186', 0.9955086053936825)], 'C0008': [('C0152', 0.9954612312544835), ('C0179', 0.9940032344595229), ('C0164', 0.9922022810136649)], 'C0009': [('C0058', 0.9882490052876706), ('C0103', 0.985788371291185), ('C0198', 0.9832836135103465)], 'C0010': [('C0111', 0.9972569987802165), ('C0134', 0.996800711318034

In [27]:
import csv

# Flatten {customer: [(lookalike, score), ...]} into one CSV row per pair.
lookalike_rows = [
    [source_id, match_id, match_score]
    for source_id, matches in lookalike_results.items()
    for match_id, match_score in matches
]

# newline='' lets the csv module control line endings itself.
with open('Lookalike.csv', 'w', newline='') as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(['CustomerID', 'LookalikeID', 'SimilarityScore'])
    csv_writer.writerows(lookalike_rows)

print("Lookalike.csv file created successfully!")

Lookalike.csv file created successfully!
