In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime so the CSVs stored there can be
# read through ordinary file paths.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
# Single source of truth for where the dataset lives, so relocating the CSVs
# means editing one line instead of three.
DATA_DIR = '/content/drive/My Drive'

# Load the three raw tables used by the rest of the notebook.
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')


In [None]:
# Convert the date columns to datetime values (replacing the columns in place
# on the loaded frames).
for frame, date_column in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_column] = pd.to_datetime(frame[date_column])

# Attach customer attributes, then product attributes, to every transaction.
# Both merges use pandas' default join type.
transactions_with_customers = transactions.merge(customers, on="CustomerID")
merged_data = transactions_with_customers.merge(products, on="ProductID")


In [None]:

# One row per customer: sum of TotalValue, sum of Quantity, number of distinct
# ProductID values, and the latest SignupDate among the customer's rows.
profile_aggregations = {
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'ProductID': 'nunique',
    'SignupDate': 'max',
}
transactions_by_customer = merged_data.groupby('CustomerID')
# reset_index() turns the CustomerID group key back into an ordinary column.
customer_profiles = transactions_by_customer.agg(profile_aggregations).reset_index()

from sklearn.preprocessing import MinMaxScaler
# Min-max scale the three numeric profile columns so they share a common
# range; the scaled values overwrite the raw aggregates in customer_profiles.
feature_columns = ['TotalValue', 'Quantity', 'ProductID']
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(customer_profiles[feature_columns])
customer_profiles[feature_columns] = scaled_features
print(customer_profiles.head())


  CustomerID  TotalValue  Quantity  ProductID SignupDate
0      C0001    0.308942  0.354839   0.444444 2022-07-10
1      C0002    0.168095  0.290323   0.333333 2022-02-13
2      C0003    0.249541  0.419355   0.333333 2024-03-07
3      C0004    0.497806  0.709677   0.777778 2022-10-09
4      C0005    0.184287  0.193548   0.222222 2022-08-15


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Feature matrix: every profile column except the identifier and the signup
# date, neither of which is passed to the similarity computation.
profile_features = customer_profiles.drop(['CustomerID', 'SignupDate'], axis=1)
# Pairwise cosine similarity between the rows (customers) of the feature matrix.
similarity_matrix = cosine_similarity(profile_features)

def get_similar_customers(customer_id, N=3, profiles=None, similarities=None):
    """Return the N customers most similar to ``customer_id``, best match first.

    Args:
        customer_id: Value to look up in the ``CustomerID`` column.
        N: Maximum number of lookalikes to return. Fewer are returned when
            fewer other customers exist; a non-positive N yields an empty list.
        profiles: DataFrame with a ``CustomerID`` column whose row order
            matches ``similarities``. Defaults to the notebook-level
            ``customer_profiles``.
        similarities: Square matrix where entry ``[i][j]`` is the similarity
            between rows ``i`` and ``j`` of ``profiles``. Defaults to the
            notebook-level ``similarity_matrix``.

    Returns:
        A list of ``(CustomerID, similarity_score)`` tuples sorted by
        descending score. The queried customer is never included.

    Raises:
        IndexError: If ``customer_id`` is not present in ``profiles``.
    """
    # Fall back to the notebook-level objects when no explicit data is given.
    if profiles is None:
        profiles = customer_profiles
    if similarities is None:
        similarities = similarity_matrix

    customer_ids = profiles['CustomerID']

    # Look the customer up by row *position* so the result indexes the
    # similarity matrix correctly even if the frame's index is not 0..n-1.
    positions = (customer_ids == customer_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer profiles")
    customer_idx = positions[0]

    if N <= 0:
        return []

    scores = similarities[customer_idx]

    # Rank best-first and drop the queried customer by position. Slicing off
    # the last argsort entry is only correct when the self-similarity is the
    # strict maximum of the row; that breaks for an all-zero feature vector
    # (self-similarity 0) and for ties or floating-point noise, in which case
    # a real match would be lost and the customer could be returned as its
    # own lookalike.
    ranked = [i for i in scores.argsort()[::-1] if i != customer_idx]
    return [(customer_ids.iloc[i], scores[i]) for i in ranked[:N]]

# Smoke-test the lookup for one customer. The count shown in the message and
# the count requested from the function come from the same variable, so the
# label cannot drift from what is actually returned.
test_customer_id = 'C0001'
top_n = 3
print(f"Top {top_n} similar customers for {test_customer_id}: {get_similar_customers(test_customer_id, N=top_n)}")


Top 3 similar customers for C0001: [('C0173', 0.9999822241148129), ('C0177', 0.9999136617563172), ('C0122', 0.9998426710884338)]


In [None]:

# Find lookalikes for the first 20 customers in customer_profiles.
first_twenty_ids = customer_profiles['CustomerID'][:20]
lookalike_map = {cid: get_similar_customers(cid) for cid in first_twenty_ids}

# Flatten {customer: [(lookalike, score), ...]} into one row per pair.
lookalikes = [
    [source_id, match_id, score]
    for source_id, matches in lookalike_map.items()
    for match_id, score in matches
]

lookalike_df = pd.DataFrame(lookalikes, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Persist the result (without the row index) and preview the first rows.
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)
print(lookalike_df.head())


  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0173         0.999982
1      C0001               C0177         0.999914
2      C0001               C0122         0.999843
3      C0002               C0030         0.999915
4      C0002               C0029         0.999896


In [None]:
# Write lookalike_df to CSV without the index column. This repeats the write
# already performed in the cell that builds lookalike_df.
lookalike_csv_name = 'FirstName_LastName_Lookalike.csv'
lookalike_df.to_csv(lookalike_csv_name, index=False)


In [None]:
from google.colab import files
# Ask Colab to download the exported CSV from the runtime to the browser.
download_name = 'FirstName_LastName_Lookalike.csv'
files.download(download_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>