In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the three raw input tables (customers, products, transactions) from the
# working directory; each file name maps positionally onto the name on the left.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "products.csv", "Transactions.csv")
)

In [None]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default join type (inner), so a transaction whose
# CustomerID or ProductID has no match in the lookup table is dropped.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)
print(merged_data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [None]:
# Persist the joined table (without the row index) so later cells can reload it.
merged_data.to_csv(path_or_buf="processed_data.csv", index=False)


In [None]:
# Re-read the customer table and the previously saved joined table from disk.
customers = pd.read_csv('Customers.csv')
merged_data = pd.read_csv('processed_data.csv')

# Preview the first rows of each frame, one after the other.
print(customers.head(), merged_data.head(), sep='\n')


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2  

In [None]:
# Build a customers x products table: one row per CustomerID, one column per
# ProductID, each cell holding the summed TotalValue for that pair.
customer_product_matrix = pd.pivot_table(
    merged_data,
    values='TotalValue',
    index='CustomerID',
    columns='ProductID',
    aggfunc='sum',
    fill_value=0,  # customer/product pairs with no transactions become 0
)

print(customer_product_matrix.head())


ProductID   P001    P002  P003    P004  P005    P006  P007   P008  P009  P010  \
CustomerID                                                                      
C0001        0.0     0.0   0.0    0.00   0.0    0.00   0.0    0.0   0.0   0.0   
C0002        0.0     0.0   0.0  382.76   0.0    0.00   0.0    0.0   0.0   0.0   
C0003        0.0  1385.2   0.0    0.00   0.0  363.96   0.0    0.0   0.0   0.0   
C0004        0.0     0.0   0.0    0.00   0.0    0.00   0.0  293.7   0.0   0.0   
C0005        0.0     0.0   0.0    0.00   0.0    0.00   0.0    0.0   0.0   0.0   

ProductID   ...  P091  P092  P093  P094    P095    P096    P097  P098  P099  \
CustomerID  ...                                                               
C0001       ...   0.0   0.0   0.0   0.0    0.00  614.94    0.00   0.0   0.0   
C0002       ...   0.0   0.0   0.0   0.0  454.52    0.00    0.00   0.0   0.0   
C0003       ...   0.0   0.0   0.0   0.0    0.00    0.00    0.00   0.0   0.0   
C0004       ...   0.0   0.0   0.0   0

In [None]:
# Keep only the identifier and the 'Region' attribute from the customers table.
customer_profile = customers.loc[:, ['CustomerID', 'Region']]

# Left-join the profile onto the spend matrix so every matrix row is kept.
# 'CustomerID' is the index of the matrix and a column of the profile; the printed
# result shows it as an ordinary column afterwards.
customer_features = pd.merge(
    customer_product_matrix,
    customer_profile,
    how='left',
    on='CustomerID',
)

print(customer_features.head())


  CustomerID  P001    P002  P003    P004  P005    P006  P007   P008  P009  \
0      C0001   0.0     0.0   0.0    0.00   0.0    0.00   0.0    0.0   0.0   
1      C0002   0.0     0.0   0.0  382.76   0.0    0.00   0.0    0.0   0.0   
2      C0003   0.0  1385.2   0.0    0.00   0.0  363.96   0.0    0.0   0.0   
3      C0004   0.0     0.0   0.0    0.00   0.0    0.00   0.0  293.7   0.0   
4      C0005   0.0     0.0   0.0    0.00   0.0    0.00   0.0    0.0   0.0   

   ...  P092  P093  P094    P095    P096    P097  P098  P099  P100  \
0  ...   0.0   0.0   0.0    0.00  614.94    0.00   0.0   0.0   0.0   
1  ...   0.0   0.0   0.0  454.52    0.00    0.00   0.0   0.0   0.0   
2  ...   0.0   0.0   0.0    0.00    0.00    0.00   0.0   0.0   0.0   
3  ...   0.0   0.0   0.0    0.00    0.00  958.02   0.0   0.0   0.0   
4  ...   0.0   0.0   0.0    0.00    0.00    0.00   0.0   0.0   0.0   

          Region  
0  South America  
1           Asia  
2  South America  
3  South America  
4           Asia  

[

In [None]:
# Keep only the per-product spend columns. Note that 'Region' is removed here, so
# everything computed from the scaled features ignores the region attribute.
customer_features_numeric = customer_features.drop(columns=['CustomerID', 'Region'])

# Standardise each product column with StandardScaler.
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features_numeric)

# Wrap the scaled array back into a labelled DataFrame and re-attach the customer
# identifier as the last column (aligned on the row index).
customer_features_scaled_df = pd.DataFrame(
    customer_features_scaled,
    columns=customer_features_numeric.columns,
).assign(CustomerID=customer_features['CustomerID'])

# Preview the scaled features.
print(customer_features_scaled_df.head())



       P001      P002      P003      P004      P005      P006      P007  \
0 -0.191554 -0.198313 -0.234031 -0.173045 -0.201759 -0.246148 -0.189473   
1 -0.191554 -0.198313 -0.234031  7.479386 -0.201759 -0.246148 -0.189473   
2 -0.191554  6.115967 -0.234031 -0.173045 -0.201759  3.835813 -0.189473   
3 -0.191554 -0.198313 -0.234031 -0.173045 -0.201759 -0.246148 -0.189473   
4 -0.191554 -0.198313 -0.234031 -0.173045 -0.201759 -0.246148 -0.189473   

       P008      P009     P010  ...     P092      P093      P094      P095  \
0 -0.220871 -0.172791 -0.22536  ... -0.20038 -0.178562 -0.204804 -0.178108   
1 -0.220871 -0.172791 -0.22536  ... -0.20038 -0.178562 -0.204804  3.552778   
2 -0.220871 -0.172791 -0.22536  ... -0.20038 -0.178562 -0.204804 -0.178108   
3  4.405786 -0.172791 -0.22536  ... -0.20038 -0.178562 -0.204804 -0.178108   
4 -0.220871 -0.172791 -0.22536  ... -0.20038 -0.178562 -0.204804 -0.178108   

       P096      P097      P098     P099      P100  CustomerID  
0  2.779355 -0.

In [None]:
# Pairwise cosine similarity between customers, computed on the scaled product
# columns only (the identifier column is removed first). Row/column i of the
# result corresponds to row i of customer_features_scaled_df.
similarity_matrix = cosine_similarity(
    customer_features_scaled_df.drop(columns='CustomerID')
)

# Show the top-left 5x5 corner as a quick sanity check.
print(similarity_matrix[:5, :5])


[[ 1.         -0.04882928 -0.06147586 -0.07906018 -0.05168909]
 [-0.04882928  1.         -0.03569919 -0.05168252 -0.02306581]
 [-0.06147586 -0.03569919  1.          0.04022236  0.24429628]
 [-0.07906018 -0.05168252  0.04022236  1.          0.07985298]
 [-0.05168909 -0.02306581  0.24429628  0.07985298  1.        ]]


In [None]:
# Lookalike settings: how many leading customers to score and how many matches
# to keep for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

# Row i of similarity_matrix is looked up against customer_ids[i] below, so the
# two must have the same length. Fail loudly instead of emitting mislabelled
# recommendations if an upstream cell ever changes the number of rows.
customer_ids = customer_product_matrix.index
if similarity_matrix.shape[0] != len(customer_ids):
    raise ValueError(
        f"similarity_matrix has {similarity_matrix.shape[0]} rows but "
        f"customer_product_matrix has {len(customer_ids)} customers"
    )

lookalike_data = []

# Score the first N_TARGET_CUSTOMERS IDs in matrix order. These are C0001..C0020
# only if each of those customers appears in the matrix -- TODO confirm.
for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Rank all customers from most to least similar (stable sort => deterministic ties).
    ranked = np.argsort(-similarity_matrix[idx], kind='stable')

    # Remove the customer itself by position instead of assuming it sorted first:
    # another customer with an identical feature vector ties at the top score, and
    # slicing [1:4] could then drop that match and keep the customer itself.
    similar_indices = ranked[ranked != idx][:TOP_K]
    similar_customers = customer_ids[similar_indices]
    similarity_scores = similarity_matrix[idx][similar_indices]

    # One output row per (customer, lookalike, score) triple.
    for sim_customer, score in zip(similar_customers, similarity_scores):
        lookalike_data.append([customer_id, sim_customer, score])

# Convert the collected rows into a DataFrame
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Display the lookalike recommendations
print(lookalike_df.head())

# Save the lookalike recommendations to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike recommendations saved to 'Lookalike.csv'.")


  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0194         0.404928
1      C0001               C0104         0.374002
2      C0001               C0020         0.366609
3      C0002               C0030         0.404617
4      C0002               C0091         0.383778
Lookalike recommendations saved to 'Lookalike.csv'.
