## Build a LOOKALIKE MODEL for customer recommendations based on both customer profile and transaction history, we can approach it using a Collaborative Filtering or Content-Based Filtering approach with Similarity Measurement.

In [17]:
# import necessary libraries
import pandas as pd
import numpy as np

# import datasets
customers = pd.read_csv('customers.csv')  
products = pd.read_csv('products.csv')
transactions = pd.read_csv('transactions.csv')

In [18]:

# Convert dates to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

### Content-based filtering is a recommendation system that suggests items to users based on their preferences and previous actions. It uses machine learning algorithms to analyze user profiles and item features to make recommendations. 


### Step 1: Data Cleaning and Preprocessing

In [19]:
print("Customers dataset null values\n",customers.isnull().sum()) 
print("Products dataset null values\n",products.isnull().sum()) 
print("Transactions dataset null values\n",transactions.isnull().sum()) 

Customers dataset null values
 CustomerID              0
CustomerName            0
Region                  0
SignupDate              0
Region_Asia             0
Region_Europe           0
Region_North America    0
Region_South America    0
dtype: int64
Products dataset null values
 ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
Transactions dataset null values
 TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [20]:
# Merge all datasets into a single dataframe
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')



In [21]:
print(merged_data)

    TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0          T00001      C0199      P067 2024-08-25 12:38:23         1   
1          T00112      C0146      P067 2024-05-27 22:23:54         1   
2          T00166      C0127      P067 2024-04-25 07:38:55         1   
3          T00272      C0087      P067 2024-03-26 22:55:37         2   
4          T00363      C0070      P067 2024-03-21 15:10:10         3   
..            ...        ...       ...                 ...       ...   
995        T00496      C0118      P037 2024-10-24 08:30:27         1   
996        T00759      C0059      P037 2024-06-04 02:15:24         3   
997        T00922      C0018      P037 2024-04-05 13:05:32         4   
998        T00959      C0115      P037 2024-09-29 10:16:02         2   
999        T00992      C0024      P037 2024-04-21 10:52:24         1   

     TotalValue  Price_x          CustomerName         Region SignupDate  \
0        300.68   300.68        Andrea Jenkins         Euro

### Step 2: Feature Engineering

In [22]:
from sklearn.preprocessing import OneHotEncoder

# Create an encoder with sparse_output=False to get a dense array
encoder = OneHotEncoder(sparse_output=False)

# Apply the encoding on the 'Region' column
region_encoded = encoder.fit_transform(customers[['Region']])

# Convert the encoded data into a DataFrame
region_df = pd.DataFrame(region_encoded, columns=encoder.get_feature_names_out(['Region']))

# Concatenate this encoded data back with the original customers DataFrame
customers = pd.concat([customers, region_df], axis=1)


### Step 3: Transaction History Vector

In [23]:
# Aggregate transaction data by customer and product category
customer_transactions = merged_data.groupby(['CustomerID', 'Category'])['Quantity'].sum().unstack().fillna(0)


### Step 4: Similarity Calculation using Cosine Similarity

In [24]:
# Calculate cosine similarity between customers' transaction histories
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(customer_transactions)

### Step 5: Lookalike Model Development

In [25]:
lookalike_map = {}

# For the first 20 customers (C0001 - C0020)
for customer_id in customers['CustomerID'][:20]:
    customer_idx = customers[customers['CustomerID'] == customer_id].index[0]
    similarity_with_others = similarity_scores[customer_idx]
    
    # Get top 3 similar customers
    top_similarities = np.argsort(similarity_with_others)[::-1][1:4]  # Skip the first one (self)
    top_similar_customers = customers.iloc[top_similarities]
    
    # Collect lookalike customer IDs and their similarity scores
    lookalike_map[customer_id] = [(top_similar_customers.iloc[i]['CustomerID'], similarity_with_others[top_similarities[i]]) for i in range(3)]


### Step 6: Prepare the Lookalike.csv file

In [26]:
lookalike_data = []

# For each customer, map top 3 lookalikes with their similarity scores
for customer_id, lookalikes in lookalike_map.items():
    lookalike_row = {'CustomerID': customer_id}
    for i, (lookalike_id, score) in enumerate(lookalikes):
        lookalike_row[f'Lookalike_{i+1}'] = lookalike_id
        lookalike_row[f'Score_{i+1}'] = round(score, 4)
    lookalike_data.append(lookalike_row)

In [27]:
# Convert to DataFrame and save to CSV
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Print the result for inspection
print(lookalike_df.head())

  CustomerID Lookalike_1  Score_1 Lookalike_2  Score_2 Lookalike_3  Score_3
0      C0001       C0157   0.9924       C0051   0.9773       C0196   0.9655
1      C0002       C0159   0.9989       C0133   0.9943       C0178   0.9845
2      C0003       C0170   0.9927       C0100   0.9857       C0194   0.9725
3      C0004       C0146   0.9984       C0119   0.9870       C0047   0.9707
4      C0005       C0007   0.9899       C0196   0.9839       C0127   0.9802


### very high similarity scores, mostly in the range of 0.98 to 1.00


In [29]:
print(lookalike_df)

   CustomerID Lookalike_1  Score_1 Lookalike_2  Score_2 Lookalike_3  Score_3
0       C0001       C0157   0.9924       C0051   0.9773       C0196   0.9655
1       C0002       C0159   0.9989       C0133   0.9943       C0178   0.9845
2       C0003       C0170   0.9927       C0100   0.9857       C0194   0.9725
3       C0004       C0146   0.9984       C0119   0.9870       C0047   0.9707
4       C0005       C0007   0.9899       C0196   0.9839       C0127   0.9802
5       C0006       C0135   0.9899       C0024   0.9878       C0082   0.9799
6       C0007       C0127   0.9901       C0005   0.9899       C0163   0.9878
7       C0008       C0149   0.9848       C0031   0.9800       C0129   0.9760
8       C0009       C0009   1.0000       C0092   0.9983       C0040   0.9762
9       C0010       C0010   1.0000       C0077   0.9995       C0033   0.9806
10      C0011       C0027   0.9924       C0016   0.9816       C0074   0.9793
11      C0012       C0065   0.9889       C0116   0.9792       C0179   0.9733

### The Lookalike Model is effective in recommending customers who are similar in their purchasing behavior and profile. These recommendations could be used for targeted marketing, personalized offers, or inventory management ,by understanding similar customers’ preferences).
The Similarity Score allows businesses to prioritize which lookalikes might be the most relevant for specific purposes (e.g., marketing campaign targeting, product recommendations).