In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score

# Read the three source tables (customers, products, transactions) from the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

ModuleNotFoundError: No module named 'sklearn'

In [1]:
# Install scikit-learn into the kernel's own environment; the import cell fails with
# ModuleNotFoundError until this has completed. Restart the kernel after installing
# so the freshly installed package is picked up.
%pip install scikit-learn

^C
Note: you may need to restart the kernel to use updated packages.


Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
     ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
     ---------------------------------------- 0.1/11.1 MB 4.1 MB/s eta 0:00:03
     - -------------------------------------- 0.4/11.1 MB 5.4 MB/s eta 0:00:02
     --- ------------------------------------ 1.0/11.1 MB 7.6 MB/s eta 0:00:02
     ---- ----------------------------------- 1.2/11.1 MB 6.4 MB/s eta 0:00:02
     ------- -------------------------------- 2.1/11.1 MB 8.8 MB/s eta 0:00:02
     --------- ------------------------------ 2.5/11.1 MB 8.5 MB/s eta 0:00:02
     ----------- ---------------------------- 3.2/11.1 MB 9.4 MB/s eta 0:00:01
     -------------- ------------------------- 3.9/11.1 MB 10.0 MB/s eta 0:00:01
     --------------- ------------------------ 4.2/11.1 MB 9.2 MB/s eta 0:00:01
     ---------------- ----------------------- 4.6/11.1 MB 9.5 MB/s eta 0:00:01
     ---------------- ----------------------


[notice] A new release of pip is available: 23.0.1 -> 25.0
[notice] To update, run: C:\Users\kulad\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
# Preprocess dates
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Aggregate transaction data per customer.
# Named aggregation yields flat, explicitly named columns in one step.
customer_transactions = transactions.groupby('CustomerID').agg(
    TotalSpend=('TotalValue', 'sum'),
    AvgSpend=('TotalValue', 'mean'),
    TransactionCount=('TotalValue', 'count'),
    TotalQuantity=('Quantity', 'sum'),
).reset_index()

agg_cols = ['TotalSpend', 'AvgSpend', 'TransactionCount', 'TotalQuantity']

# Join aggregated data with customers.
# Any aggregate columns left over from a previous execution of this cell are dropped
# first; otherwise the merge would produce TotalSpend_x / TotalSpend_y duplicates.
customers = (
    customers
    .drop(columns=agg_cols, errors='ignore')
    .merge(customer_transactions, on='CustomerID', how='left')
)
# Customers without any transaction get 0 for the aggregates only. Other columns
# (e.g. SignupDate, Region) are left as missing rather than being overwritten with 0.
customers[agg_cols] = customers[agg_cols].fillna(0)

# One-hot encode categorical features
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customers[['Region']]).toarray()
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(['Region']),
    index=customers.index,  # explicit alignment instead of relying on a default RangeIndex
)

# Concatenate encoded features (replacing any copies from an earlier run of this cell)
customers = pd.concat(
    [customers.drop(columns=region_df.columns, errors='ignore'), region_df],
    axis=1,
)


In [None]:
# Normalize numerical features
numeric_cols = ['TotalSpend', 'AvgSpend', 'TransactionCount', 'TotalQuantity']
scaled_cols = ['ScaledSpend', 'ScaledAvgSpend', 'ScaledTransactionCount', 'ScaledQuantity']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(customers[numeric_cols])
scaled_df = pd.DataFrame(
    scaled_features,
    columns=scaled_cols,
    index=customers.index,  # keep rows aligned with `customers` whatever its index is
)

# Final feature set for similarity calculations.
# Scaled columns from a previous execution are dropped first: duplicated column names
# would otherwise be selected twice below and silently double their weight.
customers = pd.concat(
    [customers.drop(columns=scaled_cols, errors='ignore'), scaled_df],
    axis=1,
)

# Compute similarity matrix (row i / column j = cosine similarity of customers i and j)
features = customers[scaled_cols + list(region_df.columns)]
similarity_matrix = cosine_similarity(features)


In [None]:
# Generate top 3 recommendations for each customer
top_n = 3
# Positional list of IDs: row i of similarity_matrix corresponds to customer_ids[i]
customer_ids = customers['CustomerID'].tolist()

lookalike_map = {}
for i, cust_id in enumerate(customer_ids):
    # Rank every *other* customer by descending similarity. The customer itself is
    # excluded explicitly: with tied scores (e.g. identical feature vectors) it is not
    # guaranteed to sort first, so slicing off position 0 could recommend a customer
    # to itself and drop a genuine neighbour.
    ranked = sorted(
        ((j, score) for j, score in enumerate(similarity_matrix[i]) if j != i),
        key=lambda pair: -pair[1],
    )
    # float() turns the NumPy scalar into a plain Python float so the stored score
    # prints/serialises as 0.987 rather than np.float64(0.987).
    lookalike_map[cust_id] = [
        (customer_ids[j], round(float(score), 3)) for j, score in ranked[:top_n]
    ]


In [None]:

# Clustering Step
# Partition customers into 4 groups using the same feature matrix as the similarity step
kmeans = KMeans(random_state=42, n_clusters=4)  # Adjust the number of clusters if needed
customers['Cluster'] = kmeans.fit(features).labels_

# Evaluate Clustering Quality
silhouette_avg = silhouette_score(features, labels=customers['Cluster'])
print(f"Silhouette Score for Clustering: {silhouette_avg:.2f}")

# Evaluate Lookalike Recommendations Against Clusters
def evaluate_lookalikes_with_clusters(customers, lookalike_map):
    """Return the fraction of recommendations that share a cluster with their target.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain the columns 'CustomerID' and 'Cluster'.
    lookalike_map : dict
        Maps a target customer ID to an iterable of ``(recommended_id, score)`` pairs.
        The score is ignored here.

    Returns
    -------
    float or int
        ``matching / total`` over all recommendations, or ``0`` when there are none.

    Raises
    ------
    KeyError
        If a target or recommended ID does not appear in ``customers['CustomerID']``.
    """
    # Build the ID -> cluster lookup once instead of scanning the whole frame with a
    # boolean mask for every ID. drop_duplicates keeps the first row per ID, which is
    # the row a `.values[0]` lookup on the filtered frame would have returned.
    cluster_of = (
        customers.drop_duplicates(subset='CustomerID')
        .set_index('CustomerID')['Cluster']
        .to_dict()
    )

    def _cluster(customer_id):
        # Fail with a message naming the offending ID rather than a bare lookup error
        try:
            return cluster_of[customer_id]
        except KeyError:
            raise KeyError(f"CustomerID {customer_id!r} not found in customers") from None

    correct_recommendations = 0
    total_recommendations = 0

    for target_id, recommendations in lookalike_map.items():
        target_cluster = _cluster(target_id)
        for rec_id, _ in recommendations:
            if _cluster(rec_id) == target_cluster:
                correct_recommendations += 1
            total_recommendations += 1

    # Guard against division by zero when no recommendations were supplied
    return correct_recommendations / total_recommendations if total_recommendations > 0 else 0

# Calculate Cluster Accuracy
# i.e. the share of lookalike recommendations that sit in the same cluster as their target
cluster_accuracy = evaluate_lookalikes_with_clusters(
    customers=customers,
    lookalike_map=lookalike_map,
)
print(f"Cluster Accuracy: {cluster_accuracy:.2%}")

# Save Lookalike Map to CSV
import csv


def _as_builtin(value):
    """Convert a NumPy scalar to the equivalent Python object; return anything else unchanged."""
    return value.item() if hasattr(value, 'item') else value


# One row per customer: the ID, then the repr of its [(recommended_id, score), ...] list.
with open('Lookalike.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['CustomerID', 'Recommendations'])
    for cust_id, recs in lookalike_map.items():
        # The list is written via its repr, so NumPy scalars must be unwrapped first;
        # under NumPy >= 2 they would otherwise appear in the file as np.float64(0.987).
        plain_recs = [(_as_builtin(rec_id), _as_builtin(score)) for rec_id, score in recs]
        writer.writerow([cust_id, plain_recs])

- Effective Customer Grouping:

The K-Means clustering method (k = 4) segments customers based on spending behavior and region, with the Silhouette Score (range −1 to 1; higher means better-separated clusters) indicating the quality of clustering.

- Accurate Lookalike Recommendations:

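Cosine similarity helps identify similar customers, providing reliable recommendations that align well with cluster groups, as measured by the share of recommendations that fall in the same cluster as their target customer (the "Cluster Accuracy" figure).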

- Importance of Data Preprocessing:

Handling missing values properly, scaling the numerical features, and one-hot encoding the categorical ones all significantly improve the model's accuracy.
