In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(".."))
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_processing import create_aggregate_features, extract_time_features, build_pipeline
# Load the uploaded dataset
df = pd.read_csv('../data/raw/data.csv')

In [2]:

from datetime import datetime

# Assuming df is your transaction data with 'CustomerId' and 'TransactionStartTime' columns

df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Define snapshot date (analysis date)
snapshot_date = df['TransactionStartTime'].max() + pd.Timedelta(days=1)

# Calculate RFM
rfm = df.groupby('CustomerId').agg({
    'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,
    'TransactionId': 'count',
    'Amount': 'sum'
}).reset_index()

rfm.columns = ['CustomerId', 'Recency', 'Frequency', 'Monetary']

rfm.head()


Unnamed: 0,CustomerId,Recency,Frequency,Monetary
0,CustomerId_1,84,1,-10000.0
1,CustomerId_10,84,1,-10000.0
2,CustomerId_1001,90,5,20000.0
3,CustomerId_1002,26,11,4225.0
4,CustomerId_1003,12,6,20000.0


In [3]:
from sklearn.preprocessing import StandardScaler

features = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[features])


In [4]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

rfm.head()


  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,CustomerId,Recency,Frequency,Monetary,Cluster
0,CustomerId_1,84,1,-10000.0,0
1,CustomerId_10,84,1,-10000.0,0
2,CustomerId_1001,90,5,20000.0,0
3,CustomerId_1002,26,11,4225.0,1
4,CustomerId_1003,12,6,20000.0,1


In [5]:
cluster_stats = rfm.groupby('Cluster').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': 'mean',
    'CustomerId': 'count'
}).reset_index()

print(cluster_stats)


   Cluster    Recency    Frequency      Monetary  CustomerId
0        0  61.877279     7.720196  8.172068e+04        1426
1        1  12.726566    34.800000  2.725741e+05        2315
2        2  29.000000  4091.000000 -1.049000e+08           1


In [6]:
# Example: Suppose Cluster 2 is least engaged
high_risk_cluster = 2

rfm['is_high_risk'] = (rfm['Cluster'] == high_risk_cluster).astype(int)
rfm.head()


Unnamed: 0,CustomerId,Recency,Frequency,Monetary,Cluster,is_high_risk
0,CustomerId_1,84,1,-10000.0,0,0
1,CustomerId_10,84,1,-10000.0,0,0
2,CustomerId_1001,90,5,20000.0,0,0
3,CustomerId_1002,26,11,4225.0,1,0
4,CustomerId_1003,12,6,20000.0,1,0


In [7]:
df = df.merge(rfm[['CustomerId', 'is_high_risk']], on='CustomerId', how='left')
df['is_high_risk'].value_counts()


is_high_risk
0    91571
1     4091
Name: count, dtype: int64

In [8]:
df.to_csv('../data/processed/processed_data.csv', index=False)
