In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split, cross_validate
from sklearn.metrics import classification_report, accuracy_score

file_path = "dataset.xlsx"
df = pd.read_excel(file_path)

df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (df['InvoiceDate'].max() - x.max()).days,
    'InvoiceNo': 'count',
    'TotalPrice': 'sum'
}).reset_index()

rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']

rfm['R_Score'] = pd.qcut(rfm['Recency'].rank(method='first'), q=4, labels=[4, 3, 2, 1])
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), q=4, labels=[1, 2, 3, 4])
rfm['M_Score'] = pd.qcut(rfm['Monetary'].rank(method='first'), q=4, labels=[1, 2, 3, 4])

rfm['RFM_Segment'] = rfm['R_Score'].astype(str) + rfm['F_Score'].astype(str) + rfm['M_Score'].astype(str)
rfm['RFM_Score'] = rfm[['R_Score', 'F_Score', 'M_Score']].sum(axis=1)

rfm['Segment'] = pd.cut(rfm['RFM_Score'], bins=[3, 7, 10, 12], labels=['Low', 'Medium', 'High'], include_lowest=True)

X = rfm[['Recency', 'Frequency', 'Monetary']]
y = rfm['Segment']


model=RandomForestClassifier(random_state=42)


kf = KFold(n_splits=5, shuffle=False)

y_true_all = []
y_pred_all = []

for train_index, test_index in kf.split(X):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  y_true_all.extend(y_test)
  y_pred_all.extend(y_pred)

accuracy = accuracy_score(y_true_all, y_pred_all)
print(f"Accuracy score: {accuracy:.4f}")
print("Classification report:")
print(classification_report(y_true_all, y_pred_all))


Accuracy score: 0.8870
Classification report:
              precision    recall  f1-score   support

        High       0.75      0.46      0.57        13
         Low       0.93      0.91      0.92       114
      Medium       0.86      0.91      0.88       112

    accuracy                           0.89       239
   macro avg       0.85      0.76      0.79       239
weighted avg       0.89      0.89      0.88       239

