In [52]:
import numpy as np
import pandas as pd 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier



#### Data Import

In [53]:
# Folder that holds the input CSV files. Kept in a single constant so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = 'D:/Deep_Learning/Keras/PPNCKH/Data Preprocessing'

# Training and test tables.
data = pd.read_csv(f'{DATA_DIR}/train_dataset.csv')
test = pd.read_csv(f'{DATA_DIR}/test_dataset.csv')

#### Date-Time Parsing

In [54]:
def date_time(df):
    """Parse the known date columns of ``df`` into pandas datetimes.

    Only the listed columns that are actually present in ``df`` are converted.
    Values that cannot be parsed become ``NaT`` (``errors='coerce'``).
    The frame is modified in place and the same object is returned.
    """
    known_date_columns = (
        'Date_Registered', 'payment_datetime', 'purchased_datetime',
        'released_date', 'estimated_delivery_date', 'received_date',
    )
    present = [name for name in known_date_columns if name in df.columns]
    for name in present:
        df[name] = pd.to_datetime(df[name], errors='coerce')
    return df

# Parse the date columns of both tables (train first, then test).
data, test = date_time(data), date_time(test)

In [56]:
# Drop every row that repeats an earlier row in all columns except 'id'.
is_repeat = data.drop(columns=['id']).duplicated()
data = data[~is_repeat]

In [57]:
# Number of missing values per column (isna is the same check as isnull).
data.isna().sum()

id                                          0
user_id                                     0
age                                         0
Gender                                      0
Date_Registered                             0
Is_current_loyalty_program_member           0
loyalty_points_redeemed                     0
loyalty_tier                           102062
Received_tier_discount_percentage      103374
Received_card_discount_percentage      156872
Received_coupon_discount_percentage         0
product_category                            0
Product_value                               0
transaction_id                              0
order_id                                    0
payment_method                              0
payment_datetime                            0
purchased_datetime                          0
purchase_medium                             0
final_payment                               0
released_date                               0
estimated_delivery_date           

In [58]:
# Keep only the rows whose age is strictly positive.
data = data.loc[data['age'] > 0]

In [59]:
# Replacement values for missing entries: a missing discount percentage is
# filled with 0 and a missing loyalty tier with the sentinel -1.
# Defined once so train and test are guaranteed to be filled identically.
FILL_VALUES = {
    'Received_tier_discount_percentage': 0,
    'Received_card_discount_percentage': 0,
    'Received_coupon_discount_percentage': 0,
    'loyalty_tier': -1,
}

# Rebind instead of using inplace=True: `data` is the result of earlier row
# filters, and in-place edits on such a frame can trigger pandas'
# SettingWithCopyWarning. Rebinding also keeps the cell safe to re-run.
data = data.fillna(FILL_VALUES)
test = test.fillna(FILL_VALUES)

#### Concatenate Train and Test

In [60]:
# Stack train on top of test so both receive identical preprocessing.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; keeping the
# original labels would leave duplicate index values wherever the two tables
# reuse a label, which makes later index-aligned operations fragile.
df = pd.concat([data, test], axis=0, ignore_index=True)

In [61]:
# Discard the tracking-number and order-id columns.
df = df.drop(['tracking_number', 'order_id'], axis=1)

In [62]:
# Cast columns to narrower dtypes with a single astype call.
# The capitalised 'Int*' names are pandas' nullable integer dtypes.
df = df.astype({
    'id': 'Int32',
    'age': 'Int8',
    'loyalty_points_redeemed': 'Int16',
    'Received_coupon_discount_percentage': 'Int16',
    'Product_value': 'Int32',
    'final_payment': 'float32',
    'loyalty_tier': 'float32',
    'Received_tier_discount_percentage': 'float32',
    'Received_card_discount_percentage': 'float32',
})

#### Categorical Encoding

In [63]:
# Ordered label lists; each label is replaced by its position in the list.
# A value that is missing or not listed receives the code -1.
ordinal_levels = {
    'shipping_method': ['standard', 'express'],
    'customer_experience': ['bad', 'neutral', 'good'],
}

for column_name, levels in ordinal_levels.items():
    as_categorical = pd.Categorical(df[column_name], categories=levels, ordered=True)
    df[column_name] = as_categorical.codes

#### One Hot Encoding

In [65]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_and_add(df, column):
    """Replace ``column`` in ``df`` with one indicator column per category.

    A fresh ``OneHotEncoder`` is fitted on the column. The new columns take
    their names from the encoder's ``get_feature_names_out`` and share the
    index of ``df``. A new DataFrame is returned; ``df`` is not modified.
    """
    encoder = OneHotEncoder(sparse_output=False)
    indicator_values = encoder.fit_transform(df[[column]])
    indicator_frame = pd.DataFrame(
        indicator_values,
        columns=encoder.get_feature_names_out([column]),
        index=df.index,
    )
    return pd.concat([df, indicator_frame], axis=1).drop(columns=[column])

# Nominal columns to expand into indicator columns, processed in this order.
columns_to_encode = [
    'Gender',
    'Is_current_loyalty_program_member',
    'purchase_medium',
]

# Each pass returns a new frame in which one more column has been expanded.
for col in columns_to_encode:
    df = one_hot_encode_and_add(df, col)

#### Split the Combined DataFrame Back into Train and Test

In [66]:
# Rows whose customer_experience code is -1 form the test table (the column is
# dropped there); all remaining rows form the training table.
has_label = df['customer_experience'] != -1
data = df[has_label]
test = df[~has_label].drop(columns=['customer_experience'])

In [69]:
# Keep only training rows that satisfy all three upper limits at once:
# product value up to 10000, card and coupon discounts up to 5.
within_limits = (
    (data['Product_value'] <= 10000)
    & (data['Received_card_discount_percentage'] <= 5)
    & (data['Received_coupon_discount_percentage'] <= 5)
)
data = data[within_limits]

#### Feature Engineering

In [70]:
def process_dataframe(df):
    """Return a copy of ``df`` extended with the engineered feature columns.

    The input must contain the discount percentage columns, ``Product_value``,
    ``final_payment``, ``loyalty_points_redeemed``, ``age``, ``user_id``,
    ``transaction_id``, ``product_category`` and the datetime columns
    ``Date_Registered``, ``purchased_datetime``, ``released_date``,
    ``estimated_delivery_date`` and ``received_date`` (already parsed).

    Added feature groups: discount/price, loyalty, purchase time, delivery,
    customer and purchase history, plus an integer code for the product
    category.

    NOTE(review): the median used for ``high_discount_order``, the quintile
    edges used for ``price_tier`` and the per-user purchase statistics are all
    computed from the frame passed in, so calling this separately on train and
    test yields thresholds that may differ between the two — confirm this is
    intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned order table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the engineered ones.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when ``Product_value`` has too few distinct
        values to form five unique quantile edges.
    """
    # Work on a copy so the caller's frame (typically a filtered slice of a
    # larger frame) is never mutated; this also avoids SettingWithCopyWarning.
    df = df.copy()

    # Added to every Product_value denominator so that a zero product value
    # cannot cause a division by zero.
    eps = 1e-9

    # --- Discount & Price Features ---
    discount_cols = ['Received_tier_discount_percentage', 'Received_card_discount_percentage', 'Received_coupon_discount_percentage']
    df['total_discount_percentage'] = df[discount_cols].sum(axis=1)
    df['Total_Discount_Amount'] = df['Product_value'] * (df['total_discount_percentage'] / 100)
    df['discount_amount_ratio'] = df['Total_Discount_Amount'] / (df['Product_value'] + eps)
    df['high_discount_order'] = (df['Total_Discount_Amount'] > df['Total_Discount_Amount'].median()).astype(int)
    # Whatever was paid beyond the discounted product value.
    df['shipping_cost'] = df['final_payment'] - (df['Product_value'] - df['Total_Discount_Amount'])
    df['shipping_cost_ratio'] = df['shipping_cost'] / (df['Product_value'] + eps)
    df['price_tier'] = pd.qcut(df['Product_value'], q=5, labels=range(5))
    df['discount_types_used'] = (df[discount_cols] > 0).sum(axis=1)

    # --- Loyalty Features ---
    df['loyalty_engagement_score'] = df['loyalty_points_redeemed'] / (df['Product_value'] + eps)

    # --- Temporal Features ---
    dt = df['purchased_datetime'].dt
    df['purchase_hour'] = dt.hour
    df['purchase_day_of_week'] = dt.dayofweek
    df['is_weekend_purchase'] = df['purchase_day_of_week'].isin([5, 6]).astype(int)
    # between() is inclusive on both ends, so hours 9 through 17 count.
    df['is_business_hours'] = df['purchase_hour'].between(9, 17).astype(int)
    df['purchase_month'] = dt.month
    df['purchase_quarter'] = dt.quarter

    # --- Delivery Features ---
    df['receive_day_of_week'] = df['received_date'].dt.dayofweek
    df['processing_days'] = (df['released_date'] - df['purchased_datetime']).dt.days
    df['delivery_days'] = (df['received_date'] - df['released_date']).dt.days
    df['total_order_days'] = (df['received_date'] - df['purchased_datetime']).dt.days
    df['delivery_delay'] = (df['received_date'] - df['estimated_delivery_date']).dt.days
    df['is_delayed'] = (df['delivery_delay'] > 0).astype(int)

    # --- Customer Features ---
    df['customer_tenure_days'] = (df['purchased_datetime'] - df['Date_Registered']).dt.days
    # The last bin is open-ended so that ages above 65 land in the oldest
    # group instead of becoming NaN.
    df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 50, float('inf')], labels=range(4))

    # --- Purchase History ---
    # Rank each user's orders by purchase time (ties broken by row order) so
    # the flag marks the chronologically first order, not merely the first row.
    purchase_rank = df.groupby('user_id')['purchased_datetime'].rank(method='first')
    df['is_first_purchase'] = purchase_rank.eq(1).astype(int)
    df['purchase_count'] = df.groupby('user_id')['transaction_id'].transform('count')

    # --- Product Features ---
    # sort=True assigns codes in sorted category order rather than order of
    # first appearance, so two frames containing the same set of categories
    # receive the same codes.
    df['product_category_encoded'] = pd.factorize(df['product_category'], sort=True)[0]

    return df


In [71]:
# Build the engineered features for the training table and the test table.
data, test = process_dataframe(data), process_dataframe(test)

In [72]:
# Remove the identifier and raw category columns from both tables.
helper_columns = ['user_id', 'transaction_id', 'product_category', 'payment_method']
data = data.drop(columns=helper_columns)
test = test.drop(columns=helper_columns)

#### Mutual Information (MI) Scores

In [75]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y):
    """Score every column of ``X`` by its mutual information with ``y``.

    Rows of ``X`` that contain a missing value are excluded, and the matching
    positions are removed from ``y`` as well so both stay the same length.
    Columns with an integer dtype are passed to the estimator as discrete
    features.

    NOTE(review): the estimator used is ``mutual_info_regression``;
    scikit-learn also offers ``mutual_info_classif`` for discrete targets —
    confirm which one is intended for a class-coded target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series or array-like
        Target values, in the same row order and of the same length as ``X``.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column name, highest first.
    """
    # Positional mask of the rows without any missing value. Applying the same
    # mask to y keeps features and target aligned; dropping rows from X alone
    # would make the two lengths disagree whenever X contains a NaN.
    complete_rows = X.notna().all(axis=1).to_numpy()
    X = X[complete_rows]
    y = y[complete_rows] if hasattr(y, 'iloc') else np.asarray(y)[complete_rows]

    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    The scores are sorted ascending before plotting, so the highest score is
    the last (top) bar. Each bar is labelled with the index entry of its score.

    Parameters
    ----------
    scores : pandas.Series
        Scores indexed by feature name.
    """
    # Imported locally because no visible cell imports matplotlib; without
    # this the `plt` calls below fail with NameError on a fresh kernel.
    import matplotlib.pyplot as plt

    scores = scores.sort_values(ascending=True)
    positions = np.arange(len(scores))
    plt.barh(positions, scores)
    plt.yticks(positions, list(scores.index))
    plt.title("Mutual Information Scores")

In [76]:
# Columns excluded from the feature table: the row id, the target itself and
# the raw datetime columns.
non_feature_columns = [
    'id', 'customer_experience',
    'Date_Registered', 'payment_datetime', 'purchased_datetime',
    'released_date', 'estimated_delivery_date', 'received_date',
]
X = data.drop(columns=non_feature_columns)
y = data['customer_experience']

# Prepare Data for modeling

In [77]:
# Columns fed to the models. The order is significant: it becomes the column
# order of the feature matrix.
selected_features = [
    # original numeric / ordinal columns
    'loyalty_points_redeemed',
    'loyalty_tier',
    'Received_tier_discount_percentage',
    'Received_card_discount_percentage',
    'Received_coupon_discount_percentage',
    'Product_value',
    'final_payment',
    'shipping_method',
    # one-hot indicator columns
    'Gender_F',
    'Gender_M',
    'Gender_O',
    'Is_current_loyalty_program_member_NO',
    'Is_current_loyalty_program_member_YES',
    'purchase_medium_in-store',
    'purchase_medium_online',
    # discount and price features
    'total_discount_percentage',
    'Total_Discount_Amount',
    'discount_amount_ratio',
    'high_discount_order',
    'shipping_cost',
    'shipping_cost_ratio',
    'price_tier',
    'discount_types_used',
    # loyalty feature
    'loyalty_engagement_score',
    # purchase-time features
    'purchase_hour',
    'purchase_day_of_week',
    'is_weekend_purchase',
    'is_business_hours',
    'purchase_month',
    'purchase_quarter',
    # delivery features
    'receive_day_of_week',
    'processing_days',
    'delivery_days',
    'total_order_days',
    'delivery_delay',
    'is_delayed',
    # customer and purchase-history features
    'customer_tenure_days',
    'age_group',
    'is_first_purchase',
    'purchase_count',
    # product feature
    'product_category_encoded',
]

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature matrix (an independent copy of the selected columns) and target.
X = data.loc[:, selected_features].copy()
y = data['customer_experience']

# Hold out 20 % of the rows for evaluation, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames carrying the original row and
# column labels.
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

In [None]:
# Hyper-parameters are kept in plain dicts and unpacked into the constructors.
lgbm_params = dict(
    objective='multiclass',
    num_class=3,
    learning_rate=0.1,
    n_estimators=90,
    num_leaves=64,
    feature_fraction=0.9,
    # NOTE(review): LightGBM's parameter docs state that bagging_fraction only
    # takes effect when bagging_freq is non-zero — confirm whether bagging was
    # meant to be active here.
    bagging_fraction=0.9,
    lambda_l1=0.1,
    lambda_l2=0.1,
    random_state=42,
)

rf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)

# Candidate classifiers, keyed by the name used when reporting results.
models = {
    'LightGBM': LGBMClassifier(**lgbm_params),
    'RandomForest': RandomForestClassifier(**rf_params),
}


In [None]:

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name, output_file):
    """Fit ``model``, score it on the hold-out split and append a report to a file.

    The report contains a header with ``model_name``, the accuracy, the
    weighted F1 score, the classification report and the confusion matrix.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and labels
    X_test, y_test : hold-out features and labels
    model_name : str
        Label written in the report header.
    output_file : str
        Path of the text file the report is appended to (UTF-8).

    Returns
    -------
    dict
        ``{'accuracy': float, 'f1_weighted': float}`` so callers can compare
        models without parsing the report file. Callers may ignore it.

    Raises
    ------
    Any exception raised by ``model.fit`` / ``model.predict`` or by the metric
    functions propagates; in that case nothing is written to ``output_file``.
    """
    # Train and score before touching the file: the handle is then open only
    # for the write itself, and a failure leaves no dangling header behind.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    report_parts = [
        f"\n========== {model_name} ==========\n",
        f"Accuracy: {acc:.4f}\n",
        f"F1 Score: {f1:.4f}\n\n",
        "Classification Report:\n",
        classification_report(y_test, y_pred),
        "\nConfusion Matrix:\n",
        str(confusion_matrix(y_test, y_pred)),
        "\n",
    ]

    with open(output_file, 'a', encoding='utf-8') as f:
        f.writelines(report_parts)

    return {'accuracy': acc, 'f1_weighted': f1}


In [None]:
# Start from an empty results file, then append one report per model.
output_file = "model_results.txt"
open(output_file, 'w').close()

for model_name in models:
    model = models[model_name]
    evaluate_model(model, X_train, X_test, y_train, y_test, model_name, output_file)
