<a href="https://colab.research.google.com/github/kishore-n-george/tabular-llm-comparison/blob/main/online_shoppersset_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 %% [markdown]
# Dry Bean online_shoppersset - online_shoppers Cleansing & Preparation & XGB Classification
# **Author:** Kishore George
# **Date:** DD-MM-YYYY  
# **online_shoppersset Source:** Koklu, M. and Ozkan, I.A. (2020)

In [1]:
#python3 -m venv tabular
#!pip install jupyter numpy pandas scikit-learn xgboost torch transformers shap scikit-learn seaborn matplotlib ucimlrepo xgboost tabpfn rtdl torch lime folium eli5 datasets accelerate peft
# torchvision torchaudio
# ft_transformer

#linear algebra
import numpy as np 
import math
import time

#online_shoppers tools
from copy import copy
import pandas as pd
from scipy.stats import boxcox
from scipy.special import boxcox1p
from scipy.special import inv_boxcox
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from IPython.display import Image

#plots
import seaborn as sns
import matplotlib.pyplot as plt

#models

import xgboost as xgb
from tabpfn import TabPFNClassifier

#model interpretation modules
import lime
import lime.lime_tabular
import shap
# import eli5

#metrics
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score,classification_report, confusion_matrix, precision_score, recall_score, f1_score

#awesome interactive map library
import folium
from folium.plugins import HeatMap
from folium.plugins import FastMarkerCluster

#statistics
from scipy import stats

#ucimlrepo
from ucimlrepo import fetch_ucirepo, dotdict

%matplotlib inline
sns.set_style("whitegrid")

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [None]:

# %% [markdown]
# ## 2. Load online_shoppersset
online_original = fetch_ucirepo(id=468)
online_shoppers = online_original.data.original
print("Dataset shape:", online_shoppers.shape)
online_shoppers.head()



In [None]:

# online_shoppers preprocessing
# Encoding categorical features
label_encoder = LabelEncoder()
online_shoppers['Month'] = label_encoder.fit_transform(online_shoppers['Month'])
online_shoppers['VisitorType'] = label_encoder.fit_transform(online_shoppers['VisitorType'])
online_shoppers['Weekend'] = online_shoppers['Weekend'].astype(int)

# Define features and target variable
X_shoppers = online_shoppers.drop(columns=['Revenue'])  # Features
y_shoppers = online_shoppers['Revenue'].astype(int)  # Target

In [None]:
# Splitting dataset
X_train_shoppers, X_test_shoppers, y_train_shoppers, y_test_shoppers = train_test_split(X_shoppers, y_shoppers, test_size=0.2, random_state=42, stratify=y_shoppers)

# Scaling numerical features
scaler = StandardScaler()
X_train_shoppers = scaler.fit_transform(X_train_shoppers)
X_test_shoppers = scaler.transform(X_test_shoppers)

# Model training and evaluation
xgb_shoppers = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
start_time = time.time()
xgb_shoppers.fit(X_train_shoppers, y_train_shoppers)
xgb_shoppers_train_time = time.time() - start_time

# XGBoost Predictions
start_time = time.time()
xgb_y_pred_shoppers = xgb_shoppers.predict(X_test_shoppers)
xgb_shoppers_inference_time = time.time() - start_time


print(f"XGBoost Training time: {xgb_shoppers_train_time:.4f}")
print(f"XGBoost Inference time: {xgb_shoppers_inference_time:.4f}")

acc = accuracy_score(y_test_shoppers, xgb_y_pred_shoppers)
print(f"XGBoost Accuracy: {acc:.4f}")
print(classification_report(y_test_shoppers, xgb_y_pred_shoppers))

# Confusion Matrix
plt.figure(figsize=(5,4))
sns.heatmap(confusion_matrix(y_test_shoppers, xgb_y_pred_shoppers), annot=True, fmt='d', cmap='Blues')
plt.title("XGBoost - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

TabPFN Implementation

In [None]:
# Train TabPFN Classifier
pfn_shoppers_model = TabPFNClassifier(device='cuda')  # Use 'cuda' if GPU is available
start_time = time.time()
pfn_shoppers_model.fit(X_train_shoppers, y_train_shoppers, overwrite_warning=True)
pfn_train_time = time.time() - start_time

# Predictions
start_time = time.time()
y_pred_shoopers_pfn = pfn_shoppers_model.predict(X_test_shoppers)
pfn_inference_time = time.time() - start_time

In [None]:
def compute_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    # auc_roc = roc_auc_score(y_true, y_pred,multi_class='ovo')
    print(f"{model_name} Performance:\n Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f} \n")
    #print(f"{model_name} Performance:\n Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}\n")


compute_metrics(y_test_shoppers, xgb_y_pred_shoppers, "XGBoost")
compute_metrics(y_test_shoppers, y_pred_shoopers_pfn, "TabPFN")


# XGBoost Performance:
#  Accuracy: 0.9043, Precision: 0.8989, Recall: 0.9043, F1 Score: 0.9004 

# TabPFN Performance:
#  Accuracy: 0.8921, Precision: 0.8857, Recall: 0.8921, F1 Score: 0.8879 

Table LLM

In [None]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("RUCKBReasoning/TableLLM-13b")
# tablellm_model = AutoModelForCausalLM.from_pretrained("RUCKBReasoning/TableLLM-13b")

# Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# import torch


# tokenizer = AutoTokenizer.from_pretrained("RUCKBReasoning/TableLLM-7b")
# # Enable 4-bit quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,  # Use 4-bit quantization
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_use_double_quant=True,
# )

# model = AutoModelForCausalLM.from_pretrained("RUCKBReasoning/TableLLM-7b",quantization_config=bnb_config, device_map="auto")

In [2]:
## resetting to base data, removing onehot encoding
online_original = fetch_ucirepo(id=468)
online_shoppers = online_original.data.original
# Define features and target variable
#X_shoppers = online_shoppers.drop(columns=['Revenue'])  # Features
#y_shoppers = online_shoppers['Revenue'].astype(int)  # Target
#X_train_shoppers, X_test_shoppers, y_train_shoppers, y_test_shoppers = train_test_split(X_shoppers, y_shoppers, test_size=0.2, random_state=42, stratify=y_shoppers)

from datasets import Dataset

def convert_to_table_prompt(df_row):
    """Format the tabular row into a natural language prompt for TableLLM"""
    prompt = f"Given the following online shopper session details, predict whether the user will make a purchase (1) or not (0):\n\n"
    prompt += "\n".join([f"{col}: {val}" for col, val in df_row.items()])
    prompt += "\n\nPrediction:"
    return prompt

# Apply transformation
online_shoppers["Revenue"] = online_shoppers["Revenue"].astype(int)
online_shoppers["Revenue"] = online_shoppers["Revenue"].astype(str)
online_shoppers["prompt"] = online_shoppers.drop(columns=["Revenue"]).apply(convert_to_table_prompt, axis=1)

# Convert dataset to Hugging Face `Dataset` format
hf_dataset = Dataset.from_pandas(online_shoppers[["prompt", "Revenue"]])
hf_dataset = hf_dataset.rename_columns({"Revenue": "labels"})

print(hf_dataset)


Dataset({
    features: ['prompt', 'labels'],
    num_rows: 12330
})


In [3]:
print(hf_dataset[65])
#print(online_shoppers.loc[online_shoppers['Revenue'] == True])


{'prompt': 'Given the following online shopper session details, predict whether the user will make a purchase (1) or not (0):\n\nAdministrative: 3\nAdministrative_Duration: 87.83333333\nInformational: 0\nInformational_Duration: 0.0\nProductRelated: 27\nProductRelated_Duration: 798.3333333\nBounceRates: 0.0\nExitRates: 0.012643678\nPageValues: 22.9160357\nSpecialDay: 0.8\nMonth: Feb\nOperatingSystems: 2\nBrowser: 2\nRegion: 3\nTrafficType: 1\nVisitorType: Returning_Visitor\nWeekend: False\n\nPrediction:', 'labels': '1'}


# Fine Tune Table LLM

In [4]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import torch

model_name = "RUCKBReasoning/TableLLM-7b"

# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto", offload_folder="offload",)

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, llm_int8_enable_fp32_cpu_offload=True )

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    offload_folder="offload"
)
# 

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
from peft import LoraConfig, get_peft_model
!export CUDA_LAUNCH_BLOCKING=1

# Define LoRA Configuration
lora_config = LoraConfig(
    r=8,  # Rank of the LoRA update matrices
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Apply LoRA only to attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"  # AutoModelForCausalLM is a causal language model
)

# Attach LoRA adapters
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,740,992 || trainable%: 0.0622


In [6]:
# Tokenize dataset
def preprocess_function(examples):
    model_inputs = tokenizer(examples["prompt"], padding="max_length", truncation=True)
    model_inputs["labels"] = tokenizer(examples["labels"], padding="max_length", truncation=True)["input_ids"]
    return model_inputs

tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)

# Data collator for training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)



Map:   0%|          | 0/12330 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
for name, param in peft_model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)  # Should print only LoRA parameters


In [7]:
from transformers import TrainingArguments, Trainer
# !export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# torch.cuda.empty_cache()

# Assuming 'peft_model' is your model
# peft_model.to_empty(device='cpu')  # If the model is in a meta state, use this

# Move the model to the desired device (cuda, cpu)
# peft_model.to('cpu')  # Move the model to the target device

# tokenized_dataset.to('cpu')

In [8]:
training_args = TrainingArguments(
    output_dir="./fine_tuned_tablellm",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,  # Use a slightly higher learning rate for LoRA
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False,
    gradient_accumulation_steps=4
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Make sure dataset is properly tokenized
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [13]:
trainer.train()

/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [47,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [47,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [47,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [47,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [47,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [47,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422:

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
#Saving fine-tuned model
trainer.save_model("./fine_tuned_tablellm")
tokenizer.save_pretrained("./fine_tuned_tablellm")


In [None]:
import torch
# Convert tabular data to text format for Table LLM
def convert_to_table_prompt(df_row):
    prompt = f"Given the following online shopper session details, predict whether the user will make a purchase (1) or not (0):\n\n"
    prompt += "\n".join([f"{col}: {val}" for col, val in df_row.items()])
    prompt += "\n\nPrediction:"
    return prompt


features = online_shoppers.drop(columns=["Revenue"])

X_test_shoppers = pd.DataFrame(X_test_shoppers, columns=features.columns) 
X_test_text = X_test_shoppers.apply(convert_to_table_prompt, axis=1).tolist()

print(X_test_text)
X_test_text_sampled = X_test_text[:10]
y_test_sampled = y_test_shoppers[:10]
print(X_test_text_sampled)
print(y_test_sampled)

In [None]:
# Tokenize and generate predictions
import re
y_pred_tablellm = []
for text in X_test_text_sampled:
# text = "Given the following online shopper session details, predict whether the user will make a purchase (1) or not (0).:\n\nAdministrative: 1\nAdministrative_Duration: 4.0\nInformational: 0\nInformational_Duration: 0.0\nProductRelated: 13\nProductRelated_Duration: 161.1666667\nBounceRates: 0.024615385\nExitRates: 0.061538462\nPageValues: 0.0\nSpecialDay: 0.6\nMonth: May\nOperatingSystems: 2\nBrowser: 5\nRegion: 9\nTrafficType: 5\nVisitorType: Returning_Visitor\nWeekend: False\n\nPrediction:"
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)
    y_preds = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(y_preds)
    # Regular expression to match the 'Prediction' key and extract its value
    match = re.search(r'Prediction:\s*(\d+)', y_preds)

    if match:
        prediction_value = match.group(1)
        print(prediction_value)
        y_pred_tablellm.append(prediction_value)
    else:
        print("Prediction key not found")
        y_pred_tablellm.append(-1)


# Evaluate Performance
accuracy = accuracy_score(y_test_sampled, y_pred_tablellm)
# print(f"TableLLM 13B Accuracy: {accuracy:.4f}")

In [None]:
print(accuracy)
print(y_pred_tablellm)
print(y_test_sampled)