In [2]:
import pandas as pd
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [1]:
!pip install pytorch-tabnet


Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting tqdm>=4.36 (from pytorch-tabnet)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, pytorch-tabnet
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pytorch-tabnet]m [pytorch-tabnet]
[1A[2KSuccessfully installed pytorch-tabnet-4.1.0 tqdm-4.67.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
# Load the RWS-oversampled train / test splits.
train_df = pd.read_csv(".././Training-Testing Dataset/Training Data (RWS Oversampling).csv")
test_df = pd.read_csv(".././Training-Testing Dataset/Testing Data (RWS Oversampling).csv")

target_col = "target"

# Resolve the feature columns once, by name, from the training frame.
# Every later lookup goes through this list, so a positional index into the
# feature matrix always refers to the same column regardless of where the
# target column sits in the CSV.
feature_cols = [col for col in train_df.columns if col != target_col]

# Selecting by name (instead of "drop the target and hope the order matches")
# guarantees both matrices share the training column order; a missing column
# in the test file raises a KeyError instead of silently misaligning features.
X_train = train_df[feature_cols].values
X_test  = test_df[feature_cols].values


for i, col in enumerate(feature_cols):
    print(i, "-", col)


# Positions, within the feature matrix, of the string-typed columns.
cat_idxs = [i for i, col in enumerate(feature_cols)
            if train_df[col].dtype == 'object']


# Integer-encode each categorical column. Train and test values are factorized
# together so both splits share one code assignment: codes follow first
# appearance (train rows first) and, per pandas' default, missing values get -1.
for idx in cat_idxs:
    col = feature_cols[idx]
    all_vals = pd.concat([train_df[col], test_df[col]])
    codes, _ = pd.factorize(all_vals)
    X_train[:, idx] = codes[:len(train_df)]
    X_test[:, idx]  = codes[len(train_df):]


# All columns are numeric at this point; cast to float32 for the model.
X_train = X_train.astype(np.float32)
X_test  = X_test.astype(np.float32)


# Encode the labels with the same joint-factorize scheme as the features.
all_targets = pd.concat([train_df[target_col], test_df[target_col]])
y_codes, _ = pd.factorize(all_targets)
y_train = y_codes[:len(train_df)].astype(np.int64)
y_test  = y_codes[len(train_df):].astype(np.int64)


0 - Age_at_Marriage
1 - Gender
2 - Education_Level
3 - Caste_Match
4 - Religion
5 - Urban_Rural
6 - Income_Level
7 - Spouse_Working
8 - Inter-Caste
9 - Inter-Religion


In [4]:
# Build the TabNet classifier. Hyperparameter meanings are described in the
# pytorch-tabnet documentation; the values here are the ones used for the
# recorded run.
model = TabNetClassifier(
    n_d=8,
    n_a=8,
    n_steps=3,
    gamma=1.3,
    n_independent=2,
    n_shared=2,
    optimizer_params={"lr": 1e-2},
    mask_type="entmax",
)

# Train on the RWS-oversampled split, reporting accuracy on (X_test, y_test)
# after every epoch under the name "test".
# NOTE(review): the set passed as eval_set is also what `patience` monitors,
# so X_test effectively acts as a validation set here - confirm that it does
# not overlap with the data used for the final reported evaluation.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_name=["test"],
    eval_metric=["accuracy"],
    max_epochs=100,
    patience=20,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)



epoch 0  | loss: 1.79491 | test_accuracy: 0.17711 |  0:00:04s
epoch 1  | loss: 1.66815 | test_accuracy: 0.25594 |  0:00:05s
epoch 2  | loss: 1.58102 | test_accuracy: 0.39059 |  0:00:07s
epoch 3  | loss: 1.50451 | test_accuracy: 0.41216 |  0:00:09s
epoch 4  | loss: 1.43516 | test_accuracy: 0.44816 |  0:00:11s
epoch 5  | loss: 1.39702 | test_accuracy: 0.49194 |  0:00:13s
epoch 6  | loss: 1.36503 | test_accuracy: 0.49714 |  0:00:15s
epoch 7  | loss: 1.34167 | test_accuracy: 0.53329 |  0:00:17s
epoch 8  | loss: 1.31643 | test_accuracy: 0.53308 |  0:00:18s
epoch 9  | loss: 1.28557 | test_accuracy: 0.56234 |  0:00:20s
epoch 10 | loss: 1.27413 | test_accuracy: 0.5352  |  0:00:22s
epoch 11 | loss: 1.26524 | test_accuracy: 0.55444 |  0:00:24s
epoch 12 | loss: 1.24458 | test_accuracy: 0.56642 |  0:00:27s
epoch 13 | loss: 1.2259  | test_accuracy: 0.56929 |  0:00:28s
epoch 14 | loss: 1.21754 | test_accuracy: 0.58217 |  0:00:30s
epoch 15 | loss: 1.2222  | test_accuracy: 0.59388 |  0:00:32s
epoch 16



In [5]:
# Load the CC-undersampled test split and encode it with the same integer
# codes that the training data received.
test_df_cc = pd.read_csv(".././Training-Testing Dataset/Testing Data (CC undersampling).csv")

# Feature columns are taken by name, in training order, so that position `idx`
# in the matrix below is the same column that `cat_idxs` refers to (cat_idxs
# holds positions within the feature columns, i.e. with the target excluded).
feature_cols = [col for col in train_df.columns if col != target_col]
x_test_cc = test_df_cc[feature_cols].values

for idx in cat_idxs:
    col_name = feature_cols[idx]
    # pd.factorize numbers values by first appearance and leaves NaN out of
    # the vocabulary, which is how the training matrix was encoded. Using
    # .unique() here would count NaN as a category and shift later codes.
    _, train_unique_vals = pd.factorize(train_df[col_name])
    mapping = {val: code for code, val in enumerate(train_unique_vals)}
    # Values never seen in training (and missing values) are encoded as -1.
    x_test_cc[:, idx] = [mapping.get(v, -1) for v in x_test_cc[:, idx]]

x_test_cc = x_test_cc.astype(np.float32)

# Labels: same first-appearance vocabulary as the training labels.
train_target_unique = np.asarray(pd.factorize(train_df[target_col])[1])
target_mapping = {val: code for code, val in enumerate(train_target_unique)}
# Labels absent from the training data map to -1.
y_test_cc = np.array(
    [target_mapping.get(v, -1) for v in test_df_cc[target_col].values],
    dtype=np.int64,
)

print("x_test_cc shape:", x_test_cc.shape)
print("y_test_cc shape:", y_test_cc.shape)


x_test_cc shape: (336, 10)
y_test_cc shape: (336,)


In [6]:
# Predict the encoded CC test set with the trained TabNet model.
y_pred_cc = model.predict(x_test_cc)

acc = accuracy_score(y_test_cc, y_pred_cc)
print("CC Test Accuracy:", acc)

# Per-class precision / recall / F1. Class labels shown are the integer codes
# produced by the label encoding, not the original label values.
# zero_division=0 makes the score for a class that is never predicted an
# explicit 0.0 instead of emitting an UndefinedMetricWarning per average.
print(classification_report(y_test_cc, y_pred_cc, zero_division=0))

CC Test Accuracy: 0.5327380952380952
              precision    recall  f1-score   support

           0       0.26      0.62      0.37        56
           1       0.70      0.46      0.56        56
           2       0.31      0.36      0.33        56
           3       0.00      0.00      0.00        56
           4       0.97      1.00      0.98        56
           5       0.98      0.75      0.85        56

    accuracy                           0.53       336
   macro avg       0.54      0.53      0.52       336
weighted avg       0.54      0.53      0.52       336



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
