In [1]:
import pandas as pd
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [4]:
# Load the training split and the split used as eval set during fitting.
train_df = pd.read_csv(".././Training-Testing Dataset/Training Data (RWS Oversampling).csv")
test_df = pd.read_csv(".././Training-Testing Dataset/Testing Data CC undersampling.csv")

target_col = "target"

# Feature-only frames. Every column position used below (including cat_idxs)
# refers to these frames, i.e. to the columns of X_train / X_test, so the
# position of the target column inside the CSV no longer matters.
train_features = train_df.drop(columns=[target_col])
feature_cols = list(train_features.columns)
# Select the test columns by name so that a different column order in the
# test CSV cannot silently misalign features (a missing column raises KeyError).
test_features = test_df[feature_cols]

# Show the feature index -> column name layout of the matrices.
for i, col in enumerate(feature_cols):
    print(i, "-", col)

# Positions (within the feature matrix) of the object-dtype columns.
cat_idxs = [i for i, col in enumerate(feature_cols)
            if train_features[col].dtype == 'object']

# Integer-encode the object columns. The vocabulary is learned from the
# training data only; a test value that never occurs in training (or is
# missing) is encoded as -1 instead of extending the vocabulary.
train_encoded = train_features.copy()
test_encoded = test_features.copy()
for idx in cat_idxs:
    col = feature_cols[idx]
    train_codes, categories = pd.factorize(train_features[col])
    train_encoded[col] = train_codes
    test_encoded[col] = pd.Categorical(test_features[col], categories=categories).codes

X_train = train_encoded.to_numpy(dtype=np.float32)
X_test = test_encoded.to_numpy(dtype=np.float32)

# Encode the target with the training label order (first appearance in train).
train_target_codes, target_classes = pd.factorize(train_df[target_col])
y_train = train_target_codes.astype(np.int64)
y_test = pd.Categorical(test_df[target_col], categories=target_classes).codes.astype(np.int64)
if (y_test < 0).any():
    # A -1 here means a label the model can never have been trained on.
    raise ValueError("test_df contains missing target values or labels absent from train_df")


0 - Age_at_Marriage
1 - Gender
2 - Education_Level
3 - Caste_Match
4 - Religion
5 - Urban_Rural
6 - Income_Level
7 - Spouse_Working
8 - Inter-Caste
9 - Inter-Religion


In [5]:
# Architecture and optimiser settings for the TabNet classifier.
# No cat_idxs / cat_dims are passed, so the integer category codes in X_train
# are handed to the network as ordinary numeric features.
tabnet_params = dict(
    n_d=8,
    n_a=8,
    n_steps=3,
    gamma=1.3,
    n_independent=2,
    n_shared=2,
    optimizer_params=dict(lr=1e-2),
    mask_type='entmax',
)
model = TabNetClassifier(**tabnet_params)

# Training schedule. The only eval set is (X_test, y_test), reported under the
# name "test" with accuracy as its metric.
# NOTE(review): with patience=20 this same split also decides when training
# stops - confirm it is meant as a validation split rather than the final
# hold-out set.
fit_params = dict(
    eval_set=[(X_test, y_test)],
    eval_name=["test"],
    eval_metric=["accuracy"],
    max_epochs=100,
    patience=20,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)
model.fit(X_train, y_train, **fit_params)



epoch 0  | loss: 1.81364 | test_accuracy: 0.14517 |  0:00:04s
epoch 1  | loss: 1.75313 | test_accuracy: 0.17286 |  0:00:08s
epoch 2  | loss: 1.71961 | test_accuracy: 0.19541 |  0:00:11s
epoch 3  | loss: 1.67877 | test_accuracy: 0.2322  |  0:00:15s
epoch 4  | loss: 1.65368 | test_accuracy: 0.23695 |  0:00:18s
epoch 5  | loss: 1.62617 | test_accuracy: 0.24763 |  0:00:21s
epoch 6  | loss: 1.60021 | test_accuracy: 0.26068 |  0:00:24s
epoch 7  | loss: 1.57416 | test_accuracy: 0.26266 |  0:00:27s
epoch 8  | loss: 1.56491 | test_accuracy: 0.27255 |  0:00:30s
epoch 9  | loss: 1.52951 | test_accuracy: 0.28441 |  0:00:32s
epoch 10 | loss: 1.52684 | test_accuracy: 0.28956 |  0:00:35s
epoch 11 | loss: 1.49985 | test_accuracy: 0.3034  |  0:00:38s
epoch 12 | loss: 1.49287 | test_accuracy: 0.30934 |  0:00:46s
epoch 13 | loss: 1.48121 | test_accuracy: 0.31725 |  0:00:49s
epoch 14 | loss: 1.46613 | test_accuracy: 0.31646 |  0:00:52s
epoch 15 | loss: 1.46219 | test_accuracy: 0.31131 |  0:00:55s
epoch 16



In [6]:
# Persist the fitted model; the call's return value is shown as the cell output.
# NOTE(review): the reload cell opens "tabnet_matchmaking.zip", so the library
# presumably appends ".zip" to this base name - confirm against a fresh run.
model_basename = "tabnet_matchmaking"
model.save_model(model_basename)

Successfully saved model at tabnet_matchmaking.zip.zip


'tabnet_matchmaking.zip.zip'

In [19]:
# Rebind `model` to a fresh classifier and restore its state from the archive on disk.
model_archive = "tabnet_matchmaking.zip"
model = TabNetClassifier()
model.load_model(model_archive)



In [23]:
test_df_cc = pd.read_csv(".././Training-Testing Dataset/Testing data Undersampled(All).csv")

# Feature columns in training order. cat_idxs holds positions within this
# feature-only layout, so it is resolved against the feature frame rather than
# against train_df (which still contains the target column).
train_features_ref = train_df.drop(columns=[target_col])
feature_names = list(train_features_ref.columns)
# Select by name so a different column order in this CSV cannot misalign
# features (a missing column raises KeyError).
features_cc = test_df_cc[feature_names].copy()

# Re-create the training-time integer codes. pd.factorize numbers categories by
# first appearance and skips missing values, which is how the training matrix
# was encoded; Series.unique() would keep NaN as a category and shift every
# later code by one. Values never seen in training (and NaN) become -1.
for idx in cat_idxs:
    name = feature_names[idx]
    train_categories = pd.factorize(train_features_ref[name])[1]
    features_cc[name] = pd.Categorical(features_cc[name], categories=train_categories).codes

x_test_cc = features_cc.to_numpy(dtype=np.float32)

# Encode the labels with the training label order.
train_classes = pd.factorize(train_df[target_col])[1]
y_test_cc = pd.Categorical(test_df_cc[target_col], categories=train_classes).codes.astype(np.int64)
if (y_test_cc < 0).any():
    # Fail loudly instead of scoring against a bogus "-1" class.
    raise ValueError("test_df_cc contains missing target values or labels absent from train_df")

print("x_test_cc shape:", x_test_cc.shape)
print("y_test_cc shape:", y_test_cc.shape)


x_test_cc shape: (1128, 10)
y_test_cc shape: (1128,)


In [24]:
# Predict using your trained TabNet model
y_pred_cc = model.predict(x_test_cc)

acc = accuracy_score(y_test_cc, y_pred_cc)
print("CC Test Accuracy:", acc)

# Detailed report
print(classification_report(y_test_cc, y_pred_cc))

CC Test Accuracy: 0.5576241134751773
              precision    recall  f1-score   support

           0       0.53      0.91      0.67       188
           1       0.33      0.27      0.30       188
           2       0.27      0.09      0.14       188
           3       0.43      0.16      0.23       188
           4       0.74      0.99      0.85       188
           5       0.65      0.93      0.77       188

    accuracy                           0.56      1128
   macro avg       0.49      0.56      0.49      1128
weighted avg       0.49      0.56      0.49      1128

