In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from arc.Multiclass_Architecture import Model
import os
from data.Data_Interface import DataInterface
from model.Model_Interface_MC import ModelInterface
from cf.GradCFA_MC import GradCFA
from eval.Evaluation import Evaluate
pd.set_option('display.max_columns', None)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [75]:
df = pd.read_csv('datasets/obesity.csv')

In [76]:
df.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,21.0,Female,1.62,64.0,no,no,2.0,3.0,no,no,2.0,yes,0.0,1.0,Sometimes,Public_Transportation,Normal_Weight
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,2.0,yes,2.0,1.0,Sometimes,Public_Transportation,Normal_Weight
3,27.0,Male,1.8,87.0,Frequently,no,3.0,3.0,no,no,2.0,no,2.0,0.0,Sometimes,Walking,Overweight_Level_I
4,22.0,Male,1.78,89.8,Sometimes,no,2.0,1.0,no,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [77]:
df = df.rename(columns={'NObeyesdad': 'target', 'family_history_with_overweight': 'FHWO'})

In [78]:
df.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,FHWO,FAF,TUE,CAEC,MTRANS,target
0,21.0,Female,1.62,64.0,no,no,2.0,3.0,no,no,2.0,yes,0.0,1.0,Sometimes,Public_Transportation,Normal_Weight
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,2.0,yes,2.0,1.0,Sometimes,Public_Transportation,Normal_Weight
3,27.0,Male,1.8,87.0,Frequently,no,3.0,3.0,no,no,2.0,no,2.0,0.0,Sometimes,Walking,Overweight_Level_I
4,22.0,Male,1.78,89.8,Sometimes,no,2.0,1.0,no,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [79]:
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df['CALC'] = le.fit_transform(df['CALC'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df['FAVC'] = le.fit_transform(df['FAVC'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df['SCC'] = le.fit_transform(df['SCC'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df['SMOKE'] = le.fit_transform(df['SMOKE'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df['FHWO'] = le.fit_transform(df['FHWO'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df['CAEC'] = le.fit_transform(df['CAEC'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df['MTRANS'] = le.fit_transform(df['MTRANS'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df['target'] = le.fit_transform(df['target'])
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)

{'Female': 0, 'Male': 1}
{'Always': 0, 'Frequently': 1, 'Sometimes': 2, 'no': 3}
{'no': 0, 'yes': 1}
{'no': 0, 'yes': 1}
{'no': 0, 'yes': 1}
{'no': 0, 'yes': 1}
{'Always': 0, 'Frequently': 1, 'Sometimes': 2, 'no': 3}
{'Automobile': 0, 'Bike': 1, 'Motorbike': 2, 'Public_Transportation': 3, 'Walking': 4}
{'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2, 'Obesity_Type_II': 3, 'Obesity_Type_III': 4, 'Overweight_Level_I': 5, 'Overweight_Level_II': 6}


In [80]:
df.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,FHWO,FAF,TUE,CAEC,MTRANS,target
0,21.0,0,1.62,64.0,3,0,2.0,3.0,0,0,2.0,1,0.0,1.0,2,3,1
1,21.0,0,1.52,56.0,2,0,3.0,3.0,1,1,3.0,1,3.0,0.0,2,3,1
2,23.0,1,1.8,77.0,1,0,2.0,3.0,0,0,2.0,1,2.0,1.0,2,3,1
3,27.0,1,1.8,87.0,1,0,3.0,3.0,0,0,2.0,0,2.0,0.0,2,4,5
4,22.0,1,1.78,89.8,2,0,2.0,1.0,0,0,2.0,0,0.0,0.0,2,3,6


In [71]:
print('maximum values in each column', df.max())
print('minimum values in each column', df.min())

maximum values in each column Age        61.00
Gender      1.00
Height      1.98
Weight    173.00
CALC        3.00
FAVC        1.00
FCVC        3.00
NCP         4.00
SCC         1.00
SMOKE       1.00
CH2O        3.00
FHWO        1.00
FAF         3.00
TUE         2.00
CAEC        3.00
MTRANS      4.00
target      6.00
dtype: float64
minimum values in each column Age       14.00
Gender     0.00
Height     1.45
Weight    39.00
CALC       0.00
FAVC       0.00
FCVC       1.00
NCP        1.00
SCC        0.00
SMOKE      0.00
CH2O       1.00
FHWO       0.00
FAF        0.00
TUE        0.00
CAEC       0.00
MTRANS     0.00
target     0.00
dtype: float64


In [9]:
X = df.drop(columns=['target'])
y = df['target']

In [10]:
X = pd.get_dummies(X, columns=['Gender', 'CALC', 'FAVC', 'SCC', 'SMOKE', 'FHWO', 'CAEC', 'MTRANS'])

In [11]:
X.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_0,Gender_1,CALC_0,CALC_1,CALC_2,CALC_3,FAVC_0,FAVC_1,SCC_0,SCC_1,SMOKE_0,SMOKE_1,FHWO_0,FHWO_1,CAEC_0,CAEC_1,CAEC_2,CAEC_3,MTRANS_0,MTRANS_1,MTRANS_2,MTRANS_3,MTRANS_4
0,21.0,1.62,64.0,2.0,3.0,2.0,0.0,1.0,True,False,False,False,False,True,True,False,True,False,True,False,False,True,False,False,True,False,False,False,False,True,False
1,21.0,1.52,56.0,3.0,3.0,3.0,3.0,0.0,True,False,False,False,True,False,True,False,False,True,False,True,False,True,False,False,True,False,False,False,False,True,False
2,23.0,1.8,77.0,2.0,3.0,2.0,2.0,1.0,False,True,False,True,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,False,False,True,False
3,27.0,1.8,87.0,3.0,3.0,2.0,2.0,0.0,False,True,False,True,False,False,True,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,True
4,22.0,1.78,89.8,2.0,1.0,2.0,0.0,0.0,False,True,False,False,True,False,True,False,True,False,True,False,True,False,False,False,True,False,False,False,False,True,False


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_scaled = scaler.transform(X)

In [14]:
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [15]:
input_size = X_train.shape[1]
output_size = len(np.unique(y_train))
model = Model(input_size, output_size)

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [17]:
epochs = 200
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')

Epoch [10/200], Loss: 1.8939002752304077
Epoch [20/200], Loss: 1.810104489326477
Epoch [30/200], Loss: 1.7037278413772583
Epoch [40/200], Loss: 1.567833662033081
Epoch [50/200], Loss: 1.4002659320831299
Epoch [60/200], Loss: 1.2202296257019043
Epoch [70/200], Loss: 1.064832091331482
Epoch [80/200], Loss: 0.944577157497406
Epoch [90/200], Loss: 0.8510631322860718
Epoch [100/200], Loss: 0.7753252387046814
Epoch [110/200], Loss: 0.7093310356140137
Epoch [120/200], Loss: 0.6496623754501343
Epoch [130/200], Loss: 0.5944066643714905
Epoch [140/200], Loss: 0.542854368686676
Epoch [150/200], Loss: 0.4952511787414551
Epoch [160/200], Loss: 0.4512709081172943
Epoch [170/200], Loss: 0.4112716019153595
Epoch [180/200], Loss: 0.3748059868812561
Epoch [190/200], Loss: 0.34142348170280457
Epoch [200/200], Loss: 0.310802698135376


In [18]:
with torch.no_grad():
    model.eval()
    outputs = model(X_test_tensor)
    predicted = torch.argmax(outputs, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / y_test.shape[0]
    print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.8368794326241135


In [19]:
model_path = 'pytorch_model.pth'
torch.save(model.state_dict(), model_path)

In [81]:
d = DataInterface(dataframe=df, target='target', continuous_features=['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'])
m = ModelInterface(model_path='../DiPACE/pytorch_model.pth', input_size = input_size, output_size=output_size)

In [144]:
df = df[df['target']==2]
query_index = np.random.choice(len(df))
query_instance = df.iloc[132].drop('target').to_list()
print(query_index)

43


In [145]:
exp = GradCFA(d, m)
cf = exp.generate_counterfactuals(query_instance, total_CFs=5, high_loss_threshold=1.2, desired_value=1, problem_type='multiclass', immutable_features=['NCP'])

Desired class: 1
Original class 2
All CFs are classified as the desired class.
Final Loss: tensor(2.0913, grad_fn=<SubBackward0>)
Final Pred Loss: tensor(0.5838, grad_fn=<MeanBackward0>)
Final Proximity Loss: tensor(0.1033, grad_fn=<MeanBackward0>)
Final Diversity Loss: tensor(0.4963, grad_fn=<SubBackward0>)
Final Sparsity Loss: tensor(0.7310)
Final Plausibility Loss: tensor(2.6763, grad_fn=<AddBackward0>)
Unacceptably high loss. Perturbing relevant features.
All CFs are classified as the desired class.
Final Loss: tensor(1.6797, grad_fn=<SubBackward0>)
Final Pred Loss: tensor(0.5071, grad_fn=<MeanBackward0>)
Final Proximity Loss: tensor(0.1755, grad_fn=<MeanBackward0>)
Final Diversity Loss: tensor(0.7099, grad_fn=<SubBackward0>)
Final Sparsity Loss: tensor(0.6600)
Final Plausibility Loss: tensor(2.2194, grad_fn=<AddBackward0>)
Unacceptably high loss. Perturbing relevant features.


KeyboardInterrupt: 

In [None]:
query_instance, cf_instances = cf
query_instance = torch.Tensor(query_instance).float()
cf_instances = torch.Tensor(cf_instances).float()

In [None]:
eval = Evaluate(d, m)
print(eval.evaluate_confidence_multiclass(cf_instances, desired_class=1, num_classes=output_size))

Mean confidence: tensor(5.0040, grad_fn=<MeanBackward0>)
tensor(0.7149, grad_fn=<DivBackward0>)


In [None]:
query_instance = pd.DataFrame(query_instance, columns=X.columns)
cf_instances = pd.DataFrame(cf_instances, columns=X.columns)
query_instance = query_instance.drop(query_instance.filter(regex='^MTRANS_').columns, axis=1)
cf_instances = cf_instances.drop(cf_instances.filter(regex='^MTRANS_').columns, axis=1)
observed_instances = d.norm_encoded_features
observed_instances = observed_instances.drop(observed_instances.filter(regex='^MTRANS_').columns, axis=1)
query_instance = torch.Tensor(np.array(query_instance)).float()
cf_instances = torch.Tensor(np.array(cf_instances)).float()

In [None]:
print(eval.evaluate_proximity(query_instance, cf_instances))
print(eval.evaluate_sparsity(query_instance, cf_instances))
print(eval.evaluate_plausibility(cf_instances, observed_instances, k=1))
print(eval.evaluate_diversity(cf_instances))

tensor(0.2642)
tensor(0.3385)
tensor(1.4770)
tensor(0.8079)
