In [11]:
import pandas as pd

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

### Load the data

In [None]:
data = pd.read_csv("/content/alzheimers_disease_data.csv")
data = data.drop(
    "DoctorInCharge", axis=1
)  # this attribute is confidential in the data, and thus not useful
eth_col = data.pop("Ethnicity")
data.insert(3, "Ethnicity", eth_col)

data.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,6.045039,0,0,0.014691,0,0,1,1,0,0


In [None]:
data.isna().sum() # check for missing values

Unnamed: 0,0
PatientID,0
Age,0
Gender,0
Ethnicity,0
EducationLevel,0
BMI,0
Smoking,0
AlcoholConsumption,0
PhysicalActivity,0
DietQuality,0


In [None]:
len(data)

2149

## CGAN Model

In [None]:
# %pip install sdv
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata

metadata = Metadata.detect_from_dataframe(
    data = data, table_name="patient_data")

synthesizer = CTGANSynthesizer(metadata, epochs=1000, verbose=True, batch_size=2100, generator_lr=0.05)
synthesizer.fit(data)


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.

Gen. (1.44) | Discrim. (-1.86): 100%|██████████| 1000/1000 [01:16<00:00, 13.08it/s]


In [None]:
fig = synthesizer.get_loss_values_plot()
fig.show()

In [None]:
synthesizer.save(
    filepath='cgan_synthesizer.pkl'
)

In [None]:
rows_eth1 = int(data["Ethnicity"].value_counts()[0] - data["Ethnicity"].value_counts()[1])
rows_eth2 = int(data["Ethnicity"].value_counts()[0] - data["Ethnicity"].value_counts()[2])
rows_eth3 = int(data["Ethnicity"].value_counts()[0] - data["Ethnicity"].value_counts()[3])

In [None]:
from sdv.sampling import Condition

ethnicity1 = Condition(num_rows=rows_eth1 , column_values={'Ethnicity': 1})
ethnicity2 = Condition(num_rows=10, column_values={'Ethnicity': 2})
ethnicity3 = Condition(num_rows=10, column_values={'Ethnicity': 3})

synthetic_data_1 = synthesizer.sample_from_conditions(conditions=[ethnicity1], max_tries_per_batch=200)
synthetic_data_2 = synthesizer.sample_from_conditions(conditions=[ethnicity2], max_tries_per_batch=200)
synthetic_data_3 = synthesizer.sample_from_conditions(conditions=[ethnicity3], max_tries_per_batch=200)

Sampling conditions: 100%|██████████| 824/824 [00:00<00:00, 1561.05it/s]
Sampling conditions: 100%|██████████| 10/10 [00:00<00:00, 18.23it/s]
Sampling conditions: 100%|██████████| 10/10 [00:00<00:00, 27.48it/s]


In [None]:
data_with_synthetic = pd.concat([data, synthetic_data_1, synthetic_data_2, synthetic_data_3], ignore_index=True)
data_with_synthetic.to_csv("data_with_synthetic.csv")

------------ Part above ran in Google Collab -------------

------------ Part below ran locally -------------

In [1]:
import pandas as pd
data = pd.read_csv("../data/alzheimers_disease_data_cgan.csv")

### Split the data

In [2]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


X, y = data.drop("Diagnosis", axis=1), data["Diagnosis"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=22, stratify=y
)
X_train.head()

Unnamed: 0.1,Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
1867,1867,6618,89,0,0,2,29.642299,1,3.089484,7.01351,...,16.335329,1.52462,0,1,5.911841,0,0,0,0,0
112,112,4863,85,0,2,1,33.292765,0,14.658966,1.032838,...,16.158151,7.718373,0,0,2.114117,0,0,0,1,1
1183,1183,5934,76,0,2,1,20.279534,0,8.650726,4.546652,...,26.863965,9.934841,0,0,9.683287,0,0,0,0,1
2349,2349,11026545,72,0,1,2,39.992767,0,19.989293,5.444098,...,0.005312,9.322267,0,0,9.999747,0,0,0,0,0
44,44,4795,82,0,0,1,19.525593,0,13.392876,4.581855,...,6.245617,4.874718,0,0,5.296203,0,0,0,0,1


In [3]:
classifier = HistGradientBoostingClassifier(random_state=12)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97       527
           1       0.94      0.93      0.94       222

    accuracy                           0.96       749
   macro avg       0.96      0.95      0.96       749
weighted avg       0.96      0.96      0.96       749



In [4]:
# Join X_test, y_test, y_pred for analysis
results = X_test.copy()
results["TrueDiagnosis"] = y_test
results["PredictedDiagnosis"] = y_pred

In [5]:
# Ethnicity 0
eth0_results = results[results["Ethnicity"] == 0]
print("Ethnicity 0 Results:")
print(
    classification_report(
        eth0_results["TrueDiagnosis"], eth0_results["PredictedDiagnosis"]
    )
)

Ethnicity 0 Results:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       214
           1       0.93      0.93      0.93       107

    accuracy                           0.95       321
   macro avg       0.95      0.95      0.95       321
weighted avg       0.95      0.95      0.95       321



In [6]:
# Ethnicity 1
eth1_results = results[results["Ethnicity"] == 1]
print("Ethnicity 1 Results:")
print(
    classification_report(
        eth1_results["TrueDiagnosis"], eth1_results["PredictedDiagnosis"]
    )
)

Ethnicity 1 Results:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       244
           1       0.92      0.95      0.93        74

    accuracy                           0.97       318
   macro avg       0.95      0.96      0.96       318
weighted avg       0.97      0.97      0.97       318



In [7]:
# Ethnicity 2
eth2_results = results[results["Ethnicity"] == 2]
print("Ethnicity 2 Results:")
print(
    classification_report(
        eth2_results["TrueDiagnosis"], eth2_results["PredictedDiagnosis"]
    )
)

Ethnicity 2 Results:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        28
           1       1.00      0.96      0.98        23

    accuracy                           0.98        51
   macro avg       0.98      0.98      0.98        51
weighted avg       0.98      0.98      0.98        51



In [8]:
# Ethnicity 3
eth3_results = results[results["Ethnicity"] == 3]
print("Ethnicity 3 Results:")
print(
    classification_report(
        eth3_results["TrueDiagnosis"], eth3_results["PredictedDiagnosis"]
    )
)

Ethnicity 3 Results:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        41
           1       1.00      0.89      0.94        18

    accuracy                           0.97        59
   macro avg       0.98      0.94      0.96        59
weighted avg       0.97      0.97      0.97        59



In [11]:
from fairness_metrics import (
    demographic_parity,
    equal_opportunity,
    disparate_impact,
    equalized_odds,
)

# DI >= 0.8 is a pre-established threshold for fairness
# DP and EO need to be as close to 0 as possible

for i in [1, 2, 3]:
    dp = round(demographic_parity(results, 0, i), 4)
    eo = round(equal_opportunity(results, 0, i), 4)
    eod = equalized_odds(results, 0, i)
    di = round(disparate_impact(results, 0, i), 4)
    print(dp, eod, di)

0.0912 [0.020712301086132867, -0.008120116439405543] 1.3817
0.1012 [0.031288094270621736, -0.03271028037383177] 0.7655
0.059 [-0.036344755970924236, -0.03271028037383177] 1.2177
