In [1]:
import os

import matplotlib.pyplot as plt # for drawing graphs
import networkx as nx # for drawing graphs
import numpy as np
import pandas as pd # for data manipulation
from pandas import read_csv

from sklearn.datasets import load_diabetes
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



# for creating Bayesian Belief Networks (BBN)
# !pip install pybbn
# from pybbn.graph.dag import Bbn
# from pybbn.graph.edge import Edge, EdgeType
# from pybbn.graph.jointree import EvidenceBuilder
# from pybbn.graph.node import BbnNode
# from pybbn.graph.variable import Variable
# from pybbn.pptc.inferencecontroller import InferenceController

# !pip install tsBNgen
# from tsBNgen import *
# from tsBNgen.tsBNgen import *



# Single seed for the notebook: it seeds NumPy's legacy global RNG here and is
# read again inside train_test() as the RandomForestClassifier random_state.
seed = 1
np.random.seed(seed)

In [2]:
# Data locations.
# The access tokens that used to be embedded in these URLs were removed:
# credentials must not live in a shared notebook. Point the environment
# variables at a local CSV path or at a freshly authorised URL; the defaults
# below only record where the files originally came from.
url_pima = os.environ.get(
    "PIMA_CSV_URL",
    "https://github.gatech.edu/raw/jwoo71/VIP/main/pima.csv",
)
url_brfss = os.environ.get(
    "BRFSS_CSV_URL",
    "https://github.gatech.edu/raw/jwoo71/VIP/main/diabetes_binary_health_indicators_BRFSS2015.csv",
)

df_pima = pd.read_csv(url_pima)
df_brfss = pd.read_csv(url_brfss)

# Only the last expression of a cell is rendered, so the first preview has to
# be displayed explicitly or it is silently discarded.
display(df_pima.head())
df_brfss.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
def bayesian_synth(df, target, test_size=0.2, cv=35, random_state=42,
                   keep_column_names=False):
  """Derive a "synthetic" table from ``df`` by rescaling its features.

  A Gaussian Naive Bayes classifier is cross-validated on a training split
  (mean and standard deviation of the fold scores are printed), then fitted
  and scored on the held-out split. Every feature value of the *full* input
  is multiplied by that single hold-out score. The result is therefore a
  deterministic, row-for-row rescaling of ``df`` rather than a sample drawn
  from a fitted model.

  Args:
    df: DataFrame holding the feature columns and the ``target`` column.
    target: Name of the label column.
    test_size: Fraction of rows held out to score the classifier.
    cv: Number of cross-validation folds run on the training split.
    random_state: Seed for the train/test split.
    keep_column_names: When True the returned feature columns keep their
      names from ``df``; when False (default, the historical behaviour) they
      are labelled 0..n-1.

  Returns:
    DataFrame with the rescaled feature columns followed by ``target``,
    where ``target`` holds the label-encoded classes. The index is a fresh
    RangeIndex.
  """
  feature_frame = df.drop(columns=target)
  features = feature_frame.values
  labels = LabelEncoder().fit_transform(df[target])

  x_train, x_test, y_train, y_test = train_test_split(
      features, labels, test_size=test_size, random_state=random_state)

  nb = GaussianNB()
  cv_accuracies = cross_val_score(estimator=nb, X=x_train, y=y_train, cv=cv)
  print(np.mean(cv_accuracies))
  print(np.std(cv_accuracies))

  nb.fit(x_train, y_train)
  nb_score = nb.score(x_test, y_test)

  # The hold-out score is the only "noise" applied: one scalar for all cells.
  scaled = features * nb_score
  if keep_column_names:
    df_synth = pd.DataFrame(scaled, columns=feature_frame.columns)
  else:
    df_synth = pd.DataFrame(scaled)
  df_synth[target] = labels
  return df_synth

In [4]:
def _print_metrics(y_true, rf_pred, svc_pred, rf_acc_label):
  """Print accuracy and F1 for the RF and SVC predictions against ``y_true``."""
  y_true = np.asarray(y_true)
  # Accuracy is the share of matching labels, i.e. what estimator.score()
  # reports, but computed from predictions that were already made.
  print(f"{rf_acc_label}: {np.mean(rf_pred == y_true):.3f}")
  print(f"SVC accuracy: {np.mean(svc_pred == y_true):.3f}")
  print(f"RF f1: {f1_score(y_true, rf_pred):.3f}")
  print(f"SVC f1: {f1_score(y_true, svc_pred):.3f}")


def train_test(df_real, df_synth, target, test_size=0.3, random_state=1):
  """Compare classifiers trained on synthetic data, alone and mixed with real data.

  Two experiments are run with a random forest and an SVC, and accuracy and
  F1 are printed for each:

  1. Fit on the synthetic table only, evaluate on the whole real table.
  2. Split the real table, append all synthetic rows to the real training
     part, fit, and evaluate on the real held-out part.

  Args:
    df_real: Real data, feature columns plus ``target``.
    df_synth: Synthetic data with the same feature layout plus ``target``.
    target: Name of the label column in both tables.
    test_size: Fraction of real rows held out in the second experiment.
    random_state: Seed for the real-data split in the second experiment.

  Returns:
    None. Results are printed.
  """
  X = df_real.drop(columns=target).values
  y = df_real[target]
  X_synth = df_synth.drop(columns=target).values
  y_synth = df_synth[target]

  # The forest is seeded from the notebook-level `seed`.
  rf = RandomForestClassifier(random_state=seed)
  svc = SVC()

  # Experiment 1: synthetic rows only.
  rf.fit(X_synth, y_synth)
  svc.fit(X_synth, y_synth)
  print("Training only synthetic:")
  # Predict once per model; calling score() and then predict() would run
  # inference twice, which is costly for the SVC on large tables.
  _print_metrics(y, rf.predict(X), svc.predict(X), "RF acc")

  # Experiment 2: real training split augmented with every synthetic row.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  # NOTE(review): when df_synth comes from bayesian_synth() it is a
  # row-for-row rescaling of the full real table, so this append also adds
  # rescaled twins of the held-out rows to the training set. The scores below
  # may be optimistic; confirm whether synthetic rows should be limited to
  # the training split.
  X_train = np.append(X_train, X_synth, axis=0)
  y_train = np.append(y_train, y_synth, axis=0)

  rf.fit(X_train, y_train)
  svc.fit(X_train, y_train)
  print("Training on synthetic + real data...")
  _print_metrics(y_test, rf.predict(X_test), svc.predict(X_test), "RF accuracy")
  print("\n\n")

In [5]:
synth_pima = bayesian_synth(df_pima, "Outcome")
# A bare expression in the middle of a cell is never rendered, so the PIMA
# summary must be displayed explicitly; only the final line shows on its own.
display(synth_pima.describe())

synth_brfss = bayesian_synth(df_brfss, "Diabetes_binary")
synth_brfss.describe()

0.734733893557423
0.09438081048016994
0.7737700048998502
0.005268975524879579


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,Diabetes_binary
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.331187,0.32742,0.743177,21.911077,0.342124,0.031321,0.072711,0.584049,0.489643,0.626413,...,0.064984,1.938785,2.458632,3.27487,0.129868,0.339942,6.200766,3.898916,4.673569,0.139333
std,0.382088,0.381528,0.146348,5.101887,0.383497,0.15231,0.22549,0.331317,0.371824,0.301986,...,0.214347,0.82486,5.72269,6.730225,0.288777,0.383241,2.357847,0.761014,1.598918,0.346294
min,0.0,0.0,0.0,9.263955,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.771996,0.0,0.0,0.0,0.0,0.771996,0.771996,0.771996,0.0
25%,0.0,0.0,0.771996,18.527909,0.0,0.0,0.0,0.771996,0.0,0.771996,...,0.0,1.543992,0.0,0.0,0.0,0.0,4.631977,3.087985,3.859981,0.0
50%,0.0,0.0,0.771996,20.843898,0.0,0.0,0.0,0.771996,0.771996,0.771996,...,0.0,1.543992,0.0,0.0,0.0,0.0,6.17597,3.859981,5.403974,0.0
75%,0.771996,0.771996,0.771996,23.931883,0.771996,0.0,0.0,0.771996,0.771996,0.771996,...,0.0,2.315989,1.543992,2.315989,0.0,0.771996,7.719962,4.631977,6.17597,0.0
max,0.771996,0.771996,0.771996,75.655629,0.771996,0.771996,0.771996,0.771996,0.771996,0.771996,...,0.771996,3.859981,23.159886,23.159886,0.771996,0.771996,10.035951,4.631977,6.17597,1.0


In [None]:
# Each entry: printed header, real table, synthetic table, label column.
# NOTE(review): the saved output stops right after the "BRFSS" header, so that
# run presumably did not finish (SVC on the full BRFSS table) -- confirm.
experiments = (
    ("PIMA", df_pima, synth_pima, "Outcome"),
    ("BRFSS", df_brfss, synth_brfss, "Diabetes_binary"),
)
for dataset_label, real_frame, synth_frame, target_column in experiments:
  print(dataset_label)
  train_test(real_frame, synth_frame, target_column)


PIMA
Training only synthetic:
RF acc: 0.564
SVC accuracy: 0.642
RF f1: 0.604
SVC f1: 0.638
Training on synthetic + real data...
RF accuracy: 0.775
SVC accuracy: 0.775
RF f1: 0.705
SVC f1: 0.658



BRFSS
