In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.inspection import permutation_importance

from scipy.linalg import cholesky, det
import seaborn as sns
import matplotlib.pyplot as plt

seed = 1
np.random.seed(seed)

# Baseline classification

In [None]:
!git clone https://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@github.gatech.edu/ngong6/VIP-2022.git
%cd VIP-2022/
%ls

Cloning into 'VIP-2022'...
remote: Enumerating objects: 5, done.[K
remote: Total 5 (delta 0), reused 0 (delta 0), pack-reused 5[K
Unpacking objects: 100% (5/5), done.
/content/VIP-2022
BRFSS2015_diabetes_012.csv  BRFSS2015_diabetes_binary.csv  pima_diabetes.csv


In [None]:
# Read the Pima table and keep only the rows whose Insulin value is not 0.
df = pd.read_csv("pima_diabetes.csv").query("Insulin != 0")
print(f"Data shape: {df.shape}")
# Summary statistics of the filtered table are the cell's displayed output.
df.describe()

Data shape: (394, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0
mean,3.286802,122.304569,70.654822,29.106599,155.548223,32.988579,0.525543,30.814721,0.329949
std,3.209635,31.396725,12.469919,10.504273,118.775855,7.21016,0.350127,10.198971,0.470792
min,0.0,0.0,24.0,7.0,14.0,0.0,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.25,28.325,0.27025,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.0,33.2,0.4495,27.0,0.0
75%,5.0,143.0,78.0,36.75,190.0,37.075,0.687,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Labels come from the Outcome column; every other column is a feature
y = df["Outcome"]
X = df.drop("Outcome", axis=1)

# Hold out 25% of the rows for evaluation, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, test_size=0.25
)

# fit() returns the estimator itself, so construction and training can be chained
clf = DecisionTreeClassifier(random_state=seed).fit(X_train, y_train)

# Report held-out accuracy, F1 and the mean permutation importance of each feature
print(f"Accuracy: {clf.score(X_test, y_test):.3f}")
print(f"F1 score: {f1_score(y_test, clf.predict(X_test)):.3f}")
print(
    permutation_importance(
        clf, X_test, y_test, random_state=seed, n_repeats=10
    ).importances_mean
)

Accuracy: 0.768
F1 score: 0.610
[0.04343434 0.11313131 0.0010101  0.06868687 0.02626263 0.1030303
 0.00505051 0.05858586]


# Naive dataset
Generated based on covariance matrix.

In [None]:
def _synth_class(class_df, n_samples, label, round_dict):
  """Draw `n_samples` Gaussian rows that mimic one Outcome class.

  The rows are sampled from a multivariate normal whose mean vector and
  covariance matrix are estimated from the feature columns of `class_df`.
  """
  features = class_df.drop(columns="Outcome")
  cov = np.cov(features.T)
  # scipy's cholesky returns the upper factor U with U.T @ U == cov, so
  # right-multiplying standard-normal rows by U yields rows with covariance cov.
  noise = np.random.normal(size=(n_samples, features.shape[1])) @ cholesky(cov)
  synth = pd.DataFrame(noise, columns=features.columns)
  # Centre on this class's own mean, then clip so no feature is negative
  # before rounding to the configured precision.
  synth = synth.add(features.mean()).clip(lower=0).round(round_dict)
  synth["Outcome"] = np.repeat(label, len(synth))
  return synth


def covar_synthetic(df, N):
  """Generate synthetic rows from per-class Gaussian fits of `df`.

  Args:
    df: DataFrame with numeric feature columns and a binary `Outcome` column.
    N: requested number of synthetic rows. `N // 2` rows are drawn per class,
      so an odd N yields N - 1 rows. N == 0 returns an empty frame.

  Returns:
    DataFrame with the same columns as `df` and a fresh 0..n-1 index; the
    Outcome == 1 rows come first, followed by the Outcome == 0 rows.
  """
  if N == 0: return pd.DataFrame(columns=df.columns)
  n_samples = N // 2
  # Decimal places kept per column; names absent from the frame are ignored.
  round_dict = {
      "Pregnancies": 0,
      "Glucose": 0,
      "BloodPressure": 0,
      "SkinThickness": 0,
      "Insulin": 0,
      "BMI": 1,
      "DiabetesPedigreeFunction": 3,
      "Age": 0,
      "Outcome": 0
  }
  # Class 1 is sampled before class 0 so the global NumPy random stream is
  # consumed in the same order as in the previous implementation.
  df_synth1 = _synth_class(df[df.Outcome == 1], n_samples, 1, round_dict)
  df_synth0 = _synth_class(df[df.Outcome == 0], n_samples, 0, round_dict)
  # DataFrame.append was removed in pandas 2.0; concat with ignore_index=True
  # is the supported equivalent of append + reset_index(drop=True).
  return pd.concat([df_synth1, df_synth0], ignore_index=True)


Train classifier on real data combined with synthetic:

In [None]:
def train_and_score(df_synth, df_real):
  """Train on real training rows plus synthetic rows; score on real test rows.

  This helper was called below but never defined, which raised NameError.
  Its return layout matches the ("n_synthetic", "Accuracy", "F1_score")
  columns that the plotting cell builds from `scores`.

  Args:
    df_synth: synthetic rows with the same columns as `df_real` (may be empty).
    df_real: real table containing an `Outcome` label column.

  Returns:
    Tuple (number of synthetic rows used, test accuracy, test F1 score).
  """
  X_real = df_real.drop(columns="Outcome")
  y_real = df_real.Outcome
  # Same split parameters as the baseline cell, so scores are comparable.
  X_fit, X_eval, y_fit, y_eval = train_test_split(
      X_real, y_real, test_size=0.25, random_state=seed)
  # Skip the concat for an empty synthetic frame: its columns have no numeric
  # dtype and would only risk upcasting the real data.
  if len(df_synth) > 0:
    X_fit = pd.concat([X_fit, df_synth.drop(columns="Outcome")], ignore_index=True)
    y_fit = pd.concat([y_fit, df_synth.Outcome.astype(int)], ignore_index=True)
  tree = DecisionTreeClassifier(random_state=seed)
  tree.fit(X_fit, y_fit)
  return len(df_synth), tree.score(X_eval, y_eval), f1_score(y_eval, tree.predict(X_eval))


scores = []
# Number of synthetic rows to add: 0, 20, 40, ... up to the size of the real table
N_range = range(0, len(df), 20)

for N in N_range:
  # Repeat each setting 50 times; only the synthetic draw changes between runs.
  for i in range(50):
    # NOTE(review): the synthetic rows are fitted on the full table, including
    # rows that end up in the test split, so the scores may be optimistic.
    df_synth = covar_synthetic(df, N)
    scores.append(train_and_score(df_synth, df))

NameError: ignored

In [None]:
# Build the score table and reshape it to long format in one pass:
# one row per (n_synthetic, metric) pair, which is what the hue grouping needs.
df_scores = pd.DataFrame(
    scores, columns=["n_synthetic", "Accuracy", "F1_score"]
).melt(id_vars="n_synthetic")
plt.figure(figsize=(10, 5))
plot = sns.lineplot(x="n_synthetic", y="value", hue="variable", data=df_scores)
# Reference lines at the accuracy and F1 printed by the real-data-only baseline run.
plot.axhline(0.768, color="black", linewidth=.5)
plot.axhline(0.610, color="black", linewidth=.5)

Generative Adversarial Networks

In [None]:
pip install ctgan

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.inspection import permutation_importance
from scipy.linalg import cholesky, det
import seaborn as sns
import matplotlib.pyplot as plt
#from google.colab import drive
#drive.mount('drive.google.com/drive/folders/1202Z6CP_R0b8NyIO6S2K8Nkc4ma5wgQE', force_remount=True)
#import os
#path = "/content/gdrive/MyDrive/VIP - Synthetic Patient Models and Data-20221105T222810Z-001/VIP - Synthetic Patient Models and Data"
#dir_list = os.listdir(path)
#print(dir_list)

from ctgan import CTGAN

# Read the Pima table from the cloned repository, using the same relative path
# as the baseline cell. The Google Drive path used here before is not mounted
# at this point in the notebook, so reading it raised FileNotFoundError.
data_path = "pima_diabetes.csv"
real_data = pd.read_csv(data_path)
# Drop rows with zero insulin, matching the baseline preprocessing.
real_data = real_data[real_data.Insulin != 0]

# Columns CTGAN should model as categorical. Only the binary label belongs
# here; the remaining columns are numeric measurements and are left for CTGAN
# to model as continuous instead of one category per distinct value.
discrete_columns = ['Outcome']

# Create the CTGAN synthesizer and fit it on the real table
ctgan = CTGAN(epochs=10)
ctgan.fit(real_data, discrete_columns)

# Generate 1000 rows of synthetic data
synthetic_data = ctgan.sample(1000)
print(synthetic_data.head(5))

# Get features by dropping the target Outcome column
X = synthetic_data.drop(columns="Outcome")
# Get labels from the Outcome column
y = synthetic_data.Outcome

# Split features and labels into training and testing sets.
# Note: both splits come from the synthetic table, so the scores below measure
# how learnable the synthetic data is, not performance on real patients.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)
# Use DecisionTreeClassifier for classification
clf = DecisionTreeClassifier(random_state=seed)

clf.fit(X_train, y_train)
print(f"Accuracy: {clf.score(X_test, y_test):.3f}")
print(f"F1 score: {f1_score(y_test, clf.predict(X_test)):.3f}")
print(permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=seed).importances_mean)

FileNotFoundError: ignored

In [None]:
pip install table-evaluator

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting table-evaluator
  Downloading table_evaluator-1.4.2-py3-none-any.whl (20 kB)
Collecting seaborn<=0.11.1
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████████████| 285 kB 5.2 MB/s 
Collecting dython==0.5.1
  Downloading dython-0.5.1-py3-none-any.whl (14 kB)
Installing collected packages: seaborn, dython, table-evaluator
  Attempting uninstall: seaborn
    Found existing installation: seaborn 0.11.2
    Uninstalling seaborn-0.11.2:
      Successfully uninstalled seaborn-0.11.2
Successfully installed dython-0.5.1 seaborn-0.11.1 table-evaluator-1.4.2


In [None]:
# Compare the real table with the CTGAN-generated table using table-evaluator.
# (load_data is imported here but not used in this cell.)
from table_evaluator import load_data, TableEvaluator
# Preview the first five rows of the real table.
print(real_data.head(5))
# The evaluator is built from the real table first, the synthetic table second.
table_evaluator = TableEvaluator(real_data, synthetic_data)
# Presumably renders plots comparing the two tables — confirm against the library docs.
table_evaluator.visual_evaluation()


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.inspection import permutation_importance

from scipy.linalg import cholesky, det
import seaborn as sns
import matplotlib.pyplot as plt

seed = 1
np.random.seed(seed)

In [None]:
# Separate the synthetic table into its Outcome labels and the remaining feature columns
y = synthetic_data["Outcome"]
X = synthetic_data.drop("Outcome", axis=1)

# Keep a quarter of the synthetic rows aside for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, test_size=0.25
)

# Build and train the decision tree in one expression; fit() returns the estimator
clf = DecisionTreeClassifier(random_state=seed).fit(X_train, y_train)

# Accuracy and F1 on the held-out synthetic rows, then mean permutation importances
print(f"Accuracy: {clf.score(X_test, y_test):.3f}")
print(f"F1 score: {f1_score(y_test, clf.predict(X_test)):.3f}")
print(
    permutation_importance(
        clf, X_test, y_test, random_state=seed, n_repeats=10
    ).importances_mean
)

Baseline sklearn

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
seed = 1
import pandas as pd
from sklearn import metrics

np.random.seed(seed)
from google.colab import drive

# Mount Google Drive under /content/gdrive/; the CSV path read in the following
# cell starts with this prefix. Only works inside Google Colab.
drive.mount('/content/gdrive/', force_remount=True)



In [None]:

# CSV location on the mounted Google Drive.
data_path = "/content/gdrive/MyDrive/VIP - Synthetic Patient Models and Data-20221105T222810Z-001/VIP - Synthetic Patient Models and Data/scikitLearn.csv"
# NOTE: this rebinds `df`, which earlier in the notebook held the Pima table.
df = pd.read_csv(data_path)

# Features are every column except Y; Y is the target.
# NOTE(review): if Y is a continuous target (as in sklearn's load_diabetes),
# the classifier treats each distinct value as its own class — confirm that Y
# is categorical before reading much into the accuracy below.
x = df.drop(['Y'], axis=1)
y = df.Y

# Hold out 20% of the rows, seeded with the notebook-wide seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
# Seed the tree as well: an unseeded DecisionTreeClassifier can break ties
# between equally good splits differently from run to run.
model = DecisionTreeClassifier(random_state=seed)
model = model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred)*100)
# The confusion matrix is the cell's displayed output.
confusion_matrix(y_test, y_pred)






GAN sklearn synthetic data