In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.inspection import permutation_importance
from sklearn.datasets import load_diabetes

from scipy.linalg import cholesky, det
import seaborn as sns
import matplotlib.pyplot as plt

# Shared seed: seeds NumPy's global RNG (used by the np.random.normal draws in the
# synthetic-data generators) and is passed as random_state to the random forest in train_test.
seed = 1
np.random.seed(seed)

In [None]:
!git clone https://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@github.gatech.edu/ngong6/VIP-2022.git
%cd VIP-2022/
%ls

Cloning into 'VIP-2022'...
remote: Enumerating objects: 5, done.[K
remote: Total 5 (delta 0), reused 0 (delta 0), pack-reused 5[K
Unpacking objects: 100% (5/5), done.
/content/VIP-2022
BRFSS2015_diabetes_012.csv  BRFSS2015_diabetes_binary.csv  pima_diabetes.csv


In [None]:
def covar_synthetic_binary(df, N, target, round_dict=None):
  """Draw N synthetic rows from per-class Gaussians fitted to a binary-labelled frame.

  For each class (target == 1, then target == 0) the feature covariance and
  feature means of the real rows are estimated, standard-normal noise is
  coloured with the Cholesky factor of that covariance, and the class mean is
  added back so the synthetic rows are centred where the real class is.

  Args:
    df: real data; every column except `target` is treated as a numeric feature.
    N: total number of synthetic rows. Class 1 gets N // 2 rows and class 0 the
      remainder, so exactly N rows are returned even when N is odd.
    target: name of the binary (0/1) label column.
    round_dict: optional mapping column -> decimals, forwarded to
      DataFrame.round; keys that are not feature columns are ignored by pandas.

  Returns:
    DataFrame with the feature columns followed by `target`, class-1 rows first,
    with a fresh 0..N-1 index. Values are not clipped and may be negative.

  Raises:
    numpy.linalg.LinAlgError: if a class covariance is not positive definite
      (e.g. a feature that is constant within the class).
  """
  if N == 0: return pd.DataFrame(columns=df.columns)
  features = df.columns.drop(target)
  # Class 1 is drawn first, matching the order of the original implementation.
  class_sizes = {1: N // 2, 0: N - N // 2}
  parts = []
  for label, n_samples in class_sizes.items():
    real = df.loc[df[target] == label, features]
    # rowvar=False: columns are the variables. atleast_2d covers a single feature.
    cov = np.atleast_2d(np.cov(real.to_numpy(dtype=float), rowvar=False))
    noise = np.random.normal(size=(n_samples, len(features)))
    # scipy's cholesky returns the upper factor U with U.T @ U == cov, so
    # noise @ U has covariance cov.
    synth = pd.DataFrame(noise @ cholesky(cov), columns=features)
    # Centre on the class-conditional mean (not the overall mean or abs(min)).
    synth = synth.add(real.mean())
    if round_dict:
      synth = synth.round(round_dict)
    synth[target] = label
    parts.append(synth)
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  return pd.concat(parts, ignore_index=True)

def covar_synthetic(df, N):
  """Draw N synthetic rows from a single Gaussian fitted to all columns of df.

  The column-wise covariance and mean of `df` are estimated, standard-normal
  noise is coloured with the Cholesky factor of the covariance, and the column
  means are added back.

  Args:
    df: real data; every column is treated as a numeric variable.
    N: number of synthetic rows to generate.

  Returns:
    DataFrame with the same columns as `df` and N rows (empty when N == 0).
    Values are not clipped and may be negative.

  Raises:
    numpy.linalg.LinAlgError: if the covariance is not positive definite.
  """
  if N == 0: return pd.DataFrame(columns=df.columns)
  D = df.shape[1]
  # rowvar=False: columns are the variables. The bare np.cov(df) call treated
  # every row as a variable and produced an (n_rows x n_rows) singular matrix.
  cov = np.atleast_2d(np.cov(df.to_numpy(dtype=float), rowvar=False))
  # Upper Cholesky factor U satisfies U.T @ U == cov, so noise @ U has covariance cov.
  synth = np.random.normal(size=(N, D)) @ cholesky(cov)
  df_synth = pd.DataFrame(synth, columns=df.columns)
  # Shift the zero-mean samples onto the real column means.
  return df_synth.add(df.mean())

In [None]:
df_brfss = pd.read_csv("BRFSS2015_diabetes_binary.csv")
# Balance BRFSS by down-sampling the class-0 rows to the number of class-1 rows.
temp = df_brfss[df_brfss["Diabetes_binary"] == 0]
temp1 = df_brfss[df_brfss["Diabetes_binary"] == 1]
# pd.concat replaces DataFrame.append (removed in pandas 2.0); the explicit
# random_state makes the subsample identical on every re-run of this cell.
df_brfss = pd.concat([temp.sample(len(temp1), random_state=seed), temp1])

df_pima = pd.read_csv("pima_diabetes.csv")
# scikit-learn's diabetes dataset, features and target joined into one frame.
sklearn = load_diabetes(as_frame=True)
df_sklearn = pd.concat([sklearn.data, sklearn.target], axis=1)

print(
    f"BRFSS shape: {df_brfss.shape}\n"
    f"Pima shape: {df_pima.shape}\n"
)
df_pima.columns

BRFSS shape: (70692, 22)
Pima shape: (768, 9)



Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
# Decimal places per column, handed to the synthetic generators as round_dict.
# BRFSS: one decimal place for every column.
d_brfss = {column: 1 for column in df_brfss.columns}
# Pima: whole numbers except BMI (1 decimal) and the pedigree function (3 decimals).
d_pima = dict(
  Pregnancies=0,
  Glucose=0,
  BloodPressure=0,
  SkinThickness=0,
  Insulin=0,
  BMI=1,
  DiabetesPedigreeFunction=3,
  Age=0,
  Outcome=0,
)

In [None]:
def _report_scores(models, X, y):
  """Print accuracy then F1 for each (name, fitted model) pair, predicting once per model."""
  y_true = np.asarray(y)
  # Predict once and reuse for both metrics; calling score() and predict()
  # separately runs inference twice, which is costly for SVC on large data.
  predictions = [(name, model.predict(X)) for name, model in models]
  for name, y_pred in predictions:
    print(f"{name} accuracy: {np.mean(y_pred == y_true):.3f}")
  for name, y_pred in predictions:
    print(f"{name} f1: {f1_score(y_true, y_pred):.3f}")


def train_test(df_real, df_synth, target):
  """Fit a random forest and an SVC on synthetic data and print accuracy/F1 on real data.

  Two experiments are run and reported via print:
    1. Train on the synthetic rows only; evaluate on all real rows.
    2. Split the real rows 70/30, train on the real training part plus all
       synthetic rows; evaluate on the held-out real test part.

  Args:
    df_real: real data including the `target` column.
    df_synth: synthetic data with the same columns as `df_real`.
    target: name of the binary label column.

  Returns:
    None; results are printed.
  """
  X = df_real.drop(columns=target).values
  y = df_real[target].values
  X_synth = df_synth.drop(columns=target).values
  y_synth = df_synth[target].values

  rf = RandomForestClassifier(random_state=seed)
  svc = SVC()
  models = [("RF", rf), ("SVC", svc)]

  # Experiment 1: synthetic-only training, evaluated on every real row.
  for _, model in models:
    model.fit(X_synth, y_synth)
  print("Training on synthetic data only...")
  _report_scores(models, X, y)

  # Experiment 2: real training split augmented with the synthetic rows.
  # Uses the shared notebook seed rather than a separate hard-coded constant.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
  X_train = np.concatenate([X_train, X_synth], axis=0)
  y_train = np.concatenate([y_train, y_synth], axis=0)

  for _, model in models:
    model.fit(X_train, y_train)
  print("Training on synthetic + real data...")
  _report_scores(models, X_test, y_test)

In [None]:
# Generate one synthetic row per real row for each dataset.
synth_brfss = covar_synthetic_binary(
  df=df_brfss, N=len(df_brfss), target="Diabetes_binary", round_dict=d_brfss
)
synth_pima = covar_synthetic_binary(
  df=df_pima, N=len(df_pima), target="Outcome", round_dict=d_pima
)
# synth_sklearn = covar_synthetic(df_sklearn, len(df_sklearn))

print(f"BRFSS synthetic shape: {synth_brfss.shape}")
print(f"Pima synthetic shape: {synth_pima.shape}")

BRFSS synthetic shape: (70692, 22)
Pima synthetic shape: (768, 9)


In [None]:
# Run the synthetic-only and synthetic+real experiments for each dataset in turn.
for heading, real_frame, synth_frame, label_column in (
  ("Pima", df_pima, synth_pima, "Outcome"),
  ("\nBRFSS", df_brfss, synth_brfss, "Diabetes_binary"),
):
  print(heading)
  train_test(real_frame, synth_frame, label_column)

Pima
Training on synthetic data only...
RF accuracy: 0.656
SVC accuracy: 0.651
RF f1: 0.090
SVC f1: 0.113
Training on synthetic + real data...
RF accuracy: 0.771
SVC accuracy: 0.628
RF f1: 0.602
SVC f1: 0.104

BRFSS
Training on synthetic data only...
RF accuracy: 0.500
SVC accuracy: 0.501
RF f1: 0.000
SVC f1: 0.005


KeyboardInterrupt: ignored