In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score

In [18]:
real_data_path = '../datasets/breast.csv'
synthetic_data_path = '../synthetic/GCSbc.csv'
target_variable = 'Diagnosis'

In [19]:
real_data = pd.read_csv(real_data_path)
real_data[target_variable] = real_data[target_variable].map({'M':0, 'B':1}) # M = malignant, B = benign
#real_data = real_data.fillna(real_data.mean())

In [20]:
synthetic_data = pd.read_csv(synthetic_data_path)
synthetic_data[target_variable] = synthetic_data[target_variable].map({'M':0, 'B':1}) # M = malignant, B = benign
#real_data = real_data.fillna(real_data.mean())

In [21]:
print('Missing values in real data:      ', real_data.isna().sum().sum())
print('Missing values in synthetic data: ', synthetic_data.isna().sum().sum())
print('Duplicated rows in real data:     ', real_data.duplicated().sum())
print('Duplicated rows in synthetic data:', synthetic_data.duplicated().sum())

Missing values in real data:       0
Missing values in synthetic data:  0
Duplicated rows in real data:      0
Duplicated rows in synthetic data: 0


### Train model in real data and evaluate in real

In [22]:
X = real_data.drop(columns=[target_variable])
y = real_data[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
log_reg = LogisticRegression(solver='lbfgs', max_iter=10000)
log_reg.fit(X_train, y_train)

y_pred_log_reg = log_reg.predict(X_test)

print("Accuracy:         ", accuracy_score(y_test, y_pred_log_reg))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))

Accuracy:          0.9766081871345029
Balanced Accuracy: 0.9748677248677249
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97        63
           1       0.98      0.98      0.98       108

    accuracy                           0.98       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.98      0.98      0.98       171

Confusion Matrix:
 [[ 61   2]
 [  2 106]]


### Train model in synthetic data and evaluate in real

In [24]:
X_train = synthetic_data.drop(columns=[target_variable])
y_train = synthetic_data[target_variable]
X_test = real_data.drop(columns=[target_variable])
y_test = real_data[target_variable]

#### Logistic Regression

In [25]:
log_reg = LogisticRegression(solver='lbfgs', max_iter=10000)
log_reg.fit(X_train, y_train)

y_pred_log_reg = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))

Accuracy: 0.9103690685413005
Balanced Accuracy: 0.8845066328418161
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.78      0.87       212
           1       0.88      0.99      0.93       357

    accuracy                           0.91       569
   macro avg       0.93      0.88      0.90       569
weighted avg       0.92      0.91      0.91       569

Confusion Matrix:
 [[166  46]
 [  5 352]]


#### Random Forest

In [26]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Accuracy: 0.9068541300527241
Balanced Accuracy: 0.8845066328418161
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.78      0.86       212
           1       0.88      0.98      0.93       357

    accuracy                           0.91       569
   macro avg       0.92      0.88      0.90       569
weighted avg       0.91      0.91      0.90       569

Confusion Matrix:
 [[165  47]
 [  6 351]]


### Train model in real data and evaluate in synthetic

Consistency: If the model trained on the original data performs well on the synthetic data, it indicates that the synthetic data captures the underlying structure of the original data. This consistency suggests that the synthetic data maintains similar relationships and distributions as the original data.

Generalizability: If the model trained on the original data generalizes well to the synthetic data, it suggests that the synthetic data is of high quality in terms of replicating the real-world phenomena represented in the original data.

In [27]:
X_train = real_data.drop(columns=[target_variable])
y_train = real_data[target_variable]
X_test = synthetic_data.drop(columns=[target_variable])
y_test = synthetic_data[target_variable]

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Accuracy: 0.708
Balanced Accuracy: 0.7116666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.73      0.67       400
           1       0.79      0.69      0.74       600

    accuracy                           0.71      1000
   macro avg       0.70      0.71      0.70      1000
weighted avg       0.72      0.71      0.71      1000

Confusion Matrix:
 [[292 108]
 [184 416]]


### Train a binary classifier to distinguish synthetic from real data

Create unified dataset

In [28]:
real_data_copy = real_data.copy()
synthetic_data_copy = synthetic_data.copy()

real_data_copy['class'] = 0
synthetic_data_copy['class'] = 1
combined_data = pd.concat([real_data_copy, synthetic_data_copy], ignore_index=True)
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)
combined_data.head()

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Diagnosis,class
0,10.709474,17.476695,69.398366,338.236485,0.10858,0.109826,0.072458,0.026157,0.188342,0.0744,...,75.051423,363.288781,0.14131,0.219094,0.26466,0.098792,0.309017,0.09297,1,1
1,12.241138,19.178663,81.63011,448.164079,0.107425,0.174473,0.127247,0.042195,0.212838,0.070528,...,89.02578,497.285206,0.143261,0.424374,0.31742,0.125984,0.30277,0.090073,1,1
2,20.18,19.54,133.8,1250.0,0.1133,0.1489,0.2133,0.1259,0.1724,0.06053,...,146.0,1479.0,0.1665,0.2942,0.5308,0.2173,0.3032,0.08075,0,0
3,12.15921,14.295529,78.301981,500.142786,0.090964,0.082822,0.052864,0.02574,0.154891,0.061356,...,87.967693,659.868033,0.127386,0.090102,0.074071,0.046125,0.200809,0.063206,1,1
4,13.682891,21.491931,87.941412,608.258029,0.09077,0.091571,0.079846,0.058876,0.161545,0.05833,...,96.477916,781.448539,0.134271,0.218461,0.198601,0.136597,0.219831,0.075611,0,1


In [29]:
X = combined_data.drop(columns=[target_variable])
y = combined_data[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Train model to distinguish synthetic from real data

In [30]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Accuracy: 0.8004246284501062
Balanced Accuracy: 0.7956660231660231
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.78      0.74       175
           1       0.86      0.81      0.84       296

    accuracy                           0.80       471
   macro avg       0.79      0.80      0.79       471
weighted avg       0.81      0.80      0.80       471

Confusion Matrix:
 [[136  39]
 [ 55 241]]
