In [19]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


In [66]:
# Load iris dataset
iris = datasets.load_breast_cancer()
X = iris.data  # we only take the first four features.
y = iris.target
# Normalize the data
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Convert data to pandas DataFrame
real_data = pd.DataFrame(X)
real_labels = y


In [80]:
iris.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [None]:
# One hot encode labels
one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_labels = one_hot_encoder.fit_transform(np.array(real_labels).reshape(-1, 1))

NOISE_DIM = 100
NUM_CLASSES = 2
NUM_FEATURES = 30
BATCH_SIZE = 64
TRAINING_STEPS = 500
# Constants
# Generator
def create_generator():
    noise_input = Input(shape=(NOISE_DIM,))
    class_input = Input(shape=(NUM_CLASSES,))
    merged_input = Concatenate()([noise_input, class_input])
    hidden = Dense(128, activation='relu')(merged_input)
    output = Dense(NUM_FEATURES, activation='linear')(hidden)
    model = Model(inputs=[noise_input, class_input], outputs=output)
    return model


# Discriminator
def create_discriminator():
    data_input = Input(shape=(NUM_FEATURES,))
    class_input = Input(shape=(NUM_CLASSES,))
    merged_input = Concatenate()([data_input, class_input])
    hidden = Dense(128, activation='relu')(merged_input)
    output = Dense(1, activation='sigmoid')(hidden)
    model = Model(inputs=[data_input, class_input], outputs=output)
    return model
# cGAN
def create_cgan(generator, discriminator):
    noise_input = Input(shape=(NOISE_DIM,))
    class_input = Input(shape=(NUM_CLASSES,))
    generated_data = generator([noise_input, class_input])
    validity = discriminator([generated_data, class_input])
    model = Model(inputs=[noise_input, class_input], outputs=validity)
    return model


generator = create_generator()
# Create and compile the Discriminator
discriminator = create_discriminator()
discriminator.compile(loss='binary_crossentropy', optimizer=Adam())
gan = create_cgan(generator, discriminator)

# Ensure that only the generator is trained
discriminator.trainable = False

gan.compile(loss='binary_crossentropy', optimizer=Adam())

In [65]:
real_data.head(1)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0


In [None]:
# Train GAN
for step in range(TRAINING_STEPS):
    # Select a random batch of real data with labels
    idx = np.random.randint(0, real_data.shape[0], BATCH_SIZE)
    real_batch = real_data.iloc[idx].values
    labels_batch = one_hot_labels[idx]

    # Generate a batch of new data
    noise = np.random.normal(0, 1, (BATCH_SIZE, NOISE_DIM))
    generated_batch = generator.predict([noise, labels_batch])

    # Train the discriminator
    real_loss = discriminator.train_on_batch([real_batch, labels_batch], np.ones((BATCH_SIZE, 1)))
    fake_loss = discriminator.train_on_batch([generated_batch, labels_batch], np.zeros((BATCH_SIZE, 1)))
    discriminator_loss = 0.5 * np.add(real_loss, fake_loss)

    # Train the generator
    generator_loss = gan.train_on_batch([noise, labels_batch], np.ones((BATCH_SIZE, 1)))

    if step % 100 == 0:
        print(f"Step: {step}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}")


In [83]:
# Generate instances for a given class
def generate_data(generator, data_class, num_instances):
    one_hot_class = one_hot_encoder.transform(np.array([[data_class]]))
    noise = np.random.normal(0, 1, (num_instances, NOISE_DIM))
    generated_data = generator.predict([noise, np.repeat(one_hot_class, num_instances, axis=0)])
    return pd.DataFrame(generated_data,columns=iris.feature_names)

In [84]:
generated_data = generate_data(generator, 1, 40)
generated_data



Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0.386197,0.430119,0.386089,0.245277,0.234082,0.319339,0.050656,0.200825,0.389005,0.287857,...,0.31455,0.276671,0.229163,0.188168,0.449392,0.319546,0.446428,0.504266,0.293903,0.366978
1,0.369419,0.206298,0.352262,0.226191,0.240948,0.17422,0.039027,-0.108835,0.318523,0.151682,...,0.052576,0.273266,0.243587,0.089666,0.254069,0.121691,0.252544,0.340169,0.279351,0.122819
2,0.329358,0.441639,0.310998,0.288521,0.152576,0.256034,0.351475,0.297466,0.244187,0.452779,...,0.205266,0.361498,0.389768,0.294411,0.332537,0.185718,0.28122,0.533877,0.296112,0.085709
3,0.565993,0.066067,0.350571,0.216517,0.10598,0.229419,0.096651,-0.05036,0.42598,0.322431,...,0.067845,0.194928,0.107569,0.262432,0.151166,0.201589,0.287575,0.658456,-0.045699,0.114189
4,0.474208,0.28367,0.421432,0.250591,0.28774,0.293671,0.250811,0.390907,0.53908,0.227571,...,0.253736,0.233148,0.364838,0.314779,0.241509,0.051218,0.235337,0.520116,0.279817,0.23173
5,0.468026,0.223539,0.122982,0.20605,0.380983,0.358458,0.31078,0.284803,0.272772,0.461433,...,0.14719,0.37779,0.269374,0.34286,0.234561,0.065906,0.140395,0.37593,0.278314,0.108154
6,0.201679,0.381338,0.298234,0.213378,0.341426,0.403945,0.118311,0.18869,0.530882,0.198372,...,0.03398,0.337153,0.319736,-0.059347,0.176994,0.364039,0.270082,0.308225,0.127975,0.185774
7,0.315498,0.358282,0.249849,0.078857,0.210207,0.282692,0.230639,0.143786,0.396897,0.33536,...,0.384013,0.240521,0.179763,0.223506,0.305219,0.131533,0.235471,0.257324,0.100528,0.276636
8,0.32169,0.21913,0.257058,0.207339,0.166509,0.149121,0.083959,0.054531,0.158812,0.320911,...,0.19766,0.114888,0.280592,0.062215,0.39414,0.191486,0.207845,0.041907,0.2877,0.054814
9,0.242951,0.61382,0.551912,0.537407,0.394751,0.415202,0.28024,0.329586,0.237438,0.722003,...,0.298275,0.395009,0.335233,0.233243,0.50637,0.377747,0.574603,0.743009,0.386556,0.3183


In [85]:
# Generate 50 instances for each class
synthetic_data_class_0 = generate_data(generator, 0, 50)
synthetic_data_class_1 = generate_data(generator, 1, 50)

# Combine all synthetic data into a single DataFrame and apply inverse transform to bring it back to original scale
synthetic_data = pd.concat([synthetic_data_class_0, synthetic_data_class_1], ignore_index=True)
synthetic_data = pd.DataFrame(scaler.inverse_transform(synthetic_data),columns=iris.feature_names)

# Create corresponding class labels
synthetic_labels = [0]*50 + [1]*50 

# Add labels to the synthetic data
synthetic_data['class'] = synthetic_labels

# Save synthetic data as a CSV file
synthetic_data.to_csv(r'C:\Users\lia68085\test\synthetic_caneer_data.csv', index=False)





In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets

# Load the Iris dataset from sklearn
iris = datasets.load_breast_cancer()

real_data = pd.DataFrame(iris.data, columns=iris.feature_names)
real_data['class'] = iris.target
feature_names= iris.feature_names
# Load the synthetic dataset
synthetic_data = pd.read_csv(r'C:\Users\lia68085\test\synthetic_caneer_data.csv')

# For each feature, create a histogram for the real and synthetic data
for feature in feature_names:
    plt.figure(figsize=(10, 5))

    plt.subplot(1, 2, 1)
    plt.hist(real_data[feature], bins=20, alpha=0.5, color='g', label='Real')
    plt.title(f"Real Data: {feature}")
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.hist(synthetic_data[feature], bins=20, alpha=0.5, color='b', label='Synthetic')
    plt.title(f"Synthetic Data: {feature}")
    plt.legend()

    plt.show()
# Print the summary statistics for the real and synthetic data
print("Summary statistics for the real data:")
print(real_data.describe())
print("\nSummary statistics for the synthetic data:")
print(synthetic_data.describe())

# For each pair of features, create a scatter plot for the real and synthetic data
features = feature_names
for i in range(len(features)):
    for j in range(i+1, len(features)):
        plt.figure(figsize=(10, 5))

        plt.subplot(1, 2, 1)
        plt.scatter(real_data[features[i]], real_data[features[j]], alpha=0.5, color='g')
        plt.title(f"Real Data: {features[i]} vs {features[j]}")

        plt.subplot(1, 2, 2)
        plt.scatter(synthetic_data[features[i]], synthetic_data[features[j]], alpha=0.5, color='b')
        plt.title(f"Synthetic Data: {features[i]} vs {features[j]}")

        plt.show()


In [88]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the synthetic dataset
synthetic_data = pd.read_csv(r'C:\Users\lia68085\test\synthetic_caneer_data.csv')


# Separate the features and the target variable
X = synthetic_data.drop(columns=['class'])
y = synthetic_data['class']

# Split the dataset into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a K-Nearest Neighbors model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred = knn.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[9 3]
 [1 7]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.75      0.82        12
           1       0.70      0.88      0.78         8

    accuracy                           0.80        20
   macro avg       0.80      0.81      0.80        20
weighted avg       0.82      0.80      0.80        20



In [89]:
from sklearn.metrics import accuracy_score


# Load the Iris dataset from sklearn
iris = datasets.load_breast_cancer()
real_data = pd.DataFrame(iris.data, columns=iris.feature_names)
real_labels = iris.target

# Standardize the real data
real_data = scaler.transform(real_data)

# Use the trained KNN model to make predictions on the real data
real_pred = knn.predict(real_data)

# Calculate the accuracy of the model on the real data
accuracy = accuracy_score(real_labels, real_pred)
print("Accuracy of the model on the real data: ", accuracy)

Accuracy of the model on the real data:  0.7785588752196837


In [90]:
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(real_data, real_labels, test_size=0.2, random_state=42)

# Standardize the feature values
scaler_real = StandardScaler()
X_train_real = scaler_real.fit_transform(X_train_real)
X_test_real = scaler_real.transform(X_test_real)

# Train a K-Nearest Neighbors model on the real data
knn_real = KNeighborsClassifier(n_neighbors=3)
knn_real.fit(X_train_real, y_train_real)

# Make predictions on the test dataset
y_pred_real = knn_real.predict(X_test_real)

# Evaluate the model
accuracy_real = accuracy_score(y_test_real, y_pred_real)
print("Accuracy of the model trained on real data: ", accuracy_real)

Accuracy of the model trained on real data:  0.9473684210526315


In [91]:
# Make predictions on the real test data with the model trained on synthetic data
y_pred_synthetic_model = knn.predict(X_test_real)

# Evaluate the model trained on synthetic data
accuracy_synthetic_model = accuracy_score(y_test_real, y_pred_synthetic_model)
print("Accuracy of the model trained on synthetic data: ", accuracy_synthetic_model)

# Make predictions on the real test data with the model trained on real data
y_pred_real_model = knn_real.predict(X_test_real)

# Evaluate the model trained on real data
accuracy_real_model = accuracy_score(y_test_real, y_pred_real_model)
print("Accuracy of the model trained on real data: ", accuracy_real_model)

Accuracy of the model trained on synthetic data:  0.868421052631579
Accuracy of the model trained on real data:  0.9473684210526315
