<a href="https://colab.research.google.com/github/housemLassoued/ML-deployment/blob/main/tabular_data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
try:
    from google.colab import drive
    COLAB = True
    print("Note: using Google CoLab")
    %tensorflow_version 2.x
except:
    print("Note: not using Google CoLab")
    COLAB = False

Note: using Google CoLab
Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [None]:
# Charger les données
df = pd.read_csv("https://data.heatonresearch.com/data/t81-558/auto-mpg.csv", na_values=['NA', '?'])
display(df)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger


In [None]:
# Utiliser seulement les colonnes nécessaires
#drop la colonne name
COLS_USED = ['cylinders', 'displacement', 'horsepower', 'weight',
             'acceleration', 'year', 'origin', 'mpg']
df = df[COLS_USED]
display(df)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg
0,8,307.0,130.0,3504,12.0,70,1,18.0
1,8,350.0,165.0,3693,11.5,70,1,15.0
2,8,318.0,150.0,3436,11.0,70,1,18.0
3,8,304.0,150.0,3433,12.0,70,1,16.0
4,8,302.0,140.0,3449,10.5,70,1,17.0
...,...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1,27.0
394,4,97.0,52.0,2130,24.6,82,2,44.0
395,4,135.0,84.0,2295,11.6,82,1,32.0
396,4,120.0,79.0,2625,18.6,82,1,28.0


In [None]:
# Gérer les valeurs manquantes
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())


In [None]:
# Séparer les données en ensembles d'entraînement et de test: 80% training 20% testing
x_train, x_test, y_train, y_test = train_test_split(
    df.drop("mpg", axis=1),
    df["mpg"],
    test_size=0.20,
    random_state=42,
)
display(x_train)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin
3,8,304.0,150.0,3433,12.0,70,1
18,4,97.0,88.0,2130,14.5,70,3
376,4,91.0,68.0,2025,18.2,82,3
248,4,91.0,60.0,1800,16.4,78,3
177,4,115.0,95.0,2694,15.0,75,2
...,...,...,...,...,...,...,...
71,3,70.0,97.0,2330,13.5,72,3
106,8,350.0,180.0,4499,12.5,73,1
270,4,134.0,95.0,2515,14.8,78,3
348,4,89.0,62.0,2050,17.3,81,3


In [None]:
# Normaliser les données
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

x_train = scaler_x.fit_transform(x_train)
x_test = scaler_x.transform(x_test)
y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test = scaler_y.transform(y_test.values.reshape(-1, 1))
display(x_train)


array([[1.        , 0.60981912, 0.58100559, ..., 0.23809524, 0.        ,
        0.        ],
       [0.2       , 0.0749354 , 0.23463687, ..., 0.38690476, 0.        ,
        1.        ],
       [0.2       , 0.05943152, 0.12290503, ..., 0.60714286, 1.        ,
        1.        ],
       ...,
       [0.2       , 0.17054264, 0.27374302, ..., 0.4047619 , 0.66666667,
        1.        ],
       [0.2       , 0.05426357, 0.08938547, ..., 0.55357143, 0.91666667,
        1.        ],
       [0.2       , 0.0749354 , 0.        , ..., 0.77380952, 0.25      ,
        0.5       ]])

In [None]:
# Construire et entraîner le modèle de régression
regression_model = Sequential()
regression_model.add(Dense(50, input_dim=x_train.shape[1], activation='relu')) # imput layer  dimension de l'entree egale a nombre des lignes de x_train
regression_model.add(Dense(25, activation='relu'))
regression_model.add(Dense(12, activation='relu'))
regression_model.add(Dense(1))
regression_model.compile(loss='mean_squared_error', optimizer='adam')

monitor = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-3,
                                           patience=5, verbose=1, mode='auto',
                                           restore_best_weights=True)
regression_model.fit(x_train, y_train, validation_data=(x_test, y_test),
                     callbacks=[monitor], verbose=2, epochs=1000)

Epoch 1/1000
10/10 - 1s - loss: 0.1325 - val_loss: 0.0627 - 1s/epoch - 124ms/step
Epoch 2/1000
10/10 - 0s - loss: 0.0486 - val_loss: 0.0185 - 76ms/epoch - 8ms/step
Epoch 3/1000
10/10 - 0s - loss: 0.0198 - val_loss: 0.0125 - 71ms/epoch - 7ms/step
Epoch 4/1000
10/10 - 0s - loss: 0.0164 - val_loss: 0.0107 - 73ms/epoch - 7ms/step
Epoch 5/1000
10/10 - 0s - loss: 0.0134 - val_loss: 0.0088 - 60ms/epoch - 6ms/step
Epoch 6/1000
10/10 - 0s - loss: 0.0120 - val_loss: 0.0087 - 68ms/epoch - 7ms/step
Epoch 7/1000
10/10 - 0s - loss: 0.0110 - val_loss: 0.0078 - 65ms/epoch - 7ms/step
Epoch 8/1000
10/10 - 0s - loss: 0.0099 - val_loss: 0.0072 - 76ms/epoch - 8ms/step
Epoch 9/1000
10/10 - 0s - loss: 0.0092 - val_loss: 0.0068 - 69ms/epoch - 7ms/step
Epoch 10/1000
10/10 - 0s - loss: 0.0088 - val_loss: 0.0067 - 59ms/epoch - 6ms/step
Epoch 11/1000
10/10 - 0s - loss: 0.0082 - val_loss: 0.0065 - 54ms/epoch - 5ms/step
Epoch 12/1000
10/10 - 0s - loss: 0.0080 - val_loss: 0.0064 - 69ms/epoch - 7ms/step
Epoch 13/1000

<keras.src.callbacks.History at 0x79238b4db130>

In [None]:
# Évaluer le modèle de régression
pred = regression_model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(scaler_y.inverse_transform(pred), scaler_y.inverse_transform(y_test)))
print("Final score (RMSE) for regression model: {}".format(score))

Final score (RMSE) for regression model: 3.198201361718504


In [None]:
# Définir le modèle du générateur
def build_generator(latent_dim, output_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=latent_dim))# input layer
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(256))# layer 1
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(512))# layer 2
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(output_dim, activation='tanh'))# output layer
    return model

In [None]:
# Définir le modèle du discriminateur
def build_discriminator(input_dim):
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))
    return model


In [None]:
# Paramètres du GAN
latent_dim = 100
input_dim = x_train.shape[1]


In [None]:
# Créer et compiler le discriminateur
discriminator = build_discriminator(input_dim)
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5), metrics=['accuracy'])


In [None]:
# Créer le générateur
generator = build_generator(latent_dim, input_dim)

In [None]:
# Créer le GAN en combinant générateur et discriminateur
z = tf.keras.layers.Input(shape=(latent_dim,))
generated_data = generator(z)
discriminator.trainable = False
validity = discriminator(generated_data)

gan = tf.keras.models.Model(z, validity)
gan.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))

In [None]:
# Entraîner le GAN
epochs = 10000
batch_size = 64
sample_interval = 1000


In [None]:
# Étiquettes pour les vraies et fausses données
real = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))


In [None]:
for epoch in range(epochs):
    # Sélectionner un batch de vraies données
    idx = np.random.randint(0, x_train.shape[0], batch_size)
    real_data = x_train[idx]

    # Générer un batch de fausses données
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    generated_data = generator.predict(noise)

    # Entraîner le discriminateur
    d_loss_real = discriminator.train_on_batch(real_data, real)
    d_loss_fake = discriminator.train_on_batch(generated_data, fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Entraîner le générateur
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    g_loss = gan.train_on_batch(noise, real)

    # Afficher les progrès
    if epoch % sample_interval == 0:
        print(f"{epoch} [D loss: {d_loss[0]}] [D accuracy: {d_loss[1]*100}] [G loss: {g_loss}]")


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
6000 [D loss: 0.6669841408729553] [D accuracy: 57.8125] [G loss: 0.9674615859985352]
7000 [D loss: 0.6030295789241791] [D accuracy: 67.96875] [G loss: 1.0571634769439697]
8000 [D loss: 0.5144566595554352] [D accuracy: 77.34375] [G loss: 1.1080008745193481]
9000 [D loss: 0.5049051642417908] [D accuracy: 82.8125] [G loss: 1.9299260377883911]


In [None]:
# Générer des données synthétiques
noise = np.random.normal(0, 1, (x_test.shape[0], latent_dim))
gen_data = generator.predict(noise)
gen_data = scaler_x.inverse_transform(gen_data)



In [None]:
# Convertir en DataFrame
gen_data_df = pd.DataFrame(gen_data, columns=df.drop("mpg", axis=1).columns)


In [None]:
# Afficher un aperçu des données synthétiques générées
print("Exemple de données synthétiques générées:")

display(gen_data_df)

Exemple de données synthétiques générées:


Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,4.254925,90.813820,64.817451,2127.452881,18.343023,77.778275,2.383513
1,5.057703,98.623100,98.064217,2138.741943,13.563382,74.619446,2.932954
2,4.335258,92.977257,74.024559,2028.981689,15.503490,75.004776,2.682334
3,7.821858,382.247070,158.614899,4653.008301,14.858366,76.452110,1.030343
4,6.863536,247.134674,103.361816,3239.731934,14.511620,77.682487,0.999988
...,...,...,...,...,...,...,...
75,4.868673,111.529945,84.291618,2345.062012,14.666958,80.891029,2.750847
76,3.976996,80.199478,79.264870,2019.867188,15.180943,72.498177,2.112638
77,7.167943,289.986908,109.077705,3775.526123,16.119358,78.443405,0.969130
78,4.083072,95.845093,68.393005,2017.217896,18.226730,74.369408,1.062052


In [None]:
# Prédire les cibles synthétiques en utilisant le modèle de régression
gen_targets_pred = regression_model.predict(gen_data)
gen_targets_pred = scaler_y.inverse_transform(gen_targets_pred)

# Générer des cibles synthétiques
gen_targets = regression_model.predict(gen_data)
gen_targets = scaler_y.inverse_transform(gen_targets)

# Calculer le RMSE entre les cibles synthétiques générées et prédites
gen_score = np.sqrt(metrics.mean_squared_error(gen_targets, gen_targets_pred))
print("RMSE for synthetic data: {}".format(gen_score))

RMSE for synthetic data: 0.0
