 # Optimaalinen arkkitehtuuri täytyy löytää sovellus- ja datakohtaisesti. Tavoitteena on malli, joka suoriutuu hyvin ennestään tuntemattomalla datalla – ei pelkästään opetusdatalla. Oppimisprosessiin voidaan vaikuttaa hyperparametreilla, esimerkiksi oppimisnopeudella, batch-koolla sekä epochien ja kerrosten lukumäärällä. Grid search on yksi tekniikka, jolla voidaan hakea mallin optimointiin sopivat hyperparametrit.

In [1]:
import os
import pandas as pd
import numpy as np
import shutil

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout, GaussianNoise
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime

# Otetaan käyttöön df_train_full ja df_test_eval, jotka on esikäsitelty valmiiksi data_preprocessing-tiedostossa.

In [2]:
# Load the preprocessed training set and the held-out evaluation set from
# the project's GitHub repository; the first CSV column is used as the
# DataFrame index.
df_train_full = pd.read_csv(
    "https://raw.githubusercontent.com/hetasks/loppuprojektiSyvaOppiminen/main/df_train_full.csv",
    sep=",",
    index_col=0,
)
df_test_eval = pd.read_csv(
    "https://raw.githubusercontent.com/hetasks/loppuprojektiSyvaOppiminen/main/df_test_eval.csv",
    sep=",",
    index_col=0,
)

In [3]:
df_train_full

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status;
1923,60,2.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,40,1.0,1.0,20,2,100,0.0
2239,63,1.0,4.0,2.0,0.0,2.0,1.0,3.0,1.0,60,1.0,1.0,7,3,41,0.0
3793,60,2.0,0.0,1.0,0.0,1.0,1.0,3.0,1.0,25,1.0,1.0,18,1,94,0.0
3594,60,2.0,0.0,2.0,1.0,2.0,1.0,3.0,1.0,90,0.0,0.0,12,4,12,1.0
1483,66,2.0,1.0,3.0,1.0,3.0,0.0,2.0,0.0,30,1.0,1.0,12,8,90,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,49,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,16,1.0,1.0,4,2,62,0.0
1434,55,2.0,1.0,2.0,0.0,2.0,0.0,2.0,1.0,63,1.0,0.0,14,1,64,0.0
667,57,2.0,1.0,2.0,0.0,2.0,0.0,2.0,1.0,51,1.0,1.0,19,2,68,0.0
2500,62,0.0,0.0,1.0,1.0,2.0,1.0,3.0,1.0,40,1.0,1.0,12,7,46,1.0


In [4]:
df_test_eval

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status;
169,53,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,13,1.0,1.0,14,1,100,0.0
1689,43,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,1,1.0,1.0,12,1,106,0.0
2403,62,2.0,1.0,0.0,0.0,0.0,3.0,1.0,1.0,15,1.0,1.0,5,1,67,0.0
1965,56,2.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,48,1.0,1.0,3,1,41,0.0
85,46,2.0,1.0,1.0,2.0,4.0,1.0,3.0,1.0,40,0.0,0.0,12,11,71,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2566,60,2.0,1.0,1.0,0.0,1.0,3.0,1.0,1.0,30,1.0,1.0,17,3,61,0.0
3996,69,2.0,1.0,0.0,1.0,2.0,0.0,2.0,1.0,19,1.0,1.0,21,9,88,1.0
2590,63,2.0,1.0,1.0,2.0,4.0,1.0,3.0,1.0,29,1.0,1.0,14,10,67,0.0
882,42,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,9,1.0,1.0,6,1,70,0.0


 # Otetaan käyttöön oppimiskäyrien piirtofunktio:

In [5]:
def learning_curves(history, i, accuracy, pdf):
    """Draw loss and accuracy curves for one configuration onto a new PDF page.

    Args:
        history: Object exposing ``history`` (a dict holding the 'loss',
            'val_loss', 'accuracy' and 'val_accuracy' series) and ``epoch``
            (the zero-based epoch indices), such as the value returned by
            ``model.fit``.
        i: Configuration number, shown in the figure title.
        accuracy: Evaluation accuracy as a fraction; the title shows it
            multiplied by 100.
        pdf: Object with a ``savefig(fig)`` method that receives the figure.
    """
    fig = plt.figure(figsize=(10, 4))
    series = history.history
    # Shift to 1-based epoch numbers for the x axis.
    epoch_numbers = [epoch + 1 for epoch in history.epoch]

    # (subplot position, train key, train label, val key, val label, title, y label)
    panels = (
        (1, 'loss', "Opetusvirhe", 'val_loss', "Validointivirhe",
         'Opetus- ja validointivirhe', 'Loss'),
        (2, 'accuracy', "Opetustarkkuus", 'val_accuracy', "Validointitarkkuus",
         'Opetus- ja validointitarkkuus', 'Accuracy'),
    )
    for position, train_key, train_label, val_key, val_label, title, y_label in panels:
        ax = fig.add_subplot(1, 2, position)
        ax.plot(epoch_numbers, series[train_key], label=train_label)
        ax.plot(epoch_numbers, series[val_key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(y_label)
        ax.legend()

    fig.suptitle(f"cfg nro. {i}: >>> {accuracy*100:.5f}")
    fig.tight_layout()
    pdf.savefig(fig)
    # Close explicitly so figures do not accumulate across configurations.
    plt.close(fig)

 # Tehdään sanakirja, jonka avaimiksi tulevat hyperparametrit ja muut asetukset, joihin halutaan vaikuttaa. Seuraavaksi tuodaan ParameterGrid, jonka avulla sanakirjan kaikki eri kombinaatiot saadaan tulostettua listana.
 

In [6]:
# Search space for the grid search: every key maps to the list of values to
# try, and each combination of values becomes one configuration.
param_grid = dict(
    layers=[[15, 11], [30, 10]],    # unit counts of the hidden Dense layers
    reg=[None, 0.001],              # l2 factor for the hidden kernels; None disables it
    dout=[[0.0, 0.0], [0.1, 0.0]],  # dropout rates: [before each hidden layer, before the output]
    act=['relu', 'elu'],            # hidden-layer activation
    init=['he_uniform'],            # hidden-layer kernel initializer
    noise=[0.0],                    # argument passed to GaussianNoise on the inputs
    test_pct=[0.20],                # share of the training data held out for validation
    lr=[0.001, 0.0001],             # Adam learning rate
    epochs=[50],                    # epochs per configuration
    b_size=[12],                    # batch size
    num_classes=[1],                # units in the sigmoid output layer
)

In [7]:
from sklearn.model_selection import ParameterGrid

In [8]:
list(ParameterGrid(param_grid))

[{'act': 'relu',
  'b_size': 12,
  'dout': [0.0, 0.0],
  'epochs': 50,
  'init': 'he_uniform',
  'layers': [15, 11],
  'lr': 0.001,
  'noise': 0.0,
  'num_classes': 1,
  'reg': None,
  'test_pct': 0.2},
 {'act': 'relu',
  'b_size': 12,
  'dout': [0.0, 0.0],
  'epochs': 50,
  'init': 'he_uniform',
  'layers': [15, 11],
  'lr': 0.001,
  'noise': 0.0,
  'num_classes': 1,
  'reg': 0.001,
  'test_pct': 0.2},
 {'act': 'relu',
  'b_size': 12,
  'dout': [0.0, 0.0],
  'epochs': 50,
  'init': 'he_uniform',
  'layers': [15, 11],
  'lr': 0.0001,
  'noise': 0.0,
  'num_classes': 1,
  'reg': None,
  'test_pct': 0.2},
 {'act': 'relu',
  'b_size': 12,
  'dout': [0.0, 0.0],
  'epochs': 50,
  'init': 'he_uniform',
  'layers': [15, 11],
  'lr': 0.0001,
  'noise': 0.0,
  'num_classes': 1,
  'reg': 0.001,
  'test_pct': 0.2},
 {'act': 'relu',
  'b_size': 12,
  'dout': [0.0, 0.0],
  'epochs': 50,
  'init': 'he_uniform',
  'layers': [30, 10],
  'lr': 0.001,
  'noise': 0.0,
  'num_classes': 1,
  'reg': None,
 

# Tarkistetaan myös listan pituus eli konfiguraatioiden lukumäärä.

In [9]:
len(list(ParameterGrid(param_grid)))

32

# Tehdään konfiguraatiot -tietorakenne.

In [10]:
# Number the configurations from 1 so each can be referred to by its index
# in logs, plot titles and saved file names.
cfgs = dict(enumerate(ParameterGrid(param_grid), start=1))
for key, value in cfgs.items():
    print(key, value)

1 {'act': 'relu', 'b_size': 12, 'dout': [0.0, 0.0], 'epochs': 50, 'init': 'he_uniform', 'layers': [15, 11], 'lr': 0.001, 'noise': 0.0, 'num_classes': 1, 'reg': None, 'test_pct': 0.2}
2 {'act': 'relu', 'b_size': 12, 'dout': [0.0, 0.0], 'epochs': 50, 'init': 'he_uniform', 'layers': [15, 11], 'lr': 0.001, 'noise': 0.0, 'num_classes': 1, 'reg': 0.001, 'test_pct': 0.2}
3 {'act': 'relu', 'b_size': 12, 'dout': [0.0, 0.0], 'epochs': 50, 'init': 'he_uniform', 'layers': [15, 11], 'lr': 0.0001, 'noise': 0.0, 'num_classes': 1, 'reg': None, 'test_pct': 0.2}
4 {'act': 'relu', 'b_size': 12, 'dout': [0.0, 0.0], 'epochs': 50, 'init': 'he_uniform', 'layers': [15, 11], 'lr': 0.0001, 'noise': 0.0, 'num_classes': 1, 'reg': 0.001, 'test_pct': 0.2}
5 {'act': 'relu', 'b_size': 12, 'dout': [0.0, 0.0], 'epochs': 50, 'init': 'he_uniform', 'layers': [30, 10], 'lr': 0.001, 'noise': 0.0, 'num_classes': 1, 'reg': None, 'test_pct': 0.2}
6 {'act': 'relu', 'b_size': 12, 'dout': [0.0, 0.0], 'epochs': 50, 'init': 'he_uni

# Otetaan käyttöön aikaleima, jolla tulokansio nimetään.

In [11]:
# Minute-resolution timestamp (e.g. 20231019T0938) that names this run's
# output directory.
# NOTE(review): the name `time` would shadow the stdlib `time` module if that
# is ever imported in this notebook.
time = f"{datetime.now():%Y%m%dT%H%M}"
models_path = "gs_dnn_" + time

# Alustetaan polku ja samoin pdf

In [12]:
# Create the run's output directory. exist_ok=True replaces the separate
# existence check, so there is no gap between checking and creating.
os.makedirs(models_path, exist_ok=True)

# Multi-page PDF that collects one page of learning curves per configuration.
pdf = PdfPages(os.path.join(models_path, "learning_curves.pdf"))

# Määritellään apufunktiot ennen grid_search-funktiota.

In [13]:
def train_model(df_train_full, cfg, models_path):
    """Train one model for a configuration and restore its best weights.

    The data is split into training and validation parts, a model is built
    and compiled with Adam (amsgrad) and binary cross-entropy, and training
    is logged to ``training.log`` in ``models_path``. After training, the
    weights from the epoch with the highest validation accuracy are loaded
    back into the model.

    Args:
        df_train_full: DataFrame whose first 15 columns are the features and
            whose column at position 15 is the target.
        cfg: Configuration dict; this function reads 'test_pct', 'lr',
            'epochs' and 'b_size' and passes the dict on to ``build_model``.
        models_path: Directory that receives ``training.log`` and a
            temporary ``tmp`` checkpoint directory.

    Returns:
        Tuple ``(model, history)``: the trained model and the object returned
        by ``model.fit``.
    """
    df_train, df_val = split_data(df_train_full, cfg['test_pct'])

    # Columns 0..14 are the features, column 15 is the label.
    X_train = df_train.iloc[:, 0:15].to_numpy()
    y_train = df_train.iloc[:, 15].to_numpy()

    X_val = df_val.iloc[:, 0:15].to_numpy()
    y_val = df_val.iloc[:, 15].to_numpy()

    model = build_model(X_train.shape[1], cfg)

    optimizer = Adam(amsgrad=True, learning_rate=cfg['lr'])
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    log_path = os.path.join(models_path, 'training.log')
    checkpoint_dir = os.path.join(models_path, 'tmp')
    checkpoint_filepath = os.path.join(checkpoint_dir, 'checkpoint')

    cbs_list = [
        # save_best_only=True is required for the checkpoint to hold the
        # best-val_accuracy weights; without it every epoch overwrites the
        # file and load_weights below restores the last epoch instead.
        ModelCheckpoint(
            checkpoint_filepath,
            monitor='val_accuracy',
            mode='max',
            save_best_only=True,
            save_weights_only=True,
            verbose=0,
        ),
        CSVLogger(log_path, append=True),
    ]

    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=cfg['epochs'],
        batch_size=cfg['b_size'],
        callbacks=cbs_list,
        verbose=0,
    )

    # Blank line separating this configuration's rows from the next one's.
    with open(log_path, 'a') as log_file:
        log_file.write("\n")

    model.load_weights(checkpoint_filepath)
    # The checkpoint is only needed to restore the weights; remove it so the
    # run directory holds final artefacts only.
    shutil.rmtree(checkpoint_dir)

    return model, history

# Jaetaan data

In [14]:
def split_data(df, test_pct, target_col='Status;', random_state=None):
    """Split a DataFrame into stratified training and validation parts.

    Args:
        df: DataFrame to split; must contain ``target_col``.
        test_pct: Value passed to ``train_test_split`` as ``test_size``.
        target_col: Name of the label column used for stratification.
            Defaults to 'Status;', the label column of this dataset.
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the original unseeded behaviour; pass an
            int to give every configuration the same split.

    Returns:
        Tuple ``(df_train, df_val)``.
    """
    df_train, df_val = train_test_split(
        df,
        test_size=test_pct,
        stratify=df[target_col],
        random_state=random_state,
    )
    return df_train, df_val

In [15]:
def build_model(input_shape, cfg):
    """Assemble the uncompiled feed-forward Keras model described by ``cfg``.

    Args:
        input_shape: Number of input features.
        cfg: Configuration dict; reads 'reg', 'noise', 'layers', 'dout',
            'act', 'init' and 'num_classes'.

    Returns:
        A ``Model`` mapping the input tensor to a sigmoid output layer with
        ``cfg['num_classes']`` units.
    """
    # Optional l2 penalty on the hidden-layer kernels.
    if cfg['reg'] is None:
        kernel_penalty = None
    else:
        kernel_penalty = l2(cfg['reg'])

    inputs = Input(shape=(input_shape,))
    tensor = GaussianNoise(cfg['noise'])(inputs)

    # NOTE(review): every hidden layer uses dout[0] and only the output uses
    # dout[1]; confirm this is intended rather than one rate per hidden layer.
    for units in cfg['layers']:
        tensor = Dropout(cfg['dout'][0])(tensor)
        hidden = Dense(
            units,
            activation=cfg['act'],
            kernel_initializer=cfg['init'],
            kernel_regularizer=kernel_penalty,
        )
        tensor = hidden(tensor)

    tensor = Dropout(cfg['dout'][1])(tensor)

    head = Dense(cfg['num_classes'], activation="sigmoid")
    return Model(inputs, head(tensor))

In [16]:
def evaluate_model(model, df_test_eval):
    """Evaluate ``model`` on the evaluation set, print and return its accuracy.

    Args:
        model: Object whose ``evaluate(X, y, verbose=0)`` returns a
            two-element sequence with the accuracy in second place.
        df_test_eval: DataFrame whose first 15 columns are the features and
            whose column at position 15 is the target.

    Returns:
        The accuracy value reported by ``model.evaluate``.
    """
    features = df_test_eval.iloc[:, 0:15]
    labels = df_test_eval.iloc[:, 15]

    results = model.evaluate(features.to_numpy(), labels.to_numpy(), verbose=0)
    _loss, accuracy = results  # the loss is not needed here

    print(f"Evaluation accuracy: {accuracy*100:.3f}")
    return accuracy

# grid_search-funktio saa parametreina opetusdatan, testidatan, konfiguraatiot, polun ja pdf:n. Muuttujaan "scores" kerätään parhaiden konfiguraatioiden numero, tarkkuus ja malli.

In [17]:
def grid_search(df_train_full, df_test_eval, cfgs, models_path, pdf, top_n=5):
    """Train and evaluate every configuration, keeping the best ``top_n``.

    For each configuration the function logs it to ``training.log``, trains
    a model, evaluates it on ``df_test_eval``, adds a learning-curve page to
    ``pdf`` and records the configuration in ``all_configs.txt``.

    Args:
        df_train_full: Training DataFrame passed to ``train_model``.
        df_test_eval: Evaluation DataFrame passed to ``evaluate_model``.
        cfgs: Dict mapping configuration number to configuration dict.
        models_path: Existing directory for the log and config files.
        pdf: Object with ``savefig``/``close`` that collects the plots; it is
            closed before this function returns, even on failure.
        top_n: How many of the best-scoring models to keep (default 5).

    Returns:
        List of ``(config number, accuracy, model)`` tuples, sorted by
        accuracy in descending order and holding at most ``top_n`` entries.
    """
    log_path = os.path.join(models_path, 'training.log')
    configs_path = os.path.join(models_path, "all_configs.txt")

    scores = []
    print(f"Yhteensä {len(cfgs)} konfiguraatiota, aloitetaan... {models_path}")
    try:
        for i, cfg in cfgs.items():
            # Real newlines: the previous "\\n\\n" wrote a literal
            # backslash-n into the files instead of a blank line.
            with open(log_path, 'a') as log_file:
                log_file.write(f"Nykyinen konfiguraatio {i}: {cfg} \n\n")

            model, history = train_model(df_train_full, cfg, models_path)
            accuracy = evaluate_model(model, df_test_eval)
            learning_curves(history, i, accuracy, pdf)
            scores.append((i, accuracy, model))

            with open(configs_path, 'a') as configs_file:
                configs_file.write(f"{i}: {cfg} \n\n")

            # Drop the weakest entry as soon as the list outgrows top_n so
            # that only top_n models are kept in memory at a time.
            if len(scores) > top_n:
                scores.sort(key=lambda tup: tup[1], reverse=True)
                del scores[-1]
    finally:
        # Finalise the PDF even if a configuration fails midway.
        pdf.close()

    # Also sort when fewer than top_n + 1 configurations were run; the sort
    # is stable, so an already sorted list is left unchanged.
    scores.sort(key=lambda tup: tup[1], reverse=True)
    return scores

In [18]:
# Run the grid search over all configurations; `scores` receives the list of
# (config number, accuracy, model) tuples that grid_search returns.
scores = grid_search(df_train_full, df_test_eval, cfgs, models_path, pdf)

Yhteensä 32 konfiguraatiota, aloitetaan... gs_dnn_20231019T0938
Evaluation accuracy: 88.557
Evaluation accuracy: 90.299
Evaluation accuracy: 90.050
Evaluation accuracy: 90.547
Evaluation accuracy: 89.801
Evaluation accuracy: 90.299
Evaluation accuracy: 90.050
Evaluation accuracy: 89.552
Evaluation accuracy: 89.055
Evaluation accuracy: 89.801
Evaluation accuracy: 90.050
Evaluation accuracy: 85.075
Evaluation accuracy: 90.547
Evaluation accuracy: 90.050
Evaluation accuracy: 89.055
Evaluation accuracy: 91.045
Evaluation accuracy: 91.542
Evaluation accuracy: 90.547
Evaluation accuracy: 88.806
Evaluation accuracy: 89.552
Evaluation accuracy: 90.547
Evaluation accuracy: 89.801
Evaluation accuracy: 90.050
Evaluation accuracy: 90.050
Evaluation accuracy: 89.801
Evaluation accuracy: 89.552
Evaluation accuracy: 85.075
Evaluation accuracy: 87.313
Evaluation accuracy: 90.796
Evaluation accuracy: 90.299
Evaluation accuracy: 86.567
Evaluation accuracy: 86.567


In [19]:
os.listdir(models_path)

['all_configs.txt', 'learning_curves.pdf', 'training.log']

In [20]:
# Save every model kept in `scores` into the run directory; the file name
# carries the configuration number and the accuracy as a percentage.
for i, accuracy, model in scores:
        print(i, accuracy, model)
        filepath = os.path.sep.join([models_path, f'{i}_acc_{accuracy*100:.5f}.hdf5'])
        model.save(filepath)

17 0.9154228568077087 <keras.engine.functional.Functional object at 0x000001C0724622B0>
16 0.9104477763175964 <keras.engine.functional.Functional object at 0x000001BFDCE8BC40>
29 0.9079601764678955 <keras.engine.functional.Functional object at 0x000001C083F59700>
4 0.9054726362228394 <keras.engine.functional.Functional object at 0x000001C0647838E0>
13 0.9054726362228394 <keras.engine.functional.Functional object at 0x000001BFC25DEB50>


In [21]:
os.listdir(models_path)

['13_acc_90.54726.hdf5',
 '16_acc_91.04478.hdf5',
 '17_acc_91.54229.hdf5',
 '29_acc_90.79602.hdf5',
 '4_acc_90.54726.hdf5',
 'all_configs.txt',
 'learning_curves.pdf',
 'training.log']

In [22]:
# Load each .hdf5 file found in the run directory and print its path together
# with the resulting model object.
for file in os.listdir(models_path):
        if file.endswith('.hdf5'):
            filename = os.path.sep.join([models_path, file])
            model = load_model(filename)
            print(filename, model)

gs_dnn_20231019T0938\13_acc_90.54726.hdf5 <keras.engine.functional.Functional object at 0x000001C08135E730>
gs_dnn_20231019T0938\16_acc_91.04478.hdf5 <keras.engine.functional.Functional object at 0x000001C081368550>
gs_dnn_20231019T0938\17_acc_91.54229.hdf5 <keras.engine.functional.Functional object at 0x000001C08168AA00>
gs_dnn_20231019T0938\29_acc_90.79602.hdf5 <keras.engine.functional.Functional object at 0x000001BFC3B867F0>
gs_dnn_20231019T0938\4_acc_90.54726.hdf5 <keras.engine.functional.Functional object at 0x000001C07F9676D0>
