# HYPERPARAMETER TUNING FOR ANN

### IMPORT PACKAGES & FUNCTIONS

In [None]:
import re
import numpy as np
import pandas as pd
from nltk import WordNetLemmatizer
import keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout,Embedding, Bidirectional
from nltk.corpus import stopwords
!pip install keras_tuner
from keras_tuner import BayesianOptimization
import nltk 
from nltk import download
download('wordnet')
download('omw-1.4')
download('stopwords')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_tuner
  Downloading keras_tuner-1.1.3-py3-none-any.whl (135 kB)
[K     |████████████████████████████████| 135 kB 4.8 MB/s 
[?25hCollecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 43.7 MB/s 
Installing collected packages: jedi, kt-legacy, keras-tuner
Successfully installed jedi-0.18.1 keras-tuner-1.1.3 kt-legacy-1.0.4


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def preprocessing(text):
    """Normalise a piece of text for tokenisation.

    The text is lower-cased, every character other than digits, lowercase
    ASCII letters and ``_ + - *`` is replaced by a space, each remaining
    token is lemmatised with WordNet, and tokens whose lemma is an English
    stopword are dropped.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    text = text.lower()
    # Anything outside [0-9a-z_+-*] becomes a space, so split() below
    # tokenises on the removed punctuation as well as on whitespace.
    text_cleaned = re.sub(r'[^0-9a-z_+\-*]', ' ', text).strip()
    lemm = WordNetLemmatizer()
    # Load the stopword list once per call and keep it in a set: the list
    # is loop-invariant, and set membership avoids a linear scan per token.
    stop_words = set(stopwords.words('english'))
    title = []
    for token in text_cleaned.split():
        token_lemm = lemm.lemmatize(token)
        if token_lemm not in stop_words:
            # Reuse the lemma computed above instead of lemmatising twice.
            title.append(token_lemm)
    return ' '.join(title)

## IMPORT DATA FROM LOCAL FILES
Files to upload:
- test_category.csv
- test_data.csv
- train_category.csv
- train_data.csv

In [None]:
# Google Colab only: interactively upload the four CSV files listed above.
# The result of files.upload() is kept in `uploaded`; the later cells read
# the files back from disk under /content instead of using this variable.
from google.colab import files
uploaded = files.upload()

Saving test_category.csv to test_category.csv
Saving test_data.csv to test_data.csv
Saving train_category.csv to train_category.csv
Saving train_data.csv to train_data.csv


In [None]:
# Read the four uploaded CSV files into DataFrames (same order as before:
# train/test headlines first, then train/test categories).
train_data_df, test_data_df, train_label_df, test_label_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/train_data.csv',
        '/content/test_data.csv',
        '/content/train_category.csv',
        '/content/test_category.csv',
    )
)

In [None]:
# Pull out the text column ('headline') and the target column ('labels')
# for each split.
train_data, train_label = train_data_df['headline'], train_label_df['labels']
test_data, test_label = test_data_df['headline'], test_label_df['labels']

## DATA PREPARATION

In [None]:
# Clean every headline of both splits with preprocessing(); str() guards
# against non-string cells before the text is lower-cased.
train_data, test_data = (
    headlines.apply(lambda headline: preprocessing(str(headline)))
    for headlines in (train_data, test_data)
)

In [None]:
# Shape constants shared by the tokenizer, the padding step and the model.
max_vocab = 116900     # number of distinct words kept by the data tokenizer
max_token = 20         # padded / truncated sequence length
embedding_dim = 300    # size of each word embedding vector

# The category codes that appear in the label files.
labels = ['t', 'e', 'b', 'm']

# Keras' Tokenizer only keeps word indices strictly below num_words,
# hence the +1 so that max_vocab words survive.
data_tokenizer = Tokenizer(num_words=max_vocab + 1)
data_tokenizer.fit_on_texts(train_data)

# A second tokenizer maps each category code to an integer id.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

In [None]:
# Headlines -> lists of word indices (vocabulary learned on the training set).
train_data_sequences, test_data_sequences = (
    data_tokenizer.texts_to_sequences(texts)
    for texts in (train_data, test_data)
)
# Category codes -> lists of label indices.
train_label_sequences, test_label_sequences = (
    label_tokenizer.texts_to_sequences(codes)
    for codes in (train_label, test_label)
)

In [None]:
# Bring every headline to exactly max_token indices; both padding and
# truncation happen at the front of the sequence ('pre').
train_data_pad, test_data_pad = (
    pad_sequences(sequences, maxlen=max_token, padding='pre', truncating='pre')
    for sequences in (train_data_sequences, test_data_sequences)
)
# Label index lists become NumPy arrays for Keras.
train_label_pad, test_label_pad = (
    np.array(sequences)
    for sequences in (train_label_sequences, test_label_sequences)
)

In [None]:
def build_model(hp):
    """Build and compile the bidirectional-LSTM classifier for one tuner trial.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Hyperparameter container supplied by the tuner. Three values are
        sampled from it: ``num_of_neurons`` (LSTM units, 100-500 in steps
        of 50), ``activation`` (output activation, 'softmax' or 'sigmoid')
        and ``learning_rate`` (Adam learning rate, 0.8 or 1e-3).

    Returns
    -------
    keras.models.Sequential
        The compiled model (sparse categorical cross-entropy loss,
        accuracy metric).
    """
    model = Sequential()
    # The data tokenizer is created with num_words=max_vocab+1, so it can
    # emit indices 0..max_vocab. Embedding's input_dim must be the largest
    # index + 1; using max_vocab alone leaves index max_vocab out of range.
    model.add(Embedding(max_vocab + 1, embedding_dim))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(units=hp.Int('num_of_neurons', min_value=100, max_value=500, step=50))))
    model.add(Dropout(0.5))
    # 5 output units: presumably one per label index 0-4, with the four
    # categories numbered from 1 by the label tokenizer - TODO confirm.
    model.add(Dense(5, activation=hp.Choice('activation', values=['softmax', 'sigmoid'])))
    # NOTE(review): 0.8 is an unusually large learning rate for Adam;
    # consider replacing it with smaller candidates.
    model.compile(optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[0.8, 1e-3])),
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

## SETTING MODELS AND PARAMETERS FOR TUNING
- BayesianOptimization: runs a Bayesian optimization search over the hyperparameter space defined in `build_model`. Unlike grid search, it does not evaluate every combination; it uses the results of earlier trials to choose the next hyperparameters to try, which saves time while still finding good configurations.

In [None]:
num_epochs = 1
tuner = BayesianOptimization(build_model,
    objective='accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='tuner',
    project_name='FT_models_DL_2')


In [None]:
tuner.search(train_data_pad, train_label_pad, epochs=num_epochs)
tuner.results_summary()

Trial 5 Complete [00h 19m 23s]
accuracy: 0.790035088857015

Best accuracy So Far: 0.7947789629300436
Total elapsed time: 02h 26m 30s
Results summary
Results in tuner/FT_models_DL_2
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x7f0eae51a990>
Trial summary
Hyperparameters:
num_of_neurons: 450
activation: sigmoid
learning_rate: 0.001
Score: 0.7947789629300436
Trial summary
Hyperparameters:
num_of_neurons: 500
activation: softmax
learning_rate: 0.001
Score: 0.7909614046414694
Trial summary
Hyperparameters:
num_of_neurons: 150
activation: sigmoid
learning_rate: 0.001
Score: 0.790035088857015
Trial summary
Hyperparameters:
num_of_neurons: 500
activation: sigmoid
learning_rate: 0.8
Score: 0.2620631655057271
Trial summary
Hyperparameters:
num_of_neurons: 200
activation: softmax
learning_rate: 0.8
Score: 0.25414738059043884
