# Imports

In [1]:
from IPython.display import clear_output 
import pandas as pd
import warnings
import numpy as np

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, cohen_kappa_score, confusion_matrix, roc_auc_score, roc_curve, auc

!pip install scikit-optimize

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer
import keras
from keras.wrappers.scikit_learn import KerasClassifier
from skopt import BayesSearchCV
#Imblearn
from imblearn.metrics import geometric_mean_score, classification_report_imbalanced
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, TomekLinks
from imblearn.combine import SMOTEENN, SMOTETomek

Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/8b/03/be33e89f55866065a02e515c5b319304a801a9f1027a9b311a9b1d1f8dc7/scikit_optimize-0.8.1-py2.py3-none-any.whl (101kB)
[K     |███▎                            | 10kB 13.6MB/s eta 0:00:01[K     |██████▌                         | 20kB 10.5MB/s eta 0:00:01[K     |█████████▊                      | 30kB 7.2MB/s eta 0:00:01[K     |█████████████                   | 40kB 8.4MB/s eta 0:00:01[K     |████████████████▏               | 51kB 5.4MB/s eta 0:00:01[K     |███████████████████▍            | 61kB 5.8MB/s eta 0:00:01[K     |██████████████████████▊         | 71kB 5.9MB/s eta 0:00:01[K     |██████████████████████████      | 81kB 6.2MB/s eta 0:00:01[K     |█████████████████████████████▏  | 92kB 6.0MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 3.3MB/s 
[?25hCollecting pyaml>=16.9
  Downloading https://files.pythonhosted.org/packages/15/c4/1310a054d33abc318426a956e7d6df0df



# Auxiliaries

In [5]:
class Resampling:
    """Build and apply an imbalanced-learn sampler selected by a short name.

    Supported names (case-sensitive): "ENN", "AllKnn", "RENN", "TomekLinks",
    "SMOTE", "BorderlineSMOTE", "ADASYN", "SMOTEENN" and "SMOTETomek".
    """

    def __init__(self, name):
        """Instantiate the sampler registered under ``name``.

        Args:
            name: one of the supported strategy names listed on the class.

        Raises:
            ValueError: if ``name`` is not a supported strategy. Failing here
                avoids carrying a ``None`` sampler around that would only
                blow up later inside ``fit_resample``.
        """
        self.name = name
        self.strategie = self._build_strategy(name)

    @staticmethod
    def _build_strategy(name):
        """Return a freshly configured sampler for ``name`` or raise ValueError."""
        # Each entry is a zero-argument factory so that only the requested
        # sampler is actually constructed.
        factories = {
            "ENN": lambda: EditedNearestNeighbours(sampling_strategy='auto',
                                                   n_neighbors=3,
                                                   kind_sel='all',
                                                   n_jobs=None),
            "AllKnn": lambda: AllKNN(sampling_strategy='auto',
                                     n_neighbors=3,
                                     kind_sel='all',
                                     allow_minority=False,
                                     n_jobs=None),
            "RENN": lambda: RepeatedEditedNearestNeighbours(sampling_strategy='auto',
                                                            n_neighbors=3,
                                                            max_iter=100,
                                                            kind_sel='all',
                                                            n_jobs=None),
            # 'auto' here: resample all classes but the minority class.
            "TomekLinks": lambda: TomekLinks(sampling_strategy='auto',
                                             n_jobs=None),
            # 'auto' here is equivalent to 'not majority': resample all classes
            # but the majority class. k_neighbors is the number of nearest
            # neighbours used to construct synthetic samples.
            "SMOTE": lambda: SMOTE(sampling_strategy='auto',
                                   k_neighbors=5,
                                   n_jobs=None,
                                   random_state=42),
            "BorderlineSMOTE": lambda: BorderlineSMOTE(random_state=42),
            "ADASYN": lambda: ADASYN(sampling_strategy='auto',
                                     n_neighbors=5,
                                     n_jobs=None,
                                     random_state=42),
            # NOTE(review): seed is 24 here while every other seeded sampler
            # uses 42 -- kept as-is to preserve results; confirm it is intended.
            "SMOTEENN": lambda: SMOTEENN(sampling_strategy='auto',
                                         smote=None,
                                         enn=None,
                                         random_state=24),
            "SMOTETomek": lambda: SMOTETomek(sampling_strategy='auto',
                                             smote=None,
                                             tomek=None,
                                             random_state=42),
        }
        try:
            factory = factories[name]
        except (KeyError, TypeError):
            raise ValueError(
                "Unknown resampling strategy {!r}; expected one of {}".format(
                    name, sorted(factories))) from None
        return factory()

    def fit_resample(self, x, y):
        """Resample ``(x, y)`` with the configured sampler.

        Returns:
            The ``(x_resampled, y_resampled)`` pair produced by the sampler's
            own ``fit_resample``.
        """
        x_res, y_res = self.strategie.fit_resample(x, y)
        return x_res, y_res

In [3]:
def prepare_data(data, test_size, random_state, resampling=None):
    """Split, vectorise and (optionally) resample the requirements data.

    The pipeline is the same with or without resampling; the only difference
    is a final ``fit_resample`` call on the training portion.

    Args:
        data: table-like object indexable by the columns 'RequirementText'
            (inputs) and 'Class' (labels).
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.
        resampling: name understood by ``Resampling``; ``None`` skips
            resampling altogether.

    Returns:
        Tuple ``(vocab_size, x_train, y_train, x_test, y_test)`` where the x
        values are TF-IDF matrices produced by the tokenizer, the y values are
        the binarised labels, and ``vocab_size`` is
        ``len(tokenizer.word_index) + 1``.
    """
    # NOTE: this installs a process-wide filter, so warnings stay silenced
    # for every cell executed after the first call.
    warnings.filterwarnings('ignore')

    sampler = None if resampling is None else Resampling(resampling)

    # Stratified split on the class column.
    texts_train, texts_test, labels_train, labels_test = train_test_split(
        data['RequirementText'], data['Class'],
        test_size=test_size,
        stratify=data['Class'],
        random_state=random_state)

    # The vocabulary is learned from the training texts only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts_train)
    x_train = tokenizer.texts_to_matrix(texts_train, mode='tfidf')
    x_test = tokenizer.texts_to_matrix(texts_test, mode='tfidf')
    vocab_size = len(tokenizer.word_index) + 1

    # Label encoding is likewise fitted on the training labels only.
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(labels_train)
    y_train = label_binarizer.transform(labels_train)
    y_test = label_binarizer.transform(labels_test)

    # Only the training portion is ever resampled; the test split is untouched.
    if sampler is not None:
        x_train, y_train = sampler.fit_resample(x_train, y_train)

    return vocab_size, x_train, y_train, x_test, y_test

In [4]:
class DataSet:
    """Hold the original train/test split plus one variant per resampling strategy.

    Every variant is stored as a dict with the keys 'vocab_size', 'x_train',
    'y_train', 'x_test' and 'y_test' on one of the attributes ``origin``,
    ``tomek``, ``smote``, ``borderline_smote``, ``smote_enn`` or
    ``smote_tomek``.
    """

    # Public dataset name -> attribute that stores it. 'dataBoderlineSmote' is
    # the historical (misspelled) key used by existing callers; the correctly
    # spelled 'dataBorderlineSmote' is accepted everywhere as an alias.
    _ATTRIBUTE_BY_NAME = {
        'origin': 'origin',
        'dataTomek': 'tomek',
        'dataSmote': 'smote',
        'dataBoderlineSmote': 'borderline_smote',
        'dataBorderlineSmote': 'borderline_smote',
        'dataSmoteEnn': 'smote_enn',
        'dataSmoteTomek': 'smote_tomek',
    }

    # (dataset name, resampling strategy handed to prepare_data), in build order.
    _BUILD_PLAN = (
        ('origin', None),
        ('dataTomek', 'TomekLinks'),
        ('dataSmote', 'SMOTE'),
        ('dataBoderlineSmote', 'BorderlineSMOTE'),
        ('dataSmoteEnn', 'SMOTEENN'),
        ('dataSmoteTomek', 'SMOTETomek'),
    )

    # Order in which get_data returns the stored values.
    _FIELDS = ('vocab_size', 'x_train', 'y_train', 'x_test', 'y_test')

    def __init__(self, data):
        """Create all dataset variants from ``data`` (see ``definir_datasets``)."""
        self.origin = {}
        self.tomek = {}
        self.smote = {}
        self.borderline_smote = {}
        self.smote_enn = {}
        self.smote_tomek = {}

        self.definir_datasets(data)

    def definir_datasets(self, data):
        """Run ``prepare_data`` once per variant and store each result.

        Every variant uses ``test_size=0.2`` and ``random_state=42``; only the
        resampling strategy differs.
        """
        for dataname, resampling in self._BUILD_PLAN:
            vocab_size, x_train, y_train, x_test, y_test = prepare_data(
                data=data, test_size=0.2, random_state=42, resampling=resampling)
            self.set_data(dataname, vocab_size, x_train, y_train, x_test, y_test)

    def _attribute_for(self, dataname):
        """Map a public dataset name to its storage attribute.

        Raises:
            ValueError: if ``dataname`` is not a known dataset name.
        """
        try:
            return self._ATTRIBUTE_BY_NAME[dataname]
        except (KeyError, TypeError):
            raise ValueError(
                "Unknown dataset name {!r}; expected one of {}".format(
                    dataname, sorted(self._ATTRIBUTE_BY_NAME))) from None

    def set_data(self, dataname, vocab_size, x_train, y_train, x_test, y_test):
        """Store one variant under ``dataname``.

        Raises:
            ValueError: if ``dataname`` is unknown (it used to be dropped
                silently, losing the data).
        """
        data = {'vocab_size': vocab_size, 'x_train': x_train, 'y_train': y_train, 'x_test': x_test, 'y_test': y_test}
        setattr(self, self._attribute_for(dataname), data)

    def get_data(self, dataname):
        """Return ``(vocab_size, x_train, y_train, x_test, y_test)`` for ``dataname``.

        Raises:
            ValueError: if ``dataname`` is unknown (it used to return ``None``,
                which made callers fail on tuple unpacking instead).
        """
        stored = getattr(self, self._attribute_for(dataname))
        return tuple(stored[field] for field in self._FIELDS)

    def info_data(self, dataname):
        """Plot the class distribution of the train and test labels of ``dataname``.

        Raises:
            ValueError: if ``dataname`` is unknown (it used to plot the
                placeholder values ``0, 0``).
        """
        stored = getattr(self, self._attribute_for(dataname))
        y_train, y_test = stored['y_train'], stored['y_test']

        # NOTE(review): `bt` is not imported or defined in the visible cells of
        # this notebook -- confirm where plot_requirements_by_class comes from.
        bt.plot_requirements_by_class(y_train, y_test)

# Hyperparameterization

## BayesSearch

In [7]:
def _build_dense_classifier(optimizer, input_dim, nn1, nn2, dropout, l1, l2, act, n_classes):
    """Assemble and compile the dense softmax classifier shared by all create_net_* builders.

    Architecture: Dense(nn1) -> Dropout, optionally Dense(nn2) -> Dropout when
    ``nn2 != 0``, then a Dense(n_classes) softmax output. The hidden layers
    share one L1/L2 kernel regulariser and one dropout rate. The model is
    compiled with categorical cross-entropy and the accuracy metric.
    """
    reg = keras.regularizers.l1_l2(l1=l1, l2=l2)

    model = Sequential()
    model.add(Dense(nn1, input_dim=input_dim, activation=act, kernel_regularizer=reg))
    model.add(Dropout(dropout))

    # nn2 == 0 is the search-space encoding for "no second hidden layer".
    if nn2 != 0:
        model.add(Dense(nn2, activation=act, kernel_regularizer=reg))
        model.add(Dropout(dropout))

    model.add(Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model


def create_net_adamax(input_dim,
                      nn1,
                      nn2,
                      dropout,
                      l1,
                      l2,
                      act,
                      learn_rate,
                      n_classes=11):
    """Build the dense classifier compiled with the Adamax optimizer.

    Args:
        input_dim: number of input features.
        nn1: units in the first hidden layer.
        nn2: units in the second hidden layer; 0 disables that layer.
        dropout: dropout rate applied after each hidden layer.
        l1, l2: L1 / L2 kernel-regularisation factors for the hidden layers.
        act: activation name for the hidden layers.
        learn_rate: optimizer learning rate.
        n_classes: width of the softmax output layer (default 11, the value
            that used to be hard-coded).

    Returns:
        The compiled Keras model.
    """
    optimizer = keras.optimizers.Adamax(learning_rate=learn_rate)
    return _build_dense_classifier(optimizer, input_dim, nn1, nn2, dropout, l1, l2, act, n_classes)


def create_net_adam(input_dim,
                    nn1,
                    nn2,
                    dropout,
                    l1,
                    l2,
                    act,
                    learn_rate,
                    n_classes=11):
    """Build the dense classifier compiled with the Adam optimizer.

    Parameters and return value are the same as for ``create_net_adamax``:
    layer sizes ``nn1``/``nn2`` (``nn2 == 0`` skips the second hidden layer),
    ``dropout`` rate, ``l1``/``l2`` regularisation factors, hidden activation
    ``act``, ``learn_rate`` and the output width ``n_classes`` (default 11).
    """
    optimizer = keras.optimizers.Adam(learning_rate=learn_rate)
    return _build_dense_classifier(optimizer, input_dim, nn1, nn2, dropout, l1, l2, act, n_classes)


def create_net_rmsprop(input_dim,
                       nn1,
                       nn2,
                       dropout,
                       l1,
                       l2,
                       act,
                       learn_rate,
                       n_classes=11):
    """Build the dense classifier compiled with the RMSprop optimizer.

    Parameters and return value are the same as for ``create_net_adamax``:
    layer sizes ``nn1``/``nn2`` (``nn2 == 0`` skips the second hidden layer),
    ``dropout`` rate, ``l1``/``l2`` regularisation factors, hidden activation
    ``act``, ``learn_rate`` and the output width ``n_classes`` (default 11).
    """
    optimizer = keras.optimizers.RMSprop(learning_rate=learn_rate)
    return _build_dense_classifier(optimizer, input_dim, nn1, nn2, dropout, l1, l2, act, n_classes)


def create_net_sgdm(input_dim,
                    nn1,
                    nn2,
                    dropout,
                    l1,
                    l2,
                    act,
                    learn_rate,
                    n_classes=11):
    """Build the dense classifier compiled with SGD using momentum 0.9.

    Parameters and return value are the same as for ``create_net_adamax``:
    layer sizes ``nn1``/``nn2`` (``nn2 == 0`` skips the second hidden layer),
    ``dropout`` rate, ``l1``/``l2`` regularisation factors, hidden activation
    ``act``, ``learn_rate`` and the output width ``n_classes`` (default 11).
    """
    optimizer = keras.optimizers.SGD(learning_rate=learn_rate, momentum=0.9)
    return _build_dense_classifier(optimizer, input_dim, nn1, nn2, dropout, l1, l2, act, n_classes)

In [16]:
def bayesSearchCV(dataset, optimizer, resampling, random_state=None):
    """Run a Bayesian hyper-parameter search for one optimizer / dataset variant.

    The training portion of the chosen dataset variant is searched with
    10-fold cross-validation, 50 iterations and the 'balanced_accuracy'
    scorer. The best score and best parameters are printed after the cell
    output has been cleared; nothing is returned.

    Args:
        dataset: object exposing ``get_data(name)`` that returns
            ``(vocab_size, x_train, y_train, x_test, y_test)``.
        optimizer: one of 'sgdm', 'adamax', 'adam' or 'rmsprop'.
        resampling: dataset name handed to ``dataset.get_data``.
        random_state: optional seed forwarded to ``BayesSearchCV``; the
            default ``None`` keeps the previous unseeded behaviour.

    Raises:
        ValueError: if ``optimizer`` is not supported (previously this
            surfaced later as an unbound ``model`` variable).
    """
    if optimizer == 'sgdm':
        build_fn = create_net_sgdm
    elif optimizer == 'adamax':
        build_fn = create_net_adamax
    elif optimizer == 'adam':
        build_fn = create_net_adam
    elif optimizer == 'rmsprop':
        build_fn = create_net_rmsprop
    else:
        raise ValueError(
            "Unknown optimizer {!r}; expected 'sgdm', 'adamax', 'adam' or 'rmsprop'".format(optimizer))

    model = KerasClassifier(build_fn=build_fn, verbose=0)

    # The test split is deliberately unused: the search only sees training data.
    vocab_size, x_train, y_train, _x_test, _y_test = dataset.get_data(resampling)

    # Every value list is a discrete set of candidates for the search.
    params = dict(
        input_dim=[vocab_size],  # single candidate: fixed by the data
        nn1=[20, 50, 100, 150, 200],
        nn2=[0, 20, 50, 100, 150, 200],  # 0 means "no second hidden layer"
        dropout=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
        l1=[0.0, 0.01, 0.001, 0.0001],
        l2=[0.0, 0.01, 0.001, 0.0001],
        act=['tanh', 'relu', 'sigmoid', 'elu'],
        learn_rate=[0.01, 0.001, 0.0001],
        batch_size=[10, 20, 30, 40, None],
        epochs=[30, 40, 50, 100],
    )

    # NOTE(review): x_train is cross-validated exactly as returned by
    # get_data. If that variant was resampled beforehand, the validation
    # folds contain resampled data too, so scores of different variants may
    # not be directly comparable -- confirm this is intended.
    search = BayesSearchCV(estimator=model, search_spaces=params, cv=10,
                           verbose=0, iid=False, n_iter=50,
                           scoring='balanced_accuracy',
                           random_state=random_state)
    # Labels are turned from indicator rows into class indices for the search.
    search.fit(x_train, np.argmax(y_train, axis=1))

    clear_output()
    print(search.best_score_)
    print(search.best_params_)

## Dataset

In [9]:
# Load the preprocessed requirements CSV (bare filename, so it is resolved
# against the notebook's working directory) and build every dataset variant
# up front: DataSet's constructor runs definir_datasets on the loaded frame.
dataset = DataSet(pd.read_csv('PROMISE_exp_preprocessed.csv'))

## Origin

In [18]:
bayesSearchCV(dataset, 'adamax', 'origin')

0.6180248917748917
OrderedDict([('act', 'relu'), ('batch_size', 30), ('dropout', 0.4), ('epochs', 50), ('input_dim', 1060), ('l1', 0.0), ('l2', 0.0001), ('learn_rate', 0.01), ('nn1', 150), ('nn2', 150)])


In [19]:
bayesSearchCV(dataset, 'sgdm', 'origin')

0.6098875661375661
OrderedDict([('act', 'elu'), ('batch_size', 30), ('dropout', 0.5), ('epochs', 40), ('input_dim', 1060), ('l1', 0.0), ('l2', 0.0001), ('learn_rate', 0.01), ('nn1', 100), ('nn2', 150)])


In [20]:
bayesSearchCV(dataset, 'rmsprop', 'origin')

0.6207296777296777
OrderedDict([('act', 'relu'), ('batch_size', 40), ('dropout', 0.4), ('epochs', 40), ('input_dim', 1060), ('l1', 0.0001), ('l2', 0.0001), ('learn_rate', 0.01), ('nn1', 50), ('nn2', 100)])


## Tomek

In [None]:
bayesSearchCV(dataset, 'adamax', 'dataTomek')

0.580396762253194
OrderedDict([('act', 'relu'), ('batch_size', 20), ('dropout', 0.3), ('epochs', 50), ('input_dim', 1060), ('l1', 0.0), ('l2', 0.0001), ('learn_rate', 0.01), ('nn1', 100), ('nn2', 150)])


In [None]:
bayesSearchCV(dataset, 'rmsprop', 'dataTomek')

0.5560799284595965
OrderedDict([('act', 'sigmoid'), ('batch_size', 10), ('dropout', 0.1), ('epochs', 30), ('input_dim', 1060), ('l1', 0.0), ('l2', 0.0001), ('learn_rate', 0.001), ('nn1', 100), ('nn2', 200)])


## Smote

In [None]:
bayesSearchCV(dataset, 'adamax', 'dataSmote')

0.9517358139031978
OrderedDict([('act', 'relu'), ('batch_size', 10), ('dropout', 0.1), ('epochs', 50), ('input_dim', 1060), ('l1', 0.0), ('l2', 0.0001), ('learn_rate', 0.001), ('nn1', 150), ('nn2', 50)])


In [None]:
bayesSearchCV(dataset, 'rmsprop', 'dataSmote')

0.9507271338435508
OrderedDict([('act', 'relu'), ('batch_size', 10), ('dropout', 0.2), ('epochs', 30), ('input_dim', 1060), ('l1', 0.0), ('l2', 0.0), ('learn_rate', 0.01), ('nn1', 100), ('nn2', 50)])


## BorderlineSmote

In [None]:
bayesSearchCV(dataset, 'adamax', 'dataBoderlineSmote')

0.9014634210857222
OrderedDict([('act', 'elu'), ('batch_size', 40), ('dropout', 0.2), ('epochs', 50), ('input_dim', 1060), ('l1', 0.0), ('l2', 0.0001), ('learn_rate', 0.01), ('nn1', 200), ('nn2', 100)])


In [None]:
bayesSearchCV(dataset, 'rmsprop', 'dataBoderlineSmote')

0.9030573464536558
OrderedDict([('act', 'tanh'), ('batch_size', 40), ('dropout', 0.3), ('epochs', 40), ('input_dim', 1060), ('l1', 0.0001), ('l2', 0.0), ('learn_rate', 0.001), ('nn1', 200), ('nn2', 50)])


## SmoteEnn

In [None]:
bayesSearchCV(dataset, 'adamax', 'dataSmoteEnn')

In [None]:
bayesSearchCV(dataset, 'rmsprop', 'dataSmoteEnn')

## SmoteTomek

In [None]:
bayesSearchCV(dataset, 'adamax', 'dataSmoteTomek')

0.9540489177746023
OrderedDict([('act', 'relu'), ('batch_size', 40), ('dropout', 0.0), ('epochs', 30), ('input_dim', 1060), ('l1', 0.0), ('l2', 0.0), ('learn_rate', 0.01), ('nn1', 200), ('nn2', 100)])


In [None]:
bayesSearchCV(dataset, 'rmsprop', 'dataSmoteTomek')

0.9465133208565298
OrderedDict([('act', 'relu'), ('batch_size', 30), ('dropout', 0.0), ('epochs', 40), ('input_dim', 1060), ('l1', 0.0001), ('l2', 0.0001), ('learn_rate', 0.001), ('nn1', 150), ('nn2', 150)])
