### Import packages

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Concatenate,Flatten, Activation, Add
from keras.layers import Convolution1D, BatchNormalization
from keras.layers import GlobalMaxPooling1D, MaxPooling1D
from keras.layers import Embedding, ThresholdedReLU
from keras.layers import AlphaDropout, Dropout, SpatialDropout1D
from keras.preprocessing import sequence
from keras.optimizers import Adam
import os
import numpy as np
import matplotlib.pyplot as plt
import keras_metrics

### Classes and functions

In [None]:
    """
    Class to handle loading and processing of raw datasets.
    """
    def __init__(self,
                 alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
                 input_size=None, num_of_classes=2):
        """
        Initialization of a Data object.

        Args:
            alphabet (str): Alphabet of characters to index
            input_size (int): Size of input features (number of character
                positions kept per sample). Required.
            num_of_classes (int): Number of classes in data
        Raises:
            TypeError: If input_size is not supplied.
        """
        # input_size comes after a defaulted parameter, so it must carry a
        # default itself to be valid Python. None only marks "not supplied"
        # and is rejected here, which keeps the argument effectively required.
        if input_size is None:
            raise TypeError("Data() missing required argument: 'input_size'")
        self.alphabet = alphabet
        self.alphabet_size = len(self.alphabet)
        # Maps each character to a 1-based integer; index 0 is never assigned.
        # A character repeated in the alphabet keeps the index of its last
        # occurrence.
        self.dict = {char: idx + 1 for idx, char in enumerate(self.alphabet)}
        self.no_of_classes = num_of_classes
        self.length = input_size
       

    def load_data(self, samples_number, randoms, subfolder='train'):
        """
        Load raw reviews from disk and store a sampled subset in self.data.

        Every '.txt' file under aclImdb/<subfolder>/neg and
        aclImdb/<subfolder>/pos is read, 'neg' before 'pos' and file names in
        sorted order. Files from 'neg' get label 0, files from 'pos' label 1.

        Args:
            samples_number (int): Number of samples to keep
            randoms (sequence of int): Indexes into the list of loaded files;
                entries 0 .. samples_number - 1 are used
            subfolder (str): Directory inside 'aclImdb' to read from
        Returns: None
        Raises:
            IndexError: If randoms has fewer than samples_number entries or
                contains an index outside the loaded files.
        """
        ditep_dir = os.path.join('aclImdb', subfolder)
        labels = []
        texts = []
        for label_type, label in (('neg', 0), ('pos', 1)):
            dir_name = os.path.join(ditep_dir, label_type)
            for fname in sorted(os.listdir(dir_name)):
                if not fname.endswith('.txt'):
                    continue
                # The context manager closes the handle even if read() fails.
                with open(os.path.join(dir_name, fname),
                          encoding='utf-8', errors='ignore') as f:
                    texts.append(f.read())
                labels.append(label)

        # format: (label, text)
        data = [(labels[randoms[i]], texts[randoms[i]])
                for i in range(samples_number)]
        # NumPy coerces the mixed (int, str) pairs to a string array, so the
        # labels are stored as text; get_all_data converts them back via int().
        self.data = np.array(data)
        

    def get_all_data(self):
        """
        Convert every loaded sample into arrays ready for the model.

        Returns:
            (np.ndarray, np.ndarray): An int64 array holding one row of
            character indexes per sample (as produced by str_to_indexes), and
            an array with the integer class label of each sample.
        """
        samples = self.data[:]
        # Each sample is a (label, text) pair.
        encoded_texts = [self.str_to_indexes(text) for _, text in samples]
        labels = [int(label) for label, _ in samples]
        features = np.asarray(encoded_texts, dtype='int64')
        return features, np.asarray(labels)

    def str_to_indexes(self, s):
        """
        Encode the tail of a string as a fixed-length vector of character indexes.

        The string is lower-cased and read backwards from its last character:
        position 0 of the result holds the index of the final character,
        position 1 the one before it, and so on, for at most self.length
        characters. Characters absent from self.dict, and positions beyond the
        end of a short string, stay 0.

        Args:
            s (str): String to be converted to indexes
        Returns:
            (np.ndarray): int64 vector of length self.length
        """
        lowered = s.lower()
        encoded = np.zeros(self.length, dtype='int64')
        # Reverse first, then truncate: this keeps the last self.length
        # characters, ordered from the end of the string towards its start.
        for position, char in enumerate(lowered[::-1][:self.length]):
            encoded[position] = self.dict.get(char, 0)
        return encoded


class CharCNNKim(object):
    """
    Class to implement the Character Level Convolutional Neural Network
    as described in Kim et al., 2015 (https://arxiv.org/abs/1508.06615)
    Their model has been adapted to perform text classification instead of language modelling
    by replacing subsequent recurrent layers with dense layer(s); the output layer
    built here is a Dense layer with a sigmoid activation.
    """
    def __init__(self, input_size, alphabet_size, embedding_size,
                 conv_layers, fully_connected_layers,
                 num_of_classes, dropout_p,
                 optimizer='adam', loss='categorical_crossentropy'):
        """
        Initialization for the Character Level CNN model.
        Args:
            input_size (int): Size of input features
            alphabet_size (int): Size of alphabets to create embeddings for
            embedding_size (int): Size of embeddings
            conv_layers (list[list[int]]): One [filters, kernel_size, extra] triple per
                convolution branch; the third value is unpacked but not used
            fully_connected_layers (list[int]): Number of units of each Fully Connected layer
            num_of_classes (int): Number of units of the output layer
            dropout_p (float): Dropout Probability
            optimizer (str or keras optimizer): Training optimizer
            loss (str): Loss function
        """
        self.input_size = input_size
        self.alphabet_size = alphabet_size
        self.embedding_size = embedding_size
        self.conv_layers = conv_layers
        self.fully_connected_layers = fully_connected_layers
        self.num_of_classes = num_of_classes
        self.dropout_p = dropout_p
        self.optimizer = optimizer
        self.loss = loss
        self._build_model()  # builds self.model variable

    def _build_model(self):
        """
        Build and compile the Character Level CNN model, store it in
        self.model and print its summary.
        Returns: None
        """
        # Input layer
        inputs = Input(shape=(self.input_size,), name='sent_input', dtype='int64')
        # Embedding layer; one extra row because character indexes start at 1
        # and 0 is used for padding / unknown characters.
        x = Embedding(self.alphabet_size + 1, self.embedding_size, input_length=self.input_size)(inputs)
        # Convolution layers: parallel branches over the same embedding, each
        # reduced by max-pooling over time. Layer names are derived from
        # (filters, kernel_size), so every branch needs a distinct pair.
        convolution_output = []
        for num_filters, filter_width, _ in self.conv_layers:
            conv = Convolution1D(filters=num_filters,
                                 kernel_size=filter_width,
                                 activation='relu',
                                 name='Conv1D_{}_{}'.format(num_filters, filter_width))(x)
            pool = GlobalMaxPooling1D(name='MaxPoolingOverTime_{}_{}'.format(num_filters, filter_width))(conv)
            convolution_output.append(pool)
        x = Concatenate()(convolution_output)
        # Fully connected layers
        for fl in self.fully_connected_layers:
            x = Dense(fl, activation='selu', kernel_initializer='lecun_normal')(x)
            x = AlphaDropout(self.dropout_p)(x)
        # Output layer
        predictions = Dense(self.num_of_classes, activation='sigmoid')(x)
        # Build and compile model. The recall metric comes from the
        # keras_metrics package imported at the top of the file; the bare name
        # `recall` was never defined.
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=self.optimizer, loss=self.loss, metrics=[keras_metrics.recall()])
        self.model = model
        print("CharCNNKim model built: ")
        self.model.summary()

    def train(self, training_inputs, training_labels,
              validation_inputs, validation_labels,
              epochs, batch_size, checkpoint_every=100):
        """
        Training function
        Args:
            training_inputs (numpy.ndarray): Training set inputs
            training_labels (numpy.ndarray): Training set labels
            validation_inputs (numpy.ndarray): Validation set inputs
            validation_labels (numpy.ndarray): Validation set labels
            epochs (int): Number of training epochs
            batch_size (int): Batch size
            checkpoint_every (int): Currently unused; kept for interface compatibility
        Returns:
            The History object returned by model.fit
        """
        # Start training
        print("Training CharCNNKim model: ")
        history = self.model.fit(training_inputs, training_labels,
                                 validation_data=(validation_inputs, validation_labels),
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 verbose=2)
        return history

    def test(self, testing_inputs, testing_labels, batch_size):
        """
        Testing function
        Args:
            testing_inputs (numpy.ndarray): Testing set inputs
            testing_labels (numpy.ndarray): Testing set labels
            batch_size (int): Batch size
        Returns:
            The result of model.evaluate (loss followed by the compiled metrics)
        """
        # Evaluate inputs
        return self.model.evaluate(testing_inputs, testing_labels, batch_size=batch_size, verbose=1)
        
   

### Parameters setup

In [None]:
# --- Character vocabulary and input encoding ---
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
alphabet_size = 69      # number of characters in `alphabet`
input_size = 2600       # character positions fed to the network per sample
embedding_size = 300    # width of each character embedding

# --- Dataset sizes ---
# total_samples presumably is the number of reviews per split; it matches the
# 25000 used when sample indexes are drawn -- TODO confirm.
total_samples = 25000
training_samples = 20000
val = 5000

# --- Network architecture ---
# One [filters, kernel_size, extra] triple per convolution branch. The
# (filters, kernel_size) pairs must differ because CharCNNKim derives layer
# names from them; the third value is unpacked by the model but not used.
conv_layers = [[32, 7, 3], [32, 5, -1]]
fully_connected_layers = [10]   # units of each dense layer
num_of_classes = 1              # units of the sigmoid output layer
dropout_p = 0.2

# --- Training ---
optimizer = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)
loss = 'binary_crossentropy'
epochs = 15
batch_size = 16
threshold = 1e-6    # not referenced by the visible code

### Data and models

In [None]:
# Draw distinct sample indexes once and split them, so that no review can end
# up in both the training and the validation set (independent draws with
# replacement would leak training samples into validation).
if training_samples + val > total_samples:
    raise ValueError('training_samples + val must not exceed total_samples')
randoms = np.random.permutation(total_samples)[:training_samples + val].tolist()

# Load training data
training_data = Data(input_size=input_size)
training_data.load_data(training_samples, randoms[:training_samples])
x_train, y_train = training_data.get_all_data()
# Load validation data
validation_data = Data(input_size=input_size)
validation_data.load_data(val, randoms[training_samples:training_samples + val])
x_val, y_val = validation_data.get_all_data()

# Build the model and train it; `history` is used for the plots below.
charcnnkim = CharCNNKim(input_size, alphabet_size, embedding_size, conv_layers, fully_connected_layers,
                        num_of_classes, dropout_p, optimizer, loss)

history = charcnnkim.train(x_train, y_train, x_val, y_val, epochs, batch_size)

### Visualization

In [None]:
# Per-epoch curves recorded by model.fit.
rec = history.history['recall']
val_rec = history.history['val_recall']
# Separate names are used for the curves so that the `loss` and `epochs`
# hyper-parameters are not overwritten (re-running the training cell would
# otherwise receive a list / a range).
train_loss = history.history['loss']
val_loss = history.history['val_loss']
epoch_axis = range(1, len(rec) + 1)

# Recall curves
plt.figure()
plt.plot(epoch_axis, rec, 'bo', label='Training rec')
plt.plot(epoch_axis, val_rec, 'b', label='Validation rec')
plt.title('Training and validation rec')
plt.legend()  # must be drawn before saving, otherwise the PNG has no legend
plt.savefig('DITEP_charConv_rec.png')

# Loss curves
plt.figure()
plt.plot(epoch_axis, train_loss, 'bo', label='Training loss')
plt.plot(epoch_axis, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.savefig('DITEP_charConv_loss.png')

# Sample `val` reviews (with replacement) from the 'test' subfolder.
test_data = Data(input_size=input_size)
# randint yields plain integer indexes directly; calling int() on 1-element
# arrays is deprecated in recent NumPy releases.
randoms = np.random.randint(0, total_samples, size=val).tolist()
test_data.load_data(val, randoms, 'test')
x_test, y_test = test_data.get_all_data()


# Evaluating the model on the test set
scores = charcnnkim.test(x_test, y_test, batch_size)
print('Test loss:', scores[0])
# The model is compiled with a recall metric, so the second value is recall.
print('Test recall:', scores[1])

# Write the test scores followed by the per-epoch validation recall,
# one value per line.
scores.extend(['val_recall:'])
scores.extend(val_rec)
with open('out_charconv.txt', 'w') as f:
    f.write(''.join(str(item) + '\n' for item in scores))