<a href="https://colab.research.google.com/github/kristalys47/THS/blob/master/THS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

np.random.seed(7)

from keras.models import Model
from keras.layers import Dense, Input, Dropout, Conv2D, Flatten, Reshape, AveragePooling2D, \
    Concatenate, ZeroPadding2D, Multiply

from keras.layers.embeddings import Embedding


class  TweetSentiment2DCNN2Channel:
    def __init__(self, max_sentence_len, embedding_builder):
        self.max_sentence_len = max_sentence_len
        self.embedding_builder = embedding_builder
        self.model = None

    def build(self, first_dropout=0.0, padding='same', filters=4, kernel_size=(1,1), strides=(1,1), activation='relu',
              dense_units=64, second_dropout=0.0):

        # Input Layer 1 - tweet in right order
        sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_1")
        reverse_sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_2")
        # Embedding layer
        embeddings_layer = self.pretrained_embedding_layer()
        embeddings1 = embeddings_layer(sentence_input)
        embeddings2 = embeddings_layer(reverse_sentence_input)

        # Reshape
        embeddings1= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings1)
        embeddings2= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings2)

        #stack both input to make it a 2 chanell input
        concat_embeddings = Concatenate(axis = -1)([embeddings1, embeddings2])
        print("concat_embeddings: ", concat_embeddings)
        # Reshape with channels
        #X = Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings)

        # First convolutional layer
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 3
        kernel_size = (kernel_height, kernel_width)
        X = Conv2D(filters=20, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV2D_1")(concat_embeddings)
        #X  = Conv2D(filters = 66, kernel_size = (kernel_height+2, 1),  strides=(1, 1), padding='same', activation=activation,
        #           name="CONV2D_2")(X)
        #MAX pooling
        pool_height =  self.max_sentence_len - kernel_height + 1  # assumes zero padding and stride of 1
        pool_size = (pool_height, 1)
        X = AveragePooling2D(pool_size=pool_size, name = "MAXPOOL_1")(X)

        #Flatten
        X = Flatten()(X)

        # Attention
        #att_dense = 70*20*1
        #attention_probs = Dense(att_dense, activation='softmax', name='attention_probs')(X)
        #attention_mul = Multiply(name='attention_multiply')([X, attention_probs])


        # # First dense layer
        dense_units = 128
        X = Dense(units=int(dense_units/2), activation='relu', name="DENSE_1")(X)
        X = Dropout(second_dropout, name="DROPOUT_1")(X)

        # # Second dense layer
        X = Dense(units=dense_units, activation='relu', name="DENSE_2")(X)
        X = Dropout(second_dropout, name="DROPOUT_2")(X)
        #
        # # Third layer
        X = Dense(units=int(dense_units/2), activation='relu', name="DENSE_3")(X)
        X = Dropout(second_dropout, name="DROPOUT_3")(X)

        # Final layer
        X = Dense(1, activation= "sigmoid", name="FINAL_SIGMOID")(X)
        # create the model
        self.model = Model(input=[sentence_input, reverse_sentence_input] , output=X)

    def pretrained_embedding_layer(self):
        # create Keras embedding layer
        word_to_idx, idx_to_word, word_embeddings = self.embedding_builder.read_embedding()
        #vocabulary_len = len(word_to_idx) + 1
        vocabulary_len = len(word_to_idx)
        emb_dimension = self.embedding_builder.get_dimensions()
        # get the matrix for the sentences
        embedding_matrix = word_embeddings
        #embedding_matrix = np.vstack([word_embeddings, np.zeros((vocabulary_len,))])

        # embedding layer
        embedding_layer = Embedding(input_dim=vocabulary_len, output_dim=emb_dimension, trainable=False, name="EMBEDDING")
        embedding_layer.build((None,))
        embedding_layer.set_weights([embedding_matrix])
        return embedding_layer

    def summary(self):
        self.model.summary()

    def compile(self, loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']):
        self.model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    def fit(self, X, Y, epochs = 50, batch_size = 32, shuffle=True, callbacks=None, validation_split=0.0, class_weight=None):
        return self.model.fit(X, Y, epochs=epochs, batch_size=batch_size, shuffle=shuffle, callbacks=callbacks,
                       validation_split=validation_split, class_weight=class_weight, verbose=2)

    def evaluate(self, X_test, Y_test):
        return self.model.evaluate(X_test, Y_test)

    def predict(self, X):
        return self.model.predict(X)

    def get_sentiment(self, prediction):
        return np.argmax(prediction)

    #def sentiment_string(self, sentiment):
    #    return self.sentiment_map[sentiment]

    def save_model(self, json_filename, h5_filename):
        json_model = self.model.to_json()
        with open(json_filename, "w+") as json_file:
            json_file.write(json_model)
        self.model.save_weights(h5_filename)
        return

class TweetSentiment2DCNN1x12Channel(TweetSentiment2DCNN2Channel):
    def __init__(self, max_sentence_len, embedding_builder):
            super().__init__(max_sentence_len, embedding_builder)

    def build(self, first_dropout=0.0, padding='same', filters=4, kernel_size=(1, 1), strides=(1, 1),
              activation='relu', dense_units=64, second_dropout=0.0):

            # Input Layer 1 - tweet in right order
        sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_1")
        reverse_sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_2")
        # Embedding layer
        embeddings_layer = self.pretrained_embedding_layer()
        embeddings1 = embeddings_layer(sentence_input)
        embeddings2 = embeddings_layer(reverse_sentence_input)

        # Reshape
        embeddings1= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings1)
        embeddings2= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings2)

        #stack both input to make it a 2 chanell input
        concat_embeddings = Concatenate(axis = -1)([embeddings1, embeddings2])
        print("concat_embeddings: ", concat_embeddings)

        # one by one convolution
        onebyone = Conv2D(filters=32, kernel_size=(1,1), strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_1X1_1")(concat_embeddings)
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 3
        kernel_size = (kernel_height, kernel_width)
        X = Conv2D(filters=64, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV2D_1")(onebyone)
        # X  = Conv2D(filters = 66, kernel_size = (kernel_height+2, 1),  strides=(1, 1), padding='same', activation=activation,
        #           name="CONV2D_2")(X)
        # MAX pooling
        pool_height = self.max_sentence_len - kernel_height + 1  # assumes zero padding and stride of 1
        pool_size = (3, 1)
        #pool_size = (pool_height, 1)
        X = AveragePooling2D(pool_size=pool_size, name="MAXPOOL_1")(X)

        #X = Conv2D(filters=1, kernel_size=(1,1), strides=(1, 1), padding=padding, activation=activation,
        #           name="CONV2D_2")(X)

        # Flatten
        X = Flatten()(X)

        # # First dense layer
        dense_units = 128
        X = Dense(units=int(dense_units / 2), activation='relu', name="DENSE_1")(X)
        X = Dropout(second_dropout, name="DROPOUT_1")(X)

        # # Second dense layer
        X = Dense(units=dense_units, activation='relu', name="DENSE_2")(X)
        X = Dropout(second_dropout, name="DROPOUT_2")(X)
        #
        # # Third layer
        X = Dense(units=int(dense_units / 2), activation='relu', name="DENSE_3")(X)
        X = Dropout(second_dropout, name="DROPOUT_3")(X)

        # Final layer
        X = Dense(1, activation="sigmoid", name="FINAL_SIGMOID")(X)
        # create the model
        self.model = Model(input=[sentence_input, reverse_sentence_input], output=X)


class TweetSentiment2DCNN1x12Channelv2(TweetSentiment2DCNN2Channel):
    def __init__(self, max_sentence_len, embedding_builder):
            super().__init__(max_sentence_len, embedding_builder)

    def build(self, first_dropout=0.0, padding='same', filters=4, kernel_size=(1, 1), strides=(1, 1),
              activation='relu', dense_units=64, second_dropout=0.0):

            # Input Layer 1 - tweet in right order
        sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_1")
        reverse_sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_2")
        # Embedding layer
        embeddings_layer = self.pretrained_embedding_layer()
        embeddings1 = embeddings_layer(sentence_input)
        embeddings2 = embeddings_layer(reverse_sentence_input)

        # Reshape
        embeddings1= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings1)
        embeddings2= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings2)

        #stack both input to make it a 2 chanell input
        concat_embeddings = Concatenate(axis = -1)([embeddings1, embeddings2])
        print("concat_embeddings: ", concat_embeddings)

        # one by one convolution
        onebyone = Conv2D(filters=16, kernel_size=(1,1), strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_1X1_1")(concat_embeddings)
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 3
        kernel_size = (kernel_height, kernel_width)
        X = Conv2D(filters=32, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV2D_1")(onebyone)
        # X  = Conv2D(filters = 66, kernel_size = (kernel_height+2, 1),  strides=(1, 1), padding='same', activation=activation,
        #           name="CONV2D_2")(X)
        # MAX pooling
        pool_height = self.max_sentence_len - kernel_height + 1  # assumes zero padding and stride of 1
        pool_size = (pool_height, 1)
        X = AveragePooling2D(pool_size=pool_size, name="MAXPOOL_1")(X)

        # Flatten
        X = Flatten()(X)

        flatonebyone = Flatten()(onebyone)
        # concact one by one with MaxPoling
        X = Concatenate(axis=-1)([X, flatonebyone])
        # # First dense layer
        dense_units = 128
        X = Dense(units=int(dense_units / 2), activation='relu', name="DENSE_1")(X)
        X = Dropout(second_dropout, name="DROPOUT_1")(X)

        # # Second dense layer
        X = Dense(units=dense_units, activation='relu', name="DENSE_2")(X)
        X = Dropout(second_dropout, name="DROPOUT_2")(X)
        #
        # # Third layer
        X = Dense(units=int(dense_units / 2), activation='relu', name="DENSE_3")(X)
        X = Dropout(second_dropout, name="DROPOUT_3")(X)

        # Final layer
        X = Dense(1, activation="sigmoid", name="FINAL_SIGMOID")(X)
        # create the model
        self.model = Model(inputs=[sentence_input, reverse_sentence_input], outputs=X)

class TweetSentimentInception(TweetSentiment2DCNN2Channel):
    def __init__(self, max_sentence_len, embedding_builder):
            super().__init__(max_sentence_len, embedding_builder)

    def build(self, first_dropout=0.0, padding='same', filters=4, kernel_size=(1, 1), strides=(1, 1),
              activation='relu', dense_units=64, second_dropout=0.0):

        # Input Layer 1 - tweet in right order
        sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_1")
        reverse_sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_2")
        # Embedding layer
        embeddings_layer = self.pretrained_embedding_layer()
        embeddings1 = embeddings_layer(sentence_input)
        embeddings2 = embeddings_layer(reverse_sentence_input)

        # Reshape
        embeddings1= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings1)
        embeddings2= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings2)

        #stack both input to make it a 2 chanell input
        concat_embeddings = Concatenate(axis = -1)([embeddings1, embeddings2])
        print("concat_embeddings: ", concat_embeddings)

        #compute 1x1 convolution on input
        onebyone = Conv2D(filters=filters, kernel_size=(1,1), strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_1X1_1")(concat_embeddings)


        #compute 3xdimension convolution on one by one
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 3
        kernel_size = (kernel_height, kernel_width)
        threebydim1 = Conv2D(filters=filters, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_3xdim_1")(onebyone)

        #compute 3xdimension convolution on input
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 3
        kernel_size = (kernel_height, kernel_width)
        threebydim2 = Conv2D(filters=filters, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_3xdim_2")(concat_embeddings)

        #compute 5xdimension convolution on one by one
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 5
        kernel_size = (kernel_height, kernel_width)
        fivebydim1 = Conv2D(filters=filters, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_5xdim_1")(onebyone)
        fivebydim1 = ZeroPadding2D((1, 0))(fivebydim1)

        #compute 5xdimension convolution on input
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 5
        kernel_size = (kernel_height, kernel_width)
        fivebydim2 = Conv2D(filters=filters, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_5xdim_2")(concat_embeddings)
        fivebydim2 = ZeroPadding2D((1,0))(fivebydim2)

        concat_layer = Concatenate(axis = -1)([threebydim1, threebydim2,fivebydim1, fivebydim2])

        final_onebyone = Conv2D(filters=1, kernel_size=(1,1), strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_1X1_final")(concat_layer)

        #final_onebyone = MaxPooling2D((2,1))(final_onebyone)
        #final_onebyone = AveragePooling2D((2,1))(final_onebyone)

        # Flatten
        X = Flatten()(final_onebyone)
        #X = Dropout(0.10, name="DROPOUT_1")(X)

        # attention
        att_dense = 70
        attention_probs = Dense(att_dense, activation='softmax', name='attention_probs')(X)
        attention_mul = Multiply(name='attention_multiply')([X, attention_probs])

        X = Dense(units=int(dense_units / 1), activation='relu', name="DENSE_1")(attention_mul)
        X = Dense(units=int(dense_units / 2), activation='relu', name="DENSE_2")(X)
        X = Dense(units=int(dense_units / 4), activation='relu', name="DENSE_3")(X)

        # Final layer
        #X = Dense(1, activation="sigmoid", name="FINAL_SIGMOID")(X)
        X = Dense(3, activation="softmax", name="FINAL_SOFTMAX")(X)
        # create the model
        self.model = Model(input=[sentence_input, reverse_sentence_input], output=X)



class TweetSentimentInceptionOneChan(TweetSentiment2DCNN2Channel):
    def __init__(self, max_sentence_len, embedding_builder):
            super().__init__(max_sentence_len, embedding_builder)

    def build(self, first_dropout=0.0, padding='same', filters=4, kernel_size=(1, 1), strides=(1, 1),
              activation='relu', dense_units=64, second_dropout=0.0):

        # Input Layer 1 - tweet in right order
        sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_1")
        #reverse_sentence_input = Input(shape=(self.max_sentence_len,), name="INPUT_2")
        # Embedding layer
        embeddings_layer = self.pretrained_embedding_layer()
        embeddings1 = embeddings_layer(sentence_input)
        #embeddings2 = embeddings_layer(reverse_sentence_input)

        # Reshape
        embeddings1= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings1)
        #embeddings2= Reshape((self.max_sentence_len, self.embedding_builder.get_dimensions(), 1))(embeddings2)

        #stack both input to make it a 2 chanell input
        #concat_embeddings = Concatenate(axis = -1)([embeddings1, embeddings2])
        #print("concat_embeddings: ", concat_embeddings)

        #compute 1x1 convolution on input
        onebyone = Conv2D(filters=filters, kernel_size=(1,1), strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_1X1_1")(embeddings1)


        #compute 3xdimension convolution on one by one
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 3
        kernel_size = (kernel_height, kernel_width)
        threebydim1 = Conv2D(filters=filters, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_3xdim_1")(onebyone)

        #compute 3xdimension convolution on input
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 3
        kernel_size = (kernel_height, kernel_width)
        threebydim2 = Conv2D(filters=filters, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_3xdim_2")(embeddings1)

        #compute 5xdimension convolution on one by one
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 5
        kernel_size = (kernel_height, kernel_width)
        fivebydim1 = Conv2D(filters=filters, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_5xdim_1")(onebyone)
        fivebydim1 = ZeroPadding2D((1, 0))(fivebydim1)

        #compute 5xdimension convolution on input
        kernel_width = self.embedding_builder.get_dimensions()
        kernel_height = 5
        kernel_size = (kernel_height, kernel_width)
        fivebydim2 = Conv2D(filters=filters, kernel_size=kernel_size, strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_5xdim_2")(embeddings1)
        fivebydim2 = ZeroPadding2D((1,0))(fivebydim2)

        concat_layer = Concatenate(axis = -1)([threebydim1, threebydim2,fivebydim1, fivebydim2])
        #final_onebyone = AveragePooling2D((2,1))(concat_layer)
        final_onebyone = Conv2D(filters=filters*2, kernel_size=(1,1), strides=(1, 1), padding=padding, activation=activation,
                   name="CONV_1X1_final")(concat_layer)

        #final_onebyone = MaxPooling2D((2,1))(final_onebyone)
        #final_onebyone = AveragePooling2D((2,1))(final_onebyone)

        # Flatten
        X = Flatten()(final_onebyone)
        #X = Flatten()(concat_layer)

        #X = Dropout(0.10, name="DROPOUT_1")(X)

        # attention
        # att_dense = 70
        # attention_probs = Dense(att_dense, activation='softmax', name='attention_probs')(X)
        # attention_mul = Multiply(name='attention_multiply')([X, attention_probs])
        #
        # X = Dense(units=int(dense_units / 1), activation='relu', name="DENSE_1")(attention_mul)
        #X = Dense(units=int(dense_units / 2), activation='relu', name="DENSE_2")(X)
        #X = Dense(units=int(dense_units / 4), activation='relu', name="DENSE_3")(X)

        X = Dense(units=128, activation='relu', name="DENSE_2")(X)
        X = Dense(units=64, activation='relu', name="DENSE_3")(X)
        X = Dense(units=32, activation='relu', name="DENSE_4")(X)

        # Final layer
        #X = Dense(1, activation="sigmoid", name="FINAL_SIGMOID")(X)
        X = Dense(3, activation="softmax", name="FINAL_SOFTMAX")(X)
        # create the model
        self.model = Model(inputs=[sentence_input], outputs=X)


In [0]:

import sys

import numpy as np


class EmbeddingException(Exception):
    pass

class GloveEmbedding:
    def __init__(self, filename, dimensions =50):
        if not filename :
            raise Exception("Illegal file name.")
        if dimensions < 1:
            raise Exception("Illegal value for dimensions")

        self.filename = filename
        self.dimensions = dimensions

    def get_dimensions(self):
        return self.dimensions

    def parse_embedding(self, data_vector):

        result = []
        for n in data_vector:
            result.append(float(n))
        return result

    def read_embedding_bad(self):
        try:
            data_in = open(self.filename, "r")
        except Exception as e:
            msg  = sys.exc_info()[0]
            raise EmbeddingException(msg) from e
        else:
            i = 0
            word_to_idx = {}
            idx_to_word = {}
            word_to_vect = []
            with data_in:
                for line in enumerate(data_in):
                    #print(line)
                    parts = line[1].split()
                    word_part = parts[0]
                    vector_parts = parts[1:]
                    idx_to_word[i] = word_part
                    word_to_idx[word_part] = i
                    i = i+ 1
                    word_to_vect.append(self.parse_embedding(vector_parts))
            #add <unk> token
            #unk = np.random.rand(self.dimensions,)
            unk = np.ones((self.dimensions,))
            idx_to_word[i] = "<unk>"
            word_to_idx["<unk>"] = i
            word_to_vect.append(unk)
            np_word_to_vect = np.array(word_to_vect)
            return word_to_idx, idx_to_word, np_word_to_vect

    def read_embedding(self):
        try:
            data_in = open(self.filename, "r",  encoding='utf-8')
        except Exception as e:
            msg  = sys.exc_info()[0]
            raise EmbeddingException(msg) from e
        else:
            i = 1
            word_to_idx = {}
            word_to_idx['<EOF>'] = 0
            idx_to_word = {}
            idx_to_word[0] = None
            word_to_vect = []
            word_to_vect.append(np.zeros((self.dimensions,)))
            with data_in:
                for line in enumerate(data_in):
                    #print(line)
                    parts = line[1].split()
                    word_part = parts[0]
                    vector_parts = parts[1:]
                    idx_to_word[i] = word_part
                    word_to_idx[word_part] = i
                    i = i+ 1
                    word_to_vect.append(self.parse_embedding(vector_parts))
            #add <unk> token
            #unk = np.random.rand(self.dimensions,)
            unk = np.ones((self.dimensions,))
            idx_to_word[i] = "<unk>"
            word_to_idx["<unk>"] = i
            word_to_vect.append(unk)
            np_word_to_vect = np.array(word_to_vect)
            return word_to_idx, idx_to_word, np_word_to_vect




class Word2VecEmbedding:
    def __init__(self, filename, dimensions =50):
        if not filename :
            raise Exception("Illegal file name.")
        if dimensions < 1:
            raise Exception("Illegal value for dimensions")

        self.filename = filename
        self.dimensions = dimensions

    def get_dimensions(self):
        return self.dimensions

    def parse_embedding(self, data_vector):

        result = []
        for n in data_vector:
            result.append(float(n))
        return result

    def read_embedding_bad(self):
        try:
            data_in = open(self.filename, "r")
        except Exception as e:
            msg  = sys.exc_info()[0]
            raise EmbeddingException(msg) from e
        else:
            i = 0
            word_to_idx = {}
            idx_to_word = {}
            word_to_vect = []
            with data_in:
                for line in enumerate(data_in):
                    #print(line)
                    parts = line[1].split()
                    word_part = parts[0]
                    vector_parts = parts[1:]
                    idx_to_word[i] = word_part
                    word_to_idx[word_part] = i
                    i = i+ 1
                    word_to_vect.append(self.parse_embedding(vector_parts))
            #add <unk> token
            unk = np.random.rand(self.dimensions,)
            idx_to_word[i] = "<unk>"
            word_to_idx["<unk>"] = i
            word_to_vect.append(unk)
            np_word_to_vect = np.array(word_to_vect)
            return word_to_idx, idx_to_word, np_word_to_vect

    def read_embedding(self):
        try:
            data_in = open(self.filename, "r")
        except Exception as e:
            msg  = sys.exc_info()[0]
            raise EmbeddingException(msg) from e
        else:
            i = 1
            word_to_idx = {}
            word_to_idx['<EOF>'] = 0
            idx_to_word = {}
            idx_to_word[0] = None
            word_to_vect = []
            word_to_vect.append(np.zeros((self.dimensions,)))
            with data_in:
                for line in enumerate(data_in):
                    #print(line)
                    parts = line[1].split()
                    word_part = parts[0]
                    vector_parts = parts[1:]
                    idx_to_word[i] = word_part
                    word_to_idx[word_part] = i
                    i = i+ 1
                    word_to_vect.append(self.parse_embedding(vector_parts))
            #add <unk> token
            unk = np.random.rand(self.dimensions,)
            idx_to_word[i] = "<unk>"
            word_to_idx["<unk>"] = i
            word_to_vect.append(unk)
            np_word_to_vect = np.array(word_to_vect)
            return word_to_idx, idx_to_word, np_word_to_vect






In [0]:
EPSILON_VALUE = 0.000000001

class UnkWords:
    def __init__(self, word_to_idx):
        pass

class SentenceToIndices:

    def __init__(self, word_to_idx):
        self.word_to_idx = word_to_idx

    def map_sentence(self, sentence):
        result = []
        words = sentence.split()
        for w in words:
            if w in self.word_to_idx:
                result.append(self.word_to_idx[w])
            else:
                result.append(self.word_to_idx["<unk>"])
        return result

    def map_sentence_list(self, sentence_list):
        result = []
        max_len = 0
        counter_len = 0.0
        total_len = 0.0
        for s in sentence_list:
            mapped = self.map_sentence(s)
            mapped_len = len(mapped)
            counter_len = counter_len + 1
            total_len = total_len + mapped_len
            if mapped_len > max_len:
                max_len = mapped_len
                max_s = s
            result.append(mapped)
        print("max_len: ", max_len)
        print("avg_len: ", total_len/counter_len)
        return result, max_len

class PadSentences:
    def __init__(self, max_len):
        self.max_len = max_len

    def pad(self, sentence):
        padding_len = self.max_len - len(sentence)
        padding = []
        if (padding_len > 0 ):
            r = range(0, padding_len)
            for _  in r:
                padding.append(0)
        return sentence + padding

    def pad_list(self, sentence_list):
        result = []
        for s in sentence_list:
            result.append(self.pad(s))
        return result


class SentenceToEmbedding:
    def __init__(self, word_to_idx, idx_to_word, word_to_vect):
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_to_vect = word_to_vect

    def map_sentence(self, sentence, max_len = 0):
        S = SentenceToIndices(self.word_to_idx)
        matrix = None
        mapped_sentence = S.map_sentence(sentence)
        for i in mapped_sentence:
            e = self.word_to_vect[i]
            if matrix is None:
                matrix = np.array(e)
            else:
                matrix = np.vstack([matrix, e])
        if max_len > 0:
            padding_len = max_len - len(mapped_sentence)
            #print("max_len: ", max_len)
            #print("len(mapped_sentence): ", len(mapped_sentence))
            #print("padding: ", padding_len)
            if padding_len > 0:
                shape = matrix[0].shape
                zero_vector = np.zeros(shape)
                for _ in range(0, padding_len):
                    matrix = np.vstack([matrix, zero_vector])
        return matrix

class SentenceToEmbeddingWithEPSILON:
    def __init__(self, word_to_idx, idx_to_word, word_to_vect):
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_to_vect = word_to_vect

    def map_sentence(self, sentence, max_len = 0):
        S = SentenceToIndices(self.word_to_idx)
        matrix = None
        mapped_sentence = S.map_sentence(sentence)
        for i in mapped_sentence:
            e = self.word_to_vect[i]
            if matrix is None:
                matrix = np.array(e)
            else:
                matrix = np.vstack([matrix, e])
        if max_len > 0:
            padding_len = max_len - len(mapped_sentence)
            #print("max_len: ", max_len)
            #print("len(mapped_sentence): ", len(mapped_sentence))
            #print("padding: ", padding_len)
            if padding_len > 0:
                shape = matrix[0].shape
                if (len(mapped_sentence) == 1):
                    shape = (50,)
                zero_vector = np.ones(shape) * EPSILON_VALUE
                for _ in range(0, padding_len):
                    matrix = np.vstack([matrix, zero_vector])

        return matrix

class TrimSentences:
    def __init__(self, trim_size):
        self.trim_size = trim_size

    def trim(self, sentence):
        if len(sentence) > self.trim_size:
            return sentence[:self.trim_size]
        else:
            return sentence

    def trim_list(self, sentence_list):
        result = []
        for s in sentence_list:
            temp = self.trim(s)
            result.append(temp)
        return result
import numpy as np

EPSILON_VALUE = 0.000000001

class UnkWords:
    def __init__(self, word_to_idx):
        pass

class SentenceToIndices:

    def __init__(self, word_to_idx):
        self.word_to_idx = word_to_idx

    def map_sentence(self, sentence):
        result = []
        words = sentence.split()
        for w in words:
            if w in self.word_to_idx:
                result.append(self.word_to_idx[w])
            else:
                result.append(self.word_to_idx["<unk>"])
        return result

    def map_sentence_list(self, sentence_list):
        result = []
        max_len = 0
        counter_len = 0.0
        total_len = 0.0
        for s in sentence_list:
            mapped = self.map_sentence(s)
            mapped_len = len(mapped)
            counter_len = counter_len + 1
            total_len = total_len + mapped_len
            if mapped_len > max_len:
                max_len = mapped_len
                max_s = s
            result.append(mapped)
        print("max_len: ", max_len)
        print("avg_len: ", total_len/counter_len)
        return result, max_len

class PadSentences:
    def __init__(self, max_len):
        self.max_len = max_len

    def pad(self, sentence):
        padding_len = self.max_len - len(sentence)
        padding = []
        if (padding_len > 0 ):
            r = range(0, padding_len)
            for _  in r:
                padding.append(0)
        return sentence + padding

    def pad_list(self, sentence_list):
        result = []
        for s in sentence_list:
            result.append(self.pad(s))
        return result


class SentenceToEmbedding:
    def __init__(self, word_to_idx, idx_to_word, word_to_vect):
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_to_vect = word_to_vect

    def map_sentence(self, sentence, max_len = 0):
        S = SentenceToIndices(self.word_to_idx)
        matrix = None
        mapped_sentence = S.map_sentence(sentence)
        for i in mapped_sentence:
            e = self.word_to_vect[i]
            if matrix is None:
                matrix = np.array(e)
            else:
                matrix = np.vstack([matrix, e])
        if max_len > 0:
            padding_len = max_len - len(mapped_sentence)
            #print("max_len: ", max_len)
            #print("len(mapped_sentence): ", len(mapped_sentence))
            #print("padding: ", padding_len)
            if padding_len > 0:
                shape = matrix[0].shape
                zero_vector = np.zeros(shape)
                for _ in range(0, padding_len):
                    matrix = np.vstack([matrix, zero_vector])
        return matrix

class SentenceToEmbeddingWithEPSILON:
    def __init__(self, word_to_idx, idx_to_word, word_to_vect):
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_to_vect = word_to_vect

    def map_sentence(self, sentence, max_len = 0):
        S = SentenceToIndices(self.word_to_idx)
        matrix = None
        mapped_sentence = S.map_sentence(sentence)
        for i in mapped_sentence:
            e = self.word_to_vect[i]
            if matrix is None:
                matrix = np.array(e)
            else:
                matrix = np.vstack([matrix, e])
        if max_len > 0:
            padding_len = max_len - len(mapped_sentence)
            #print("max_len: ", max_len)
            #print("len(mapped_sentence): ", len(mapped_sentence))
            #print("padding: ", padding_len)

            if padding_len > 0:
                shape = matrix[0].shape
                if(len(mapped_sentence)==1):
                    shape=(50, )
                zero_vector = np.ones(shape) * EPSILON_VALUE
                for _ in range(0, padding_len):
                    matrix = np.vstack([matrix, zero_vector])
        return matrix

class TrimSentences:
    def __init__(self, trim_size):
        self.trim_size = trim_size

    def trim(self, sentence):
        if len(sentence) > self.trim_size:
            return sentence[:self.trim_size]
        else:
            return sentence

    def trim_list(self, sentence_list):
        result = []
        for s in sentence_list:
            temp = self.trim(s)
            result.append(temp)
        return result


In [0]:
import csv

class ErrorAnalysis:
    @staticmethod
    def store_errors(X, Y, Y_Pred, file_name):
        errors = Y != Y_Pred
        errors = errors * 1 # trick to convert bool to 0s and 1s
        i = 0
        with open("/content/drive/My Drive/Kristalys/" + file_name, "w") as f:
            out_f = csv.writer(f, delimiter=' ')
            for e in errors:
                if e:
                    data = []
                    tweet = X[i]
                    label = Y[i]
                    pred = Y_Pred[i]
                    data.append(tweet)
                    data.append(label)
                    data.append(pred)
                    out_f.writerow(data)

                i = i + 1
            f.flush()


In [0]:
from keras import backend as K


def precision(y_true, y_pred):


    #predicted positives
    predictions = K.round(y_pred)
    predicted_positives = K.sum(predictions)

    #true positives
    true_positives = K.sum(K.round(y_true * predictions))
    P = true_positives / (predicted_positives + K.epsilon())
    return P

def recall(y_true, y_pred):


    #predicted positives
    predictions = K.round(y_pred)

    #all positives
    all_positives = K.sum(y_true)

    #true positives
    true_positives = K.sum(K.round(y_true * predictions))

    R = true_positives / all_positives
    return R

def f1(y_true, y_pred):


    P = precision(y_true, y_pred)
    R = recall(y_true, y_pred)
    return 2*((P*R)/(P+R+K.epsilon()))


# def f1(y_true, y_pred):
#     def recall(y_true, y_pred):
#         """Recall metric.
#
#         Only computes a batch-wise average of recall.
#
#         Computes the recall, a metric for multi-label classification of
#         how many relevant items are selected.
#         """
#         print("y_true ", y_true.eval())
#         print("y_pred ", y_pred.eval())
#
#         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
#         recall = true_positives / (possible_positives + K.epsilon())
#         return recall
#
#     def precision(y_true, y_pred):
#         """Precision metric.
#
#         Only computes a batch-wise average of precision.
#
#         Computes the precision, a metric for multi-label classification of
#         how many selected items are relevant.
#         """
#         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#         precision = true_positives / (predicted_positives + K.epsilon())
#         return precision
#     precision = precision(y_true, y_pred)
#     recall = recall(y_true, y_pred)
#     return 2*((precision*recall)/(precision+recall+K.epsilon()))
#
#
# def recall(y_true, y_pred):
#     """Recall metric.
#
#     Only computes a batch-wise average of recall.
#
#     Computes the recall, a metric for multi-label classification of
#     how many relevant items are selected.
#     """
#     print("y_true ", y_true)
#     print("y_pred ", y_pred)
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
#     recall = true_positives / (possible_positives + K.epsilon())
#     return recall
#
# def precision(y_true, y_pred):
#     """Precision metric.
#
#     Only computes a batch-wise average of precision.
#
#     Computes the precision, a metric for multi-label classification of
#     how many selected items are relevant.
#     """
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     precision = true_positives / (predicted_positives + K.epsilon())
#     return precision
#
# def tprate(y_true, y_pred):
#     return recall(y_true, y_pred)
#
# def fprate2(y_true, y_pred):
#     #invert true and negative so negative is 1  and true is 1.
#     y_true = 1 - y_true
#     #invert predictions so that we get 1 for the predictions originally set to 0
#     y_pred = 1 - y_pred
#     return recall(y_true, y_pred)

def fprate(y_true, y_pred):
    #predicted positives
    predictions = K.round(y_pred)
    predicted_positives = K.sum(predictions)

    #all positives
    all_positives = K.sum(y_true)

    #true positives
    true_positives = K.sum(K.round(y_true * predictions))

    false_positive = predicted_positives - true_positives

    #negatives
    y_false = 1 - y_true

    all_negatives = K.sum(y_false)
    fpr = false_positive / (all_negatives + K.epsilon())
    return fpr


# def fprate(y_true, y_pred):
#     # true positives
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     # predicted_positives = true_positives + false_positives
#     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     # false_positives
#     false_positive = predicted_positives - true_positives
#     # Now work on negatives
#     y_false = 1 - y_true
#     possible_negatives = K.sum(K.round(K.clip(y_false, 0, 1)))
#     fprate = false_positive / (possible_negatives  + K.epsilon())
#     return fprate

def accuracy(y_true, y_pred):
    n_samples = len(y_pred)
    correct = 1 * (y_true == y_pred)
    return sum(correct) / n_samples


def calculate_cm_metrics(c_matrix, track):
    prec_0 = c_matrix[0][0] / (c_matrix[0][0] + c_matrix[1][0] + c_matrix[2][0])
    prec_1 = c_matrix[1][1] / (c_matrix[1][1] + c_matrix[0][1] + c_matrix[2][1])
    prec_2 = c_matrix[2][2] / (c_matrix[2][2] + c_matrix[1][2] + c_matrix[0][2])

    recall_0 = c_matrix[0][0] / (c_matrix[0][0] + c_matrix[0][1] + c_matrix[0][2])
    recall_1 = c_matrix[1][1] / (c_matrix[1][1] + c_matrix[1][0] + c_matrix[1][2])
    recall_2 = c_matrix[2][2] / (c_matrix[2][2] + c_matrix[2][0] + c_matrix[2][1])

    f1_0 = 2 * ((prec_0 * recall_0) / (prec_0 + recall_0))
    f1_1 = 2 * ((prec_1 * recall_1) / (prec_1 + recall_1))
    f1_2 = 2 * ((prec_2 * recall_2) / (prec_2 + recall_2))

    tn_0 = c_matrix[1][1] + c_matrix[1][2] + c_matrix[2][1] + c_matrix[2][2]
    tn_1 = c_matrix[0][0] + c_matrix[0][2] + c_matrix[2][0] + c_matrix[2][2]
    tn_2 = c_matrix[0][0] + c_matrix[0][1] + c_matrix[1][0] + c_matrix[1][1]

    spec_0 = tn_0 / (tn_0 + c_matrix[1][0] + c_matrix[2][0])
    spec_1 = tn_1 / (tn_1 + c_matrix[0][1] + c_matrix[2][1])
    spec_2 = tn_2 / (tn_2 + c_matrix[0][2] + c_matrix[1][2])

    t = track + ("Precision 0: {}\n" 
                "Precision 1: {}\n"
                "Precision 2: {}\n"
                "Recall 0: {}\n"
                "Recall 1: {}\n"
                "Recall 2: {}\n"
                "F1 Score 0: {}\n"
                "F1 Score 1: {}\n"
                "F1 Score 2: {}\n"
                "Specificity 0: {}\n"
                "Specificity 1: {}\n"
                "Specificity 2: {}\n").format(prec_0, prec_1, prec_2, recall_0, recall_1, recall_2, f1_0, f1_1,
                                                f1_2, spec_0, spec_1, spec_2)
    return prec_1, recall_1, f1_1, spec_1, t


In [0]:
import csv
import math

import matplotlib.pyplot as plt
import numpy as np
from keras.callbacks import TensorBoard
from keras.optimizers import SGD, Adam, RMSprop
from keras.regularizers import l2
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight



class ProcessTweetsCNN:
    def __init__(self, labeled_tweets_filename, embedding_filename):
        self.labeled_tweets_filename = labeled_tweets_filename
        self.embedding_filename = embedding_filename

    def plot(self, history):
        # summarize history for accuracy
        plt.figure(1)
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        # summarize history for loss
        plt.figure(2)
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

    def process(self, json_filename, h5_filename, plot=False, epochs = 100, vect_dimensions = 50):
        # open the file with tweets
        X_all = []
        Y_all = []
        All  = []

        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r", encoding="utf-8") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        np.random.shuffle(All)

        ones_count = 0
        two_count = 0
        zero_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if (label == 0):
                zero_count += 1
            elif (label == 1):
                ones_count += 1
            else:
                two_count += 1
            # if (label == 2):
            #     label = 0
            # if (label == 1) and (ones_count <= 4611):
            #     X_all.append(tweet)
            #     Y_all.append(label)
            #     ones_count +=1
            # elif (label == 0):
            X_all.append(tweet)
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        class_weight_val = class_weight.compute_class_weight('balanced', np.unique(Y_all), Y_all)
        print("classes: ", np.unique(Y_all))
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        class_weight_dictionary = {0: class_weight_val[0], 1: class_weight_val[1], 2: class_weight_val[2]}
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.80)
        X_train_sentences = X_all
        Y_train = Y_all
        # Divide after conversions
        # divide the data into X_train, Y_train, X_test, Y_test
        #X_train_sentences = X_all[0: limit]
        #Y_train = Y_all[0: limit]
        #X_test_sentences = X_all[limit:]
        #Y_test = Y_all[limit:]
        #print("Data Divided")

        #Get embeeding
        #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)

        G = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len  = S.map_sentence_list(X_train_sentences)
        print("Train data mappend to indices")
        if max_len % 2 !=0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # TRIM
        trim_size = max_len
        #trim_size = 33
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])
        #convert to numPY arrays
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("two count: ", two_count)
        Y_train_old = Y_train
        Y_train = to_categorical(Y_train, num_classes=3)

        # Divide the data
        X_test_text = X_all[limit:]
        X_test = X_train[limit:]
        Y_test = Y_train[limit:]
        X_train = X_train[0: limit]
        Y_train = Y_train[0: limit]
        print ("data divided on value: ", limit)
        print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
        print("lengths X_test, Y_test: ", len(X_test), len(Y_test))

        print("Train data convert to numpy arrays")
        #NN = TweetSentiment2DCNN(trim_size, G)
        #NN = TweetSentiment2LSTM2Dense(trim_size, G)
        #NN =TweetSentiment2LSTM2Dense3Layer(trim_size, G)
        #NN =TweetSentiment2LSTM2Dense4Layer(trim_size, G)
        NN = TweetSentimentInceptionOneChan(trim_size, G)
        #NN = TweetSentimentCNN(trim_size, G)
        #print("Build GRU")
        #NN = TweetSentimentGRUSM(max_len, G)

        print("model created")
        kernel_regularizer = l2(0.001)
        #kernel_regularizer = None
        NN.build(filters=11, first_dropout=0, second_dropout=0.05, padding='valid', dense_units=16)

        #NN.build(first_layer_units = max_len, second_layer_units = max_len, relu_dense_layer=16, dense_layer_units = 3,
        #         first_layer_dropout=0, second_layer_dropout=0, third_layer_dropout=0)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        sgd = SGD(lr=0.05)
        NN.compile(optimizer='adam', loss="categorical_crossentropy", metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        #class_weight = None
        history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32, callbacks=[callback], class_weight=class_weight_dictionary)
        print("Model trained")
        print("Predicting")
        print("len(X_test): ", X_test)
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        Y_test = Y_train_old[limit:]
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
        print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

        #
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola", "there is a diarrhea outbreak in the city"]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i =0
        for s in X_Predict_Idx:
            print(str(i)+ ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")



In [55]:
from google.colab import drive
drive.mount('/content/drive')

def main():

    with open('/content/drive/My Drive/foo.txt', 'w') as f:
      f.write('Hello Google Drive!')
    !cat /content/drive/My\ Drive/foo.txt


    print("Working:")
    #P = ProcessTweetsWord2VecOnePass2DCNNv2_1("data/cleantextlabels3.csv", "trained/embedding3.csv")
    P = ProcessTweetsCNN('/content/drive/My Drive/Kristalys/cleantextlabels7.csv', '/content/drive/My Drive/Kristalys/glove.6B.50d.txt')
    #P = ProcessTweetsWord2VecTwoPassLSTMv2_1("data/cleantextlabels4.csv", "trained/embedding3-50d.csv")

    #Bueno el model12cnnv2
    # Excelente el de modellstmatt1 con attention
    # El mejor fue modellstmatt2 con attention
    # also good modellstmatt3
    # el 4 con dropout
    # 2, 3 y 4 son buenos
    P.process("modelcnnincepw6.json", "modelcnnincepw6.h5", plot=False, epochs=20)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
main()

Hello Google Drive!Working:
len(Y_all):  12500
classes:  [0 1 2]
counts for 0, 1, 2:  3850 7917 733
class weight_val:  [1.08225108 0.52629363 5.684402  ]
dict:  {0: 1.0822510822510822, 1: 0.5262936297419056, 2: 5.684402000909504}
Data Ingested
max_len:  72
avg_len:  34.21232
Train data mappend to indices
Train data padded
X[0],  [3203, 1607, 57, 5169, 16243, 782, 26, 26933, 7917, 18, 1628, 1145, 118, 13, 1120, 4780, 739, 82, 33, 8, 158, 248879, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
ones count:  8650
zeros count:  3850
two count:  733
data divided on value:  10000
lengths X_train, Y_train:  10000 10000
lengths X_test, Y_test:  2500 2500
Train data convert to numpy arrays
model created
model built
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Co