### Set up environment, access to Google Drive, import libraries

In [0]:
# Mount Google Drive in the Colab runtime at /content/drive; the project paths
# used further down in this notebook all start with this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%tensorflow_version 2.x

In [0]:
import tensorflow as tf
# Name of the GPU device as reported by TensorFlow
device_name = tf.test.gpu_device_name()
# Abort the notebook early unless the reported device is exactly the first GPU
if device_name != '/device:GPU:0':
  raise SystemError('Problem with GPU device')

### Import libraries


In [0]:
# A __future__ import has to remain the first statement of the cell.
from __future__ import print_function
import sys, os
# Put the project folder of the mounted Drive at the front of the module search
# path; presumably this is where `helpers` and `models_LSTM` live — TODO confirm.
sys.path.insert(0, os.path.abspath('/content/drive/My Drive/ML_Project_2/'))
import numpy as np
from helpers import *
import tensorflow as tf
from models_LSTM import *
# NOTE(review): Embedding is imported from TensorFlow's bundled Keras while
# pad_sequences comes from the standalone `keras` package — confirm that mixing
# the two is intended.
from tensorflow.python.keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
import re

# NOTE(review): the function is only referenced, not called (no parentheses),
# so this statement has no effect.
tf.compat.v1.get_default_graph


In [0]:
# Root folder of the project on the mounted Drive. It is passed to get_raw_data
# and used as the prefix of the GloVe file path and of the submission file path.
path_g = "/content/drive/My Drive/ML_Project_2/"

### Define useful functions


In [0]:
def not_treatment(data_train, data_val, data_test):
    """
    Apply negation processing

    Every tweet is turned into one space-separated string, then each negative
    contraction is replaced by the word "not": first the forms written with an
    apostrophe ("don't", "isn't", ...), then a fixed list of spellings without
    apostrophe ("dont", "isnt", ...).

    The input containers are never modified; new lists are returned. This makes
    the function safe to call with the same object in several positions and
    with containers that do not support item assignment.

    :param data_train: Training dataset; each tweet is a sequence of tokens or an already joined string
    :param data_val: Validation dataset, same format (may be empty)
    :param data_test: Test dataset, same format
    :return: Training, validation and test dataset with negation processing, each as a list of strings
    """
    # Apostrophe-less negative contractions that are mapped to "not"
    to_not = ('havent', 'doesnt', 'cant', 'dont', 'shouldnt', 'arent', 'couldnt', 'didnt', 'hadnt', 'mightnt',
              'mustnt', 'neednt', 'wasnt', 'wont', 'wouldnt', 'isnt', 'werent')

    # "<word>n't" plus an optional trailing whitespace, replaced by "not "
    apostrophe_form = re.compile(r"\w+n't\s?")
    # One alternation instead of one pass per word: the replacement "not" is
    # not in the list and matches are whole words, so the result is the same
    # as substituting the words one after the other.
    bare_form = re.compile(r'\b(?:' + '|'.join(to_not) + r')\b')

    def process(tweet):
        # A tweet that is already a string must not be joined again, otherwise
        # its characters would end up separated by spaces.
        text = tweet if isinstance(tweet, str) else ' '.join(tweet)
        text = apostrophe_form.sub('not ', text)
        return bare_form.sub('not', text)

    processed_train = [process(tweet) for tweet in data_train]
    processed_val = [process(tweet) for tweet in data_val]
    processed_test = [process(tweet) for tweet in data_test]

    return processed_train, processed_val, processed_test


def tokenize(data_train, data_val, data_test, len_max_tweet, n_dim):
    """
    Tokenize tweets and build a frozen embedding layer from GloVe vectors

    The tokenizer is fitted on the training set only and then applied to the
    three datasets. The GloVe Twitter vectors of dimension ``n_dim`` are read
    from ``path_g + 'glove/'``; words of the training vocabulary without a
    GloVe vector keep an all-zero row.

    :param data_train: Training dataset (texts)
    :param data_val: Validation dataset (texts, may be empty)
    :param data_test: Test dataset (texts)
    :param len_max_tweet: Maximum length of the tweets, used as input length of the embedding layer
    :param n_dim: Embedding dimension (selects the GloVe file and the matrix width)
    :return: Tokenized training, validation and test dataset (lists of integer sequences) and the embedding layer
    """

    # Create a tokenizer instance; num_words is a count, so pass an integer
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=int(1e6))

    # Fit the tokenizer on the training set
    tokenizer.fit_on_texts(data_train)

    # Tokenize data
    data_train = tokenizer.texts_to_sequences(data_train)
    data_val = tokenizer.texts_to_sequences(data_val)
    data_test = tokenizer.texts_to_sequences(data_test)

    # Compute vocabulary size (+1 because the word indices start at 1, row 0 stays unused)
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1

    # Fill the matrix while streaming the file: only vectors of words present
    # in the training vocabulary are kept, so the whole GloVe vocabulary never
    # has to be held in memory. If a word occurs twice in the file, the last
    # occurrence wins.
    embedding_matrix = np.zeros((vocab_size, n_dim))
    glove_path = path_g + 'glove/glove.twitter.27B.' + str(n_dim) + 'd.txt'

    # The context manager closes the file even if a line fails to parse
    with open(glove_path, encoding="utf8") as glove_file:
        for line in glove_file:
            records = line.split()
            # Skip blank or malformed lines (not exactly one word followed by n_dim values)
            if len(records) != n_dim + 1:
                continue
            index = word_index.get(records[0])
            if index is not None:
                embedding_matrix[index] = np.asarray(records[1:], dtype='float32')

    # Pre-trained weights are kept fixed during training
    embedding_layer = Embedding(vocab_size, n_dim, weights=[embedding_matrix], input_length=len_max_tweet,
                                trainable=False)

    return data_train, data_val, data_test, embedding_layer

### Load data

In [0]:
print("Loading Data ...")
# Load full dataset or not: 'f' or 'nf'
full = 'f'
# True: load the pre-processed .npy arrays; False: load the raw data through get_raw_data
processed = False

if processed:
    # Build the paths from path_g so the Drive root is defined in one place only
    processed_dir = path_g + 'Processed_Data/'
    # NOTE(review): allow_pickle=True unpickles arbitrary objects while loading;
    # only use it with files you created yourself.
    data_train = np.load(processed_dir + 'data_train_pr_' + full + '_sl5' + '.npy', allow_pickle=True)
    data_test = np.load(processed_dir + 'data_test_pr_sl5' + '.npy', allow_pickle=True)
    labels = np.load(processed_dir + 'labels_train_' + full + '_sl5.npy')
    dataset_type = 'processed'

else:
    data_train, labels, data_test = get_raw_data(path_g, full)
    dataset_type = 'raw'

# If labels are -1 instead of 0
labels = np.where(labels == -1, 0, labels)

In [0]:
# Shuffle tweets and labels with one shared random permutation so that every
# tweet keeps its own label
perm = np.random.permutation(data_train.shape[0])
data_train, labels = data_train[perm], labels[perm]

# Number of training samples to keep; a value that is not positive keeps them all
n_train = -1

if n_train > 0:
    data_train, labels = data_train[:n_train], labels[:n_train]


In [0]:
print("Computing maximal length of tweets", flush=True)

# Longest tweet over the training and the test set, measured with len() on
# each tweet; used further down as padding length and embedding input length
len_max_tweet = max(
    np.max([len(tweet) for tweet in data_train]),
    np.max([len(tweet) for tweet in data_test]),
)

In [0]:
print("Start to convert negative words", flush=True)

# Negation processing. The validation result is discarded here, so an empty
# list is passed in that position instead of a copy of the test set: this
# avoids copying and processing the whole test set a second time.
data_train, _, data_test = not_treatment(data_train, [], data_test)

In [0]:
print("Start to tokenize", flush=True)

# Embedding dimension; also selects which GloVe file is read by tokenize
n_dim = 200
# The validation result is discarded here, so an empty list is passed in that
# position instead of tokenizing a copy of the test set a second time.
data_train, _, data_test, embedding_layer = tokenize(data_train, [], data_test, len_max_tweet, n_dim)

# Bring every sequence to len_max_tweet entries, zeros being appended at the end
data_train = pad_sequences(data_train, padding='post', maxlen=len_max_tweet)
data_test = pad_sequences(data_test, padding='post', maxlen=len_max_tweet)

In [0]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# Neural network parameters: filters_lstm is handed to the model builder,
# batch_size and epochs drive the training loop
filters_lstm = 400
batch_size = 64
epochs = 5

# Build the network on top of the pre-trained embedding layer and show its layout
model = build_model_lstm_emb_(filters_lstm, embedding_layer)
model.summary()

# Train on the whole (shuffled) training set
model.fit(data_train, labels, batch_size=batch_size, epochs=epochs, verbose=1)



### Make prediction and generate a submission file

In [0]:
# Sequential.predict_classes is no longer available in recent TensorFlow 2.x
# releases, so the class is derived from predict() with the same rule it used:
# argmax over several outputs, 0.5 threshold for a single output.
probabilities = model.predict(data_test, batch_size=batch_size)
if probabilities.shape[-1] > 1:
    y_pred = probabilities.argmax(axis=-1)
else:
    y_pred = (probabilities > 0.5).astype('int32')
y_pred = y_pred.flatten()

# Replace for submission: class 0 is written as -1
y_pred = np.where(y_pred == 0, -1, y_pred)


In [0]:
# Path of the submission file without its extension
csv_name = path_g + 'sub_LSTM'

# The '.csv' extension is appended only for the call to create_csv_submission
create_csv_submission(y_pred, csv_name + '.csv')
# NOTE(review): the name is printed without the '.csv' extension
print("Output name:", csv_name)