## Set Up

In [None]:
## imports (if tensorflow.keras.xxx doesn't work, try just keras.xxx)

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam, SGD
import tensorflow as tf
import tensorflow.keras.backend as K
import gensim
import nltk
import pandas as pd
import numpy as np

In [None]:
## hyper-parameters
# Tunable constants shared by the data-preparation, model and training cells.

max_features = 2000  # only referenced by the commented-out Embedding layer in the model cell
maxlen = 12  # number of word vectors kept per sentence when sentences are embedded
batch_size = 128  # passed to model.fit
embedding_dims = 300  # used for the model input length (embedding_dims * maxlen); should equal the word2vec vector_size
filters = 60  # number of Conv1D filters
kernel_size = 30  # Conv1D window, measured in scalar positions of the flattened sentence vector
hidden_dims = 200  # units of the hidden Dense layer
epochs = 20  # passed to model.fit

In [None]:
## load pre-trained word2vec model, see notes.txt for how to download the .bin file
# The path is relative to the notebook's working directory.

w2v_model_name = '../GoogleNews-vectors-negative300.bin'
# binary=True: the .bin file is read in the binary word2vec format
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_name, binary = True)

In [None]:
## GPU configuration
# Let TensorFlow allocate GPU memory on demand instead of reserving all of it
# up front. The setting is applied to every visible GPU (TensorFlow expects
# memory growth to be configured consistently across GPUs), and the notebook
# keeps running on CPU-only machines instead of aborting.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if not physical_devices:
    print("No GPU detected; continuing on CPU.")
for gpu_device in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu_device, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialized, e.g. when this cell
        # is re-run after a model has been built in the same kernel.
        print("Could not enable memory growth for", gpu_device, "-", err)

## Helper Functions

In [None]:
def clean_text(text_arr):
    '''tokenize sentences and keep only lowercase, alphabetic, non-stopword tokens

    Each sentence is split with nltk's TweetTokenizer; every token has
    leading/trailing '#' characters stripped and is lowercased before it is
    checked.

    Args:
        text_arr: list of strings, each representing sentences

    Returns:
        2d list of strings: one list of surviving words per input sentence
        (a sentence with no surviving words yields an empty list)
    '''
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    tweet_tokenizer = nltk.tokenize.TweetTokenizer()

    def _normalize(token):
        # strip '#' so that hashtags are treated like ordinary words
        return token.strip('#').lower()

    def _keep(token):
        return token.isalpha() and token not in english_stopwords

    cleaned_sentences = []
    for raw_sentence in text_arr:
        normalized = (_normalize(tok) for tok in tweet_tokenizer.tokenize(raw_sentence))
        cleaned_sentences.append([tok for tok in normalized if _keep(tok)])
    return cleaned_sentences


def text_to_vec_concatenation(text_arr, embedding_model, maxlen):
    '''embed sentences to vectors by concatenating word vectors together

    Words that are missing from the embedding model are skipped. Only the
    first `maxlen` known words of a sentence are used; shorter sentences are
    zero-padded at the end.

    Args:
        text_arr: 2d list of strings, each representing words
        embedding_model: word-vector lookup supporting `word in model`,
            `model[word]` and a `vector_size` attribute (e.g. gensim
            KeyedVectors)
        maxlen: maximum number of word vectors kept per sentence

    Returns:
        float32 numpy array of shape
        (number of sentences, word vector_size*maxlen)
    '''
    sentences = list(text_arr)
    vector_size = embedding_model.vector_size
    np_array = np.zeros((len(sentences), maxlen * vector_size), dtype='float32')

    for row, sentence in zip(np_array, sentences):
        used_slots = 0
        for word in sentence:
            if used_slots >= maxlen:
                # everything past maxlen known words is truncated
                break
            # `word in embedding_model` works with gensim 3.x and 4.x alike,
            # whereas the `.vocab` attribute was removed in gensim 4
            if word in embedding_model:
                start = used_slots * vector_size
                row[start:start + vector_size] = embedding_model[word]
                used_slots += 1
    return np_array

def recall_m(y_true, y_pred):
    '''recall metric computed over the tensors passed in

    Args:
        y_true: tensor of ground-truth labels
        y_pred: tensor of predicted scores

    Returns:
        scalar tensor: (rounded true positives) / (rounded actual positives),
        with K.epsilon() added to the denominator to avoid division by zero
    '''
    # clip to [0, 1] and round so that every entry counts as either 0 or 1
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    '''precision metric computed over the tensors passed in

    Args:
        y_true: tensor of ground-truth labels
        y_pred: tensor of predicted scores

    Returns:
        scalar tensor: (rounded true positives) / (rounded predicted
        positives), with K.epsilon() added to the denominator to avoid
        division by zero
    '''
    # clip to [0, 1] and round so that every entry counts as either 0 or 1
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    '''F1 metric: harmonic mean of precision_m and recall_m

    Args:
        y_true: tensor of ground-truth labels
        y_pred: tensor of predicted scores

    Returns:
        scalar tensor 2*p*r / (p + r + K.epsilon())
    '''
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # epsilon keeps the ratio finite when both precision and recall are zero
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

def f1_loss(y_true, y_pred):
    '''soft F1 loss: 1 - mean F1 computed from un-rounded predictions

    True/false positives and false negatives are accumulated along axis 0
    from the raw products of labels and predicted scores (no rounding), so
    the result stays differentiable with respect to y_pred.

    Args:
        y_true: tensor of ground-truth labels
        y_pred: tensor of predicted scores

    Returns:
        scalar tensor, 1 - mean(F1)
    '''
    # cast BEFORE multiplying so that integer labels do not trigger a dtype
    # mismatch against float predictions
    y_true = K.cast(y_true, 'float32')
    y_pred = K.cast(y_pred, 'float32')

    tp = K.sum(y_true * y_pred, axis=0)
    fp = K.sum((1 - y_true) * y_pred, axis=0)
    fn = K.sum(y_true * (1 - y_pred), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    # guard against NaNs (e.g. propagated from NaN inputs) poisoning the mean
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [None]:
## load data

# read the training set from .csv, clean the text column and pull out the labels
df_train = pd.read_csv('../train.csv')
x_train_str = clean_text(df_train['text'].tolist())
y_train = df_train['target'].to_numpy(dtype='float32')

# turn every cleaned sentence into one long vector of concatenated word2vec embeddings
x_train = text_to_vec_concatenation(x_train_str, w2v_model, maxlen)
print(x_train.shape)

In [None]:
# 1D CNN over the flattened word2vec sentence vector.
# (An alternative with a trainable Embedding(max_features, embedding_dims,
# input_length=maxlen) followed by Dropout(0.2) was tried instead of word2vec
# and is no longer used.)
model = Sequential([
    # using word2vec: one channel, embedding_dims*maxlen positions per sentence
    Input(shape=(embedding_dims*maxlen, 1)),

    Conv1D(filters=filters,
           kernel_size=kernel_size,
           strides=1,
           padding='valid',
           activation='relu'),
    GlobalMaxPooling1D(),

    # hidden fully-connected layer
    Dense(units=hidden_dims),
    Dropout(rate=0.2),
    Activation('relu'),

    # single sigmoid output for binary classification
    Dense(units=1),
    Activation('sigmoid'),
])

In [None]:
## compile and train

# `learning_rate` is the supported keyword; the legacy `lr` alias is
# deprecated and rejected by newer Keras releases
optimizer = Adam(learning_rate=0.0005)

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy', f1_m])

# Conv1D expects a trailing channel axis, hence the expand_dims.
# The returned History object is kept so the training curves can be inspected later.
history = model.fit(np.expand_dims(x_train, axis=2),
                    y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.1)

In [None]:
## test

# read the test set and clean it exactly like the training text
df_test = pd.read_csv('../test.csv')
x_test_str = clean_text(df_test['text'].tolist())

# embed the cleaned sentences with the same word2vec model and length limit
x_test = text_to_vec_concatenation(x_test_str, w2v_model, maxlen)

# predict, threshold the scores at 0.5 and write the submission file
sample_submission = pd.read_csv("../sample_submission.csv")
test_scores = model.predict(x_test[..., np.newaxis])
sample_submission["target"] = np.where(test_scores > 0.5, 1, 0)
sample_submission.to_csv("submission.csv", index=False)