In [None]:
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras import regularizers
import tensorflow as tf
import tensorflow.keras.backend as K
import pandas as pd
import nltk
import numpy as np

In [None]:
# Let TensorFlow allocate GPU memory on demand rather than up front.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# TensorFlow requires the memory-growth setting to be the same on every
# visible GPU, so apply it to all devices instead of only the first one
# (configuring a single device fails on multi-GPU hosts).
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [None]:
# Hyperparameters shared by the preprocessing, model and training cells.
max_features = 2000  # passed as Tokenizer num_words and as the Embedding input size
maxlen = 15  # every sequence is padded/truncated to this length
batch_size = 128  # batch size passed to model.fit
embedding_dims = 30  # Embedding output dimension
filters = 60  # number of Conv1D filters
kernel_size = 3  # Conv1D kernel size
hidden_dims = 200  # units of the hidden Dense layer
epochs = 30  # number of epochs passed to model.fit
l2_loss_lambda = None  # L2 factor for the Conv1D kernel; None means no regularizer

In [None]:
def clean_text(text_arr):
    '''Tokenize sentences and keep only alphabetic, non-stopword tokens.

    Each sentence is split with NLTK's TweetTokenizer. Every token has any
    leading/trailing '#' characters stripped and is lower-cased; it is kept
    only if the result is purely alphabetic and is not an English stopword.

    Args:
        text_arr: iterable of strings, one sentence per element

    Returns:
        list of lists of strings: the surviving words of each sentence, in
        the same order as the input sentences
    '''
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    tweet_tokenizer = nltk.tokenize.TweetTokenizer()

    cleaned_sentences = []
    for raw_sentence in text_arr:
        # Normalise first so the filters below see the final form of each token.
        normalised = (
            token.strip('#').lower()
            for token in tweet_tokenizer.tokenize(raw_sentence)
        )
        cleaned_sentences.append([
            candidate
            for candidate in normalised
            if candidate.isalpha() and candidate not in english_stopwords
        ])
    return cleaned_sentences

def recall_m(y_true, y_pred):
    '''Recall of the given tensors: true positives / actual positives.

    Values are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to the denominator to avoid division by zero.
    '''
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    '''Precision of the given tensors: true positives / predicted positives.

    Values are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to the denominator to avoid division by zero.
    '''
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    '''F1 score: harmonic mean of precision_m and recall_m.

    K.epsilon() is added to the denominator so the result is defined when
    both precision and recall are zero.
    '''
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

def macro_soft_f1(y, y_hat):
    '''Differentiable (soft) F1 cost averaged over labels.

    Both inputs are cast to float32. Soft true-positive, false-positive and
    false-negative totals are summed along axis 0 directly from y_hat
    (no thresholding), and the returned value is the mean of
    1 - soft-F1, so minimising it increases soft-F1.

    Args:
        y: ground-truth labels
        y_hat: predicted scores

    Returns:
        scalar tensor holding the mean (1 - soft-F1) cost
    '''
    targets = tf.cast(y, tf.float32)
    scores = tf.cast(y_hat, tf.float32)

    soft_tp = tf.reduce_sum(scores * targets, axis=0)
    soft_fp = tf.reduce_sum(scores * (1 - targets), axis=0)
    soft_fn = tf.reduce_sum((1 - scores) * targets, axis=0)

    # The tiny constant keeps the ratio defined when all three totals are zero.
    denominator = 2*soft_tp + soft_fn + soft_fp + 1e-16
    per_label_cost = 1 - 2*soft_tp / denominator
    return tf.reduce_mean(per_label_cost)

In [None]:
# Labelled training data: cleaned token lists plus the target array.
df_train = pd.read_csv('../train.csv')
x_train_str = clean_text(list(df_train['text']))
y_train = np.asarray(list(df_train['target']))

# Test data goes through the same cleaning.
df_test = pd.read_csv('../test.csv')
x_test_str = clean_text(list(df_test['text']))

# The vocabulary is fitted on the training text only and then used to
# encode both splits as integer sequences.
t = text.Tokenizer(num_words=max_features)
t.fit_on_texts(x_train_str)
x_train_seq = t.texts_to_sequences(x_train_str)
x_test_seq = t.texts_to_sequences(x_test_str)

# Bring every sequence to exactly `maxlen` entries, padding and truncating
# at the end ('post').
x_train = sequence.pad_sequences(
    x_train_seq,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)
x_test = sequence.pad_sequences(
    x_test_seq,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

In [None]:
# Optional L2 penalty for the convolution kernel; no regularizer when
# l2_loss_lambda is None.
if l2_loss_lambda is None:
    l2 = None
else:
    l2 = regularizers.l2(l2_loss_lambda)

# Embedding -> 1D convolution -> global max pooling -> hidden dense layer ->
# single sigmoid output.
model = Sequential([
    Embedding(max_features, embedding_dims, input_length=maxlen),
    Dropout(0.2),
    Conv1D(
        filters,
        kernel_size,
        padding='valid',
        activation='relu',
        kernel_regularizer=l2,
        strides=1,
    ),
    GlobalMaxPooling1D(),
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])

In [None]:
# Use `learning_rate`: the legacy `lr` alias is deprecated and is rejected
# by newer Keras optimizers.
optimizer = Adam(learning_rate=0.0001, amsgrad=True)

# Binary cross-entropy matches the single sigmoid output; accuracy and the
# custom f1_m metric are reported during training.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy', f1_m])

# validation_split=0.1 holds out 10% of the training data for validation.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)

In [None]:
# Predict on the test set and write the submission file.

sample_submission = pd.read_csv("../sample_submission.csv")
test_probabilities = model.predict(x_test)
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels.
sample_submission["target"] = np.where(test_probabilities > 0.5, 1, 0)
sample_submission.to_csv("submission.csv", index=False)