In [1]:
import re, tensorflow.keras, os
import pandas as pd, keras, io
import numpy as np
from tensorflow.keras.layers import Dense, Input,Dropout, Embedding, LSTM, Bidirectional, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, AveragePooling1D, TimeDistributed, GlobalMaxPooling1D, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from sklearn.metrics import f1_score,accuracy_score
from bpe import Encoder

Using TensorFlow backend.


In [None]:
# NOTE(review): `fname` is not defined in any visible cell (it only exists as a
# parameter of load_vectors), so this cell would raise NameError on a fresh
# "Restart & Run All". It duplicates the first line of load_vectors and the
# handle it opens is never closed -- presumably leftover scratch; confirm and
# delete.
fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')

In [2]:
# Process-wide runtime settings: GPU enumeration order / visibility and the
# thread-count hints for MKL, GotoBLAS and OpenMP.
# NOTE(review): this cell sits after the import cell -- confirm the thread-count
# variables still take effect when set after numpy/TensorFlow are imported.
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '0',
    'MKL_NUM_THREADS': '4',
    'GOTO_NUM_THREADS': '4',
    'OMP_NUM_THREADS': '4',
})

In [46]:
# Hyper-parameters shared by the model and training functions in this notebook.
MAX_WORD_LEN = 150      # fixed sequence length used for padding and the model Input
MAX_NB_WORDS = 50000    # vocabulary cap passed to Tokenizer(num_words=...)
EMBEDDING_DIM = 300     # width of each embedding vector
NUM_CLASS = 2           # size of the softmax output layer
PATIENCE = 20           # epochs without dev-F1 improvement before training stops
ITERATIONS = 100        # upper bound on the number of training epochs
BATCH_SIZE = 100        # mini-batch size handed to fit()
import tensorflow as tf

def load_vectors(fname, word_index):
    """Read the vectors of known words from a text embedding file.

    The first line of the file must hold two integers (it is parsed and then
    discarded); every following line is ``<word> <float> <float> ...``
    separated by single spaces.

    Args:
        fname: path of the vector file.
        word_index: mapping exposing ``.get``; only words present in it (with a
            value other than -1) are kept.

    Returns:
        dict mapping each kept word to its vector as a list of floats.

    Raises:
        ValueError: if the header line is not exactly two integers or a kept
            line contains a non-numeric component.
    """
    data = {}
    # The context manager guarantees the handle is closed even when parsing fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header values are not needed, but parsing them still validates the format.
        _n_vectors, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if word_index.get(word, -1) == -1:
                continue  # word is not in the vocabulary; skip the float parsing
            data[word] = list(map(float, tokens[1:]))
    return data


def model_with_fasttext(x_train, y_train, x_dev, y_dev, x_test, y_test, tokenizer):
    """Train a BiLSTM classifier on frozen pre-trained embeddings; return test F1.

    Words of ``tokenizer.word_index`` that appear in ``../cc.id.300.vec`` use
    the vector stored there; every other word gets a random vector. The
    embedding layer is not trainable. Training runs one epoch at a time: after
    each epoch the dev F1 is computed, the model is saved to ``save.keras``
    when the score improves, and training stops after ``PATIENCE`` consecutive
    epochs without improvement or after ``ITERATIONS`` epochs. The saved model
    is then reloaded to score the test set.

    Args:
        x_train, x_dev, x_test: integer sequences of length ``MAX_WORD_LEN``.
        y_train: targets for categorical cross-entropy with ``NUM_CLASS``
            outputs (callers in this notebook pass ``to_categorical`` output).
        y_dev, y_test: labels compared against the argmax of the predictions.
        tokenizer: object exposing a ``word_index`` word -> index mapping.

    Returns:
        The value of ``f1_score(y_test, predicted_label)`` for the reloaded model.

    Side effects:
        Prints progress and writes ``save.keras`` in the working directory.
    """
    word_index = tokenizer.word_index
    nb_words = min(MAX_NB_WORDS, len(word_index))
    print('Total words in dict:', nb_words)
    embeddings = load_vectors('../cc.id.300.vec', word_index)
    # Row 0 stays all-zero; no word in the loop below writes to it unless its index is 0.
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > MAX_NB_WORDS:
            continue  # index falls outside the capped matrix
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # NOTE(review): np.random.normal takes (mean, std), so this draws
            # around -4.2 with std 4.2 -- confirm that is the intended init.
            embedding_matrix[i] = np.random.normal(-4.2, 4.2, EMBEDDING_DIM)

    # MODEL
    with tf.device('/gpu:0'):
        embedding_layer = Embedding(nb_words + 1, EMBEDDING_DIM, weights=[embedding_matrix],
                                    input_length=MAX_WORD_LEN, trainable=False)

        tweet = Input(shape=(MAX_WORD_LEN,), dtype='int32')
        embedded_sequences = embedding_layer(tweet)

        # NOTE(review): the regularizer comes from the standalone `keras`
        # package while the layers come from `tensorflow.keras` -- confirm the
        # mix is intended.
        lstm_cell = LSTM(units=200, activation='tanh', recurrent_activation='hard_sigmoid',
                         recurrent_regularizer=keras.regularizers.l2(0.2), return_sequences=False,
                         dropout=0.3, recurrent_dropout=0.3)
        doc_vector = Bidirectional(lstm_cell, merge_mode='concat')(embedded_sequences)

        sign = Dense(NUM_CLASS, activation='softmax')(doc_vector)
        sent_model = Model([tweet], [sign])
        sent_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

        # Start below any reachable F1 so the first epoch always writes a
        # checkpoint; otherwise a run whose dev F1 never rises above 0.0 would
        # reload a missing or stale 'save.keras' left by an earlier run.
        bestf1 = -1.0
        patience = 0
        for i in range(ITERATIONS):
            # Value comparison: `is` on ints only works by interpreter accident.
            if patience >= PATIENCE:
                break
            sent_model.fit([x_train], [y_train], batch_size=BATCH_SIZE,
                           epochs=1, shuffle=True, verbose=False)
            prediction = sent_model.predict([x_dev], batch_size=1000)
            predicted_label = np.argmax(prediction, axis=1)
            f1score = f1_score(y_dev, predicted_label)
            if f1score > bestf1:
                print('Epoch ' + str(i) + ' with dev f1: ' + str(f1score))
                bestf1 = f1score
                sent_model.save('save.keras')
                patience = 0
            else:
                patience += 1
        # Score the test set with the best dev checkpoint, not the last epoch.
        sent_model = load_model('save.keras')
        prediction = sent_model.predict([x_test], batch_size=1000)
        predicted_label = np.argmax(prediction, axis=1)
    f1score = f1_score(y_test, predicted_label)
    print('Test F1: ', f1score)
    print('-----------------------------------------------------------------------------------')
    return f1score


def model(x_train, y_train, x_dev, y_dev, x_test, y_test, tokenizer):
    """Train a BiLSTM classifier on frozen random embeddings; return test F1.

    Every word of ``tokenizer.word_index`` that fits in the capped embedding
    matrix gets a random vector, and the embedding layer is not trainable.
    Training runs one epoch at a time: after each epoch the dev F1 is
    computed, the model is saved to ``save.keras`` when the score improves,
    and training stops after ``PATIENCE`` consecutive epochs without
    improvement or after ``ITERATIONS`` epochs. The saved model is then
    reloaded to score the test set.

    Args:
        x_train, x_dev, x_test: integer sequences of length ``MAX_WORD_LEN``.
        y_train: targets for categorical cross-entropy with ``NUM_CLASS``
            outputs (callers in this notebook pass ``to_categorical`` output).
        y_dev, y_test: labels compared against the argmax of the predictions.
        tokenizer: object exposing a ``word_index`` word -> index mapping.

    Returns:
        The value of ``f1_score(y_test, predicted_label)`` for the reloaded model.

    Side effects:
        Prints progress and writes ``save.keras`` in the working directory.
    """
    word_index = tokenizer.word_index
    nb_words = min(MAX_NB_WORDS, len(word_index))
    print('Total words in dict:', nb_words)
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        # The matrix has nb_words + 1 rows; without this guard a vocabulary
        # larger than MAX_NB_WORDS would index past the end (IndexError).
        if i > nb_words:
            continue
        # NOTE(review): np.random.normal takes (mean, std), so this draws
        # around -4.2 with std 4.2 -- confirm that is the intended init.
        embedding_matrix[i] = np.random.normal(-4.2, 4.2, EMBEDDING_DIM)

    embedding_layer = Embedding(nb_words + 1, EMBEDDING_DIM, weights=[embedding_matrix],
                                input_length=MAX_WORD_LEN, trainable=False)

    # MODEL
    with tf.device('/gpu:0'):
        tweet = Input(shape=(MAX_WORD_LEN,), dtype='int32')
        embedded_sequences = embedding_layer(tweet)

        # NOTE(review): the regularizer comes from the standalone `keras`
        # package while the layers come from `tensorflow.keras` -- confirm the
        # mix is intended.
        lstm_cell = LSTM(units=200, activation='tanh', recurrent_activation='hard_sigmoid',
                         recurrent_regularizer=keras.regularizers.l2(0.2), return_sequences=False)
        doc_vector = Bidirectional(lstm_cell, merge_mode='concat')(embedded_sequences)

        sign = Dense(NUM_CLASS, activation='softmax')(doc_vector)
        sent_model = Model([tweet], [sign])
        sent_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

        # Start below any reachable F1 so the first epoch always writes a
        # checkpoint; otherwise a run whose dev F1 never rises above 0.0 would
        # reload a missing or stale 'save.keras' left by an earlier run.
        bestf1 = -1.0
        patience = 0
        for i in range(ITERATIONS):
            # Value comparison: `is` on ints only works by interpreter accident.
            if patience >= PATIENCE:
                break
            sent_model.fit([x_train], [y_train], batch_size=BATCH_SIZE,
                           epochs=1, shuffle=True, verbose=False)
            prediction = sent_model.predict([x_dev], batch_size=1000)
            predicted_label = np.argmax(prediction, axis=1)
            f1score = f1_score(y_dev, predicted_label)
            if f1score > bestf1:
                print('Epoch ' + str(i) + ' with dev f1: ' + str(f1score))
                bestf1 = f1score
                sent_model.save('save.keras')
                patience = 0
            else:
                patience += 1
        # Score the test set with the best dev checkpoint, not the last epoch.
        sent_model = load_model('save.keras')
        prediction = sent_model.predict([x_test], batch_size=1000)
        predicted_label = np.argmax(prediction, axis=1)
    f1score = f1_score(y_test, predicted_label)
    print('Test F1: ', f1score)
    print('-----------------------------------------------------------------------------------')
    return f1score

def train_and_test_fasttext(x_train, y_train, x_dev, y_dev, x_test, y_test):
    """Tokenize and pad the three splits, then run ``model_with_fasttext``.

    A lower-casing ``Tokenizer`` capped at ``MAX_NB_WORDS`` is fitted on the
    training texts only and applied to all splits. The longest training
    sequence length is printed for information; padding always uses
    ``MAX_WORD_LEN`` with post-padding. Only ``y_train`` is passed through
    ``to_categorical``; dev/test labels are forwarded unchanged.

    Returns:
        Whatever ``model_with_fasttext`` returns (the test F1).
    """
    tok = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
    tok.fit_on_texts(x_train)

    train_seqs = tok.texts_to_sequences(x_train)
    test_seqs = tok.texts_to_sequences(x_test)
    dev_seqs = tok.texts_to_sequences(x_dev)

    longest = max(len(seq) for seq in train_seqs)
    print ('Max Len', longest)

    def to_fixed_length(seqs):
        # Post-pad every sequence to the fixed model input length.
        return sequence.pad_sequences(seqs, maxlen=MAX_WORD_LEN, padding='post')

    train_padded = to_fixed_length(train_seqs)
    test_padded = to_fixed_length(test_seqs)
    dev_padded = to_fixed_length(dev_seqs)
    return model_with_fasttext(train_padded, to_categorical(y_train),
                               dev_padded, y_dev, test_padded, y_test, tok)

def train_and_test(x_train, y_train, x_dev, y_dev, x_test, y_test):
    """Tokenize and pad the three splits, then run ``model``.

    A ``Tokenizer`` capped at ``MAX_NB_WORDS`` is fitted on the training texts
    only and applied to all splits. The longest training sequence length is
    printed for information only (it is not checked against ``MAX_WORD_LEN``);
    padding always uses ``MAX_WORD_LEN`` with post-padding. Only ``y_train``
    is passed through ``to_categorical``.

    Returns:
        Whatever ``model`` returns (the test F1).
    """
    vocab = Tokenizer(num_words=MAX_NB_WORDS)
    vocab.fit_on_texts(x_train)

    # Encode in the same order as before: train, test, dev.
    encoded = [vocab.texts_to_sequences(texts) for texts in (x_train, x_test, x_dev)]

    print ('Max Len', max(map(len, encoded[0])))

    train_ids, test_ids, dev_ids = [
        sequence.pad_sequences(seqs, maxlen=MAX_WORD_LEN, padding='post')
        for seqs in encoded
    ]
    one_hot_train = to_categorical(y_train)
    return model(train_ids, one_hot_train, dev_ids, y_dev, test_ids, y_test, vocab)

def train_and_test_bpe(x_train, y_train, x_dev, y_dev, x_test, y_test):
    """Re-tokenize the texts with a BPE encoder, then tokenize, pad and run ``model``.

    A ``bpe.Encoder`` is fitted on the training texts and each text of every
    split is replaced by its space-joined encoder tokens. From there the flow
    matches ``train_and_test``: a ``Tokenizer`` capped at ``MAX_NB_WORDS`` is
    fitted on the (re-tokenized) training texts, all splits are converted to
    index sequences and post-padded to ``MAX_WORD_LEN``, and only ``y_train``
    is passed through ``to_categorical``.

    Returns:
        Whatever ``model`` returns (the test F1).
    """
    encoder = Encoder(30000, pct_bpe=0.5)  # params chosen for demonstration purposes
    # Fit on the function's own argument. The previous code read the notebook
    # global `xtrain`, which only worked while a driver cell happened to have
    # defined it (and silently used the wrong data otherwise).
    encoder.fit(x_train)
    x_train = [' '.join(encoder.tokenize(name)) for name in x_train]
    x_test = [' '.join(encoder.tokenize(name)) for name in x_test]
    x_dev = [' '.join(encoder.tokenize(name)) for name in x_dev]

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(x_train)

    x_train = tokenizer.texts_to_sequences(x_train)
    x_test = tokenizer.texts_to_sequences(x_test)
    x_dev = tokenizer.texts_to_sequences(x_dev)

    # Printed for information only; padding below always uses MAX_WORD_LEN.
    max_len = max([len(t) for t in x_train])
    print ('Max Len', max_len)
    max_len = MAX_WORD_LEN
    x_train = sequence.pad_sequences(x_train, maxlen=max_len, padding='post')
    x_test = sequence.pad_sequences(x_test, maxlen=max_len, padding='post')
    x_dev = sequence.pad_sequences(x_dev, maxlen=max_len, padding='post')
    return model(x_train, to_categorical(y_train), x_dev, y_dev, x_test, y_test, tokenizer)

In [26]:
# Experiment: random frozen embeddings (train_and_test) over the five folds.
print('Batch Size', BATCH_SIZE)
f1s = 0.0
for idx in range(5):
    # Each fold is stored as its own train/dev/test CSV triple.
    train = pd.read_csv('train{}.csv'.format(idx))
    dev = pd.read_csv('dev{}.csv'.format(idx))
    test = pd.read_csv('test{}.csv'.format(idx))
    xtrain = list(train['sentence'])
    ytrain = list(train['sentiment'])
    xdev = list(dev['sentence'])
    ydev = list(dev['sentiment'])
    xtest = list(test['sentence'])
    ytest = list(test['sentiment'])
    f1s += train_and_test(xtrain, ytrain, xdev, ydev, xtest, ytest)
# Mean test F1 across the five folds.
print(f1s/5.0)

Batch Size 100
Max Len 139
Total words in dict: 10450
Epoch 0 with dev f1: 0.3837209302325581
Epoch 2 with dev f1: 0.5463917525773195
Epoch 3 with dev f1: 0.6027397260273972
Epoch 4 with dev f1: 0.6293103448275863
Epoch 8 with dev f1: 0.646288209606987
Epoch 10 with dev f1: 0.6576576576576577
Test F1:  0.7241379310344828
-----------------------------------------------------------------------------------
Max Len 133
Total words in dict: 10434
Epoch 0 with dev f1: 0.23611111111111113
Epoch 1 with dev f1: 0.4607329842931937
Epoch 2 with dev f1: 0.5170731707317073
Epoch 3 with dev f1: 0.5739910313901345
Epoch 5 with dev f1: 0.5909090909090909
Epoch 7 with dev f1: 0.701492537313433
Test F1:  0.7128129602356407
-----------------------------------------------------------------------------------
Max Len 139
Total words in dict: 10682
Epoch 0 with dev f1: 0.28187919463087246
Epoch 1 with dev f1: 0.3647798742138365
Epoch 2 with dev f1: 0.5688888888888889
Epoch 5 with dev f1: 0.7063829787234043
E

In [32]:
# Experiment: BPE re-tokenized input (train_and_test_bpe) over the five folds.
print('Batch Size', BATCH_SIZE)
f1s = 0.0
for idx in range(5):
    # Each fold is stored as its own train/dev/test CSV triple.
    train = pd.read_csv('train{}.csv'.format(idx))
    dev = pd.read_csv('dev{}.csv'.format(idx))
    test = pd.read_csv('test{}.csv'.format(idx))
    xtrain = list(train['sentence'])
    ytrain = list(train['sentiment'])
    xdev = list(dev['sentence'])
    ydev = list(dev['sentiment'])
    xtest = list(test['sentence'])
    ytest = list(test['sentiment'])
    f1s += train_and_test_bpe(xtrain, ytrain, xdev, ydev, xtest, ytest)
# Mean test F1 across the five folds.
print(f1s/5.0)

Batch Size 100
Max Len 139
Total words in dict: 10406
Epoch 0 with dev f1: 0.4157303370786517
Epoch 1 with dev f1: 0.5560538116591928
Epoch 2 with dev f1: 0.5714285714285714
Epoch 3 with dev f1: 0.5803108808290156
Epoch 4 with dev f1: 0.631578947368421
Epoch 5 with dev f1: 0.6574074074074073
Epoch 16 with dev f1: 0.6575342465753424
Epoch 17 with dev f1: 0.6605504587155964
Epoch 19 with dev f1: 0.6757990867579907
Test F1:  0.7001733102253033
-----------------------------------------------------------------------------------
Max Len 133
Total words in dict: 10391
Epoch 0 with dev f1: 0.44198895027624313
Epoch 2 with dev f1: 0.6375545851528384
Epoch 6 with dev f1: 0.6554621848739496
Epoch 7 with dev f1: 0.689655172413793
Test F1:  0.728171334431631
-----------------------------------------------------------------------------------
Max Len 139
Total words in dict: 10630
Epoch 0 with dev f1: 0.32499999999999996
Epoch 1 with dev f1: 0.5676855895196506
Epoch 2 with dev f1: 0.5816326530612245


In [None]:
# Experiment: pre-trained embeddings (train_and_test_fasttext) over the five folds.
print('Batch Size', BATCH_SIZE)
f1s = 0.0
for idx in range(5):
    # Each fold is stored as its own train/dev/test CSV triple.
    train = pd.read_csv('train{}.csv'.format(idx))
    dev = pd.read_csv('dev{}.csv'.format(idx))
    test = pd.read_csv('test{}.csv'.format(idx))
    xtrain = list(train['sentence'])
    ytrain = list(train['sentiment'])
    xdev = list(dev['sentence'])
    ydev = list(dev['sentiment'])
    xtest = list(test['sentence'])
    ytest = list(test['sentiment'])
    f1s += train_and_test_fasttext(xtrain, ytrain, xdev, ydev, xtest, ytest)
# Mean test F1 across the five folds.
print(f1s/5.0)


Batch Size 100
Max Len 139
Total words in dict: 10450
Epoch 0 with dev f1: 0.06611570247933884
Epoch 1 with dev f1: 0.128
Epoch 2 with dev f1: 0.32214765100671144
Epoch 3 with dev f1: 0.44025157232704404
Epoch 4 with dev f1: 0.5333333333333333
Epoch 5 with dev f1: 0.5882352941176471
Epoch 7 with dev f1: 0.6339285714285714
Epoch 11 with dev f1: 0.6442307692307693
Epoch 13 with dev f1: 0.6457399103139014
Epoch 18 with dev f1: 0.6869565217391305
Epoch 35 with dev f1: 0.6972477064220184
Test F1:  0.7027972027972028
-----------------------------------------------------------------------------------
Max Len 133
Total words in dict: 10434
Epoch 0 with dev f1: 0.04838709677419355
Epoch 1 with dev f1: 0.07874015748031497
Epoch 2 with dev f1: 0.24161073825503354
Epoch 3 with dev f1: 0.34782608695652173
Epoch 4 with dev f1: 0.3757575757575758
Epoch 5 with dev f1: 0.55
Epoch 6 with dev f1: 0.5656565656565656
Epoch 7 with dev f1: 0.5951219512195123
Epoch 12 with dev f1: 0.7107438016528925
Test F1: 