In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [2]:
cd /content/drive/My Drive/OLID

/content/drive/My Drive/OLID


In [3]:
ls

dev.csv            labels-levela.csv       testset-levela.tsv
glove.6B.100d.txt  olid-training-v1.0.tsv  train.csv
glove.6B.200d.txt  test.csv


In [4]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from gensim import downloader

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [13]:
def embed(tweet):
    tweet = tweet.lower()
    tweet = re.sub("@user", "", tweet)
    tweet = re.sub(r"[^A-Za-z]", " ", tweet)
    tweet = re.sub("url", "", tweet) # remove url
    tokens = tweet.split(" ")
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [token for token in tokens if token and not token in stop_words]
    tokens=" ".join(tokens)
    return tokens


In [43]:
# Load train data
train_df = pd.read_csv('/content/drive/MyDrive/OLID/train.csv')
train_data, y_train = train_df["tweet"], train_df["class"]
# load dev data
dev_df = pd.read_csv('/content/drive/MyDrive/OLID/dev.csv')
dev_data, y_dev = dev_df["tweet"], dev_df["class"]
# load test data
test_df = pd.read_csv('/content/drive/MyDrive/OLID/test.csv')
test_data, y_test = test_df["tweet"], test_df["class"]

# Constructing the vectorized features
X_train = train_data.apply(lambda x: embed(x))
X_dev = dev_data.apply(lambda x: embed(x))
X_test = test_data.apply(lambda x: embed(x))

In [44]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
max_words = 10000 #frequency of words to be kept
max_len = 200

tokenize = Tokenizer(num_words=max_words)
tokenize.fit_on_texts(X_train)
sequences = tokenize.texts_to_sequences(X_train)
word_index = tokenize.word_index
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [45]:
import os
embeddings_index = {}
f = open('glove.6B.200d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [46]:
num_words = min(max_words, len(word_index)) + 1
print(num_words)

embedding_dim = 200

# first create a matrix of zeros, this is our embedding matrix
embedding_matrix = np.zeros((num_words, embedding_dim))

# for each word in out tokenizer lets try to find that work in our w2v model
for word, i in word_index.items():
    if i > max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # we found the word - add that words vector to the matrix
        embedding_matrix[i] = embedding_vector
    else:
        # doesn't exist, assign a random vector
        embedding_matrix[i] = np.random.randn(embedding_dim)

10001


In [57]:
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional,GlobalMaxPool1D,GlobalAveragePooling1D, SpatialDropout1D
from keras.optimizers import RMSprop
from keras.initializers import Constant
from sklearn.utils import class_weight
import tensorflow as tf

model = Sequential()
model.add(Embedding(num_words,
                    embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=max_len,
                    trainable=True))
model.add((LSTM(64, return_sequences = True)))
model.add(GlobalMaxPool1D())

model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)
class_weights=dict(enumerate(class_weights))
model.fit(sequences_matrix,y_train,batch_size=128,epochs=5,verbose=2,class_weight=class_weights)

Epoch 1/5
94/94 - 4s - loss: 0.6166 - accuracy: 0.6694
Epoch 2/5
94/94 - 2s - loss: 0.5368 - accuracy: 0.7581
Epoch 3/5
94/94 - 2s - loss: 0.5002 - accuracy: 0.7807
Epoch 4/5
94/94 - 2s - loss: 0.4663 - accuracy: 0.8018
Epoch 5/5
94/94 - 2s - loss: 0.4396 - accuracy: 0.8158


<tensorflow.python.keras.callbacks.History at 0x7fb8d4067190>

In [58]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 200)          2000200   
_________________________________________________________________
lstm_4 (LSTM)                (None, 200, 64)           67840     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 2,068,105
Trainable params: 2,068,105
Non-trainable params: 0
_________________________________________________________________


In [100]:
dev_sequences = tokenize.texts_to_sequences(X_dev)
dev_sequences_matrix = sequence.pad_sequences(dev_sequences,maxlen=max_len)
print(model.evaluate(dev_sequences_matrix,y_dev))

test_sequences = tokenize.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
print(model.evaluate(test_sequences_matrix,y_test))

[0.5433407425880432, 0.7432023882865906]
[0.4636443853378296, 0.7872093319892883]


In [102]:
from sklearn.metrics import classification_report
y_pred_dev = model.predict(dev_sequences_matrix, batch_size=128, verbose=1)
y_pred_bool = np.argmax(y_pred_dev, axis=1)
y_pred_dev = np.where(y_pred_dev > 0.5, 1, 0)
print(classification_report(y_dev, y_pred_dev))


              precision    recall  f1-score   support

           0       0.61      0.67      0.64       444
           1       0.82      0.78      0.80       880

    accuracy                           0.74      1324
   macro avg       0.71      0.72      0.72      1324
weighted avg       0.75      0.74      0.75      1324



In [103]:
y_pred_test = model.predict(test_sequences_matrix, batch_size=128, verbose=1)
y_pred_bool = np.argmax(y_pred_test, axis=1)
y_pred_test = np.where(y_pred_test > 0.5, 1, 0)

print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.61      0.67      0.64       240
           1       0.87      0.83      0.85       620

    accuracy                           0.79       860
   macro avg       0.74      0.75      0.74       860
weighted avg       0.79      0.79      0.79       860



In [113]:
# output test_labels.txt
np.savetxt("{}.txt".format('lstm_dev_preds'), y_pred_dev, fmt='%i', newline="\n")
np.savetxt("{}.txt".format('lstm_test_preds'), y_pred_test, fmt='%i', newline="\n")

In [None]:
import keras
keras.backend.clear_session()