TRY:
converting LSTM to Bidirectional layer \
changing the loss function\
changing the embedding size?\

In [1]:
import pandas as pd
# Train data
DATA_ID_PATH           = "train.data.txt"
Y_TRAIN_ID_PATH        = "train.label.txt"
FULL_TWITTER_PATH      = "full_train.json"

# Dev data
DEV_DATA_ID_PATH       = "dev.data.txt"
DEV_Y_TRAIN_ID_PATH    = "dev.label.txt"
DEV_FULL_TWITTER_PATH  = "full_dev.json"

# Test data
TEST_DATA_ID_PATH      = "test.data.txt"
TEST_FULL_TWITTER_PATH = "test_data.json"

In [2]:
def convert_replies_id_to_sorted_text(list_of_id, data_df):
    """Convert a list of tweet ids to a list of its corresponding text in data_df
    Sort by tweets original create time"""
    if type(list_of_id) is not list:
        # if input is a single id convert type
        list_of_id = [list_of_id]
    # Select dataframe based on a list of ids
    selected_df = data_df[data_df['id'].astype(str).isin(list_of_id)]
    # Sort by their created time
    sorted_df = selected_df.sort_values(by = ["created_at"])
    # Select text fields only
    text_list = sorted_df["text"].to_list()
    return text_list

In [3]:
def join_data_id_label_v2(LABEL_PATH, ID_PATH, DATA_PATH):
    # Process Labels
    if LABEL_PATH:
        # 1: Rumour
        # 0: NonRumour
        with open(LABEL_PATH, "r") as f:
            y_label = f.read().strip().split("\n") # remove next line
        y_label = pd.DataFrame(y_label, columns = ["label"])
        y_label[y_label["label"]=="rumour"] = 1
        y_label[y_label["label"]=="nonrumour"] = 0

    ## Get Dataframe Id, with first id as source Id, and values as replies, not using dict since we have duplicated keys
    total_id_list = []
    with open(ID_PATH, "r") as f:
        for line in f:
            line = line.strip().split(',') # remove next line
            source_id = line[0]
            if len(line) > 1:
                # if we have replies id
                replies_id = line[1:]
            else:
                replies_id = []
            row = [source_id, replies_id]
            total_id_list.append(row)
    len(total_id_list)
    
    ## Create a dataframe containing a list of replies
    source_df = pd.DataFrame(total_id_list, columns = ['source_id', 'replies_id'])
    data_df = pd.read_json(DATA_PATH)
    source_df["reply_text_list"] = source_df["replies_id"].apply(convert_replies_id_to_sorted_text, data_df = data_df)
    source_df["source_text"] = source_df["source_id"].apply(convert_replies_id_to_sorted_text, data_df = data_df)

    if LABEL_PATH:
      source_df["label"] = y_label
    return source_df

In [4]:
import re
def remove_URL(original):
    """Remove url link in the text"""
    result = re.sub(r"http\S+", "", original)
    result = re.sub(r"www.\S+", "", result)
    result = re.sub(r"wasap.my+", "", result)
    return result

In [5]:
def preprocess(IDS, DATA, LABELS=False):
    """Function to combine all the preprocessing steps"""
    data = join_data_id_label_v2(LABELS, IDS, DATA)
    ## 1. Only keep english tweets as most of them are in english
    # Use only text data and remove URLs
    
    data["text"] = data["source_text"].apply("".join) + " " + data["reply_text_list"].apply(" ".join)
    data["sentence"] = data["text"].apply(remove_URL)
    if LABELS:
      data = data[['sentence','label']]
    else:
      data = data[['sentence']]
    
    return data

In [6]:
data_train = preprocess(DATA_ID_PATH, FULL_TWITTER_PATH, LABELS=Y_TRAIN_ID_PATH)
data_train

Unnamed: 0,sentence,label
0,5. Can regularly rinsing your nose with saline...,0
1,French police chief killed himself after #Char...,1
2,Coronavirus disease (COVID-19) advice for the ...,0
3,Ottawa police confirm that there were multiple...,0
4,if the primary focus of a government isn't to ...,0
...,...,...
1890,Desperate Ted Cruz Claims Planned Parenthood S...,1
1891,"""Thoughts and prayers are not enough."" Pres. O...",1
1892,Police have surrounded this building where the...,0
1893,@Kirstenjoyweiss @MattFabrication @prestone85...,0


In [7]:
data_dev = preprocess(DEV_DATA_ID_PATH, DEV_FULL_TWITTER_PATH, LABELS=DEV_Y_TRAIN_ID_PATH)
data_dev

Unnamed: 0,sentence,label
0,COVID-19 Fact:\nAre hand dryers effective in k...,0
1,@atruchecks when can we expect the result of m...,0
2,How does COVID-19 spread? \n\nPeople can catch...,0
3,"every news outlet using headlines like,\n\n""ar...",0
4,Researcher @naskrecki on his encounter with a ...,0
...,...,...
627,"or cure for COVID-19. However, there are sever...",0
628,"After speculation that he’s been arrested, Ban...",1
629,*Your questions answered*❓\n\n*Reply with the ...,0
630,"►#Anonymous Operation #KKK ►Ku Klux Klan, We n...",1


In [60]:
data_test = preprocess(TEST_DATA_ID_PATH, TEST_FULL_TWITTER_PATH)
data_test

469

In [33]:
# Run this cell and not the one below if you want to evaluate on dev set
"""
import random
import numpy as np

sentences_train_data = data_train['sentence'].values
labels_train = data_train['label'].values

sentences_dev_data   = data_dev['sentence'].values
labels_dev = data_dev['label'].values

#partition data for train/dev/test
sentences_train, y_train = sentences_train_data[:1500], labels_train[:1500]
sentences_dev, y_dev = sentences_train_data[1500:], labels_train[1500:]
sentences_test, y_test = sentences_dev_data, labels_dev


#convert label list into arrays
y_train = np.array(y_train).astype('float32')
y_dev = np.array(y_dev).astype('float32')
y_test = np.array(y_test).astype('float32')
"""

In [40]:
# Run this cell and not the one above if you want to make final predictions
import random
import numpy as np
sentences_train = data_train['sentence'].values
y_train         = data_train['label'].values

sentences_dev   = data_dev['sentence'].values
y_dev           = data_dev['label'].values

sentences_test  = data_test['sentence'].values

#convert label list into arrays
y_train = np.array(y_train).astype('float32')
y_dev = np.array(y_dev).astype('float32')

In [41]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(sentences_train)

vocab_size = tokenizer.texts_to_matrix(sentences_train, mode="count").shape[1]

print("Vocab size =", vocab_size)

Vocab size = 31186


In [42]:
from keras.models import Sequential
from keras import layers
from keras.preprocessing.sequence import pad_sequences

maxlen = 50
# Sequence tokenization
xseq_train = tokenizer.texts_to_sequences(sentences_train)
xseq_dev = tokenizer.texts_to_sequences(sentences_dev)
xseq_test = tokenizer.texts_to_sequences(sentences_test)
# Padding
xseq_train = pad_sequences(xseq_train, padding='post', maxlen=maxlen)
xseq_dev = pad_sequences(xseq_dev, padding='post', maxlen=maxlen)
xseq_test = pad_sequences(xseq_test, padding='post', maxlen=maxlen)

In [43]:
embedding_dim = 10

from keras.layers import LSTM

#word order preserved with this architecture
lstm = Sequential(name="lstm")
lstm.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
lstm.add(LSTM(10))
lstm.add(layers.Dense(1, activation='sigmoid'))
lstm.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
lstm.summary()

Model: "lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 10)            311860    
                                                                 
 lstm_5 (LSTM)               (None, 10)                840       
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
Total params: 312,711
Trainable params: 312,711
Non-trainable params: 0
_________________________________________________________________


In [44]:
lstm.fit(xseq_train, y_train, epochs=15, verbose=True, validation_data=(xseq_dev, y_dev), batch_size=10)

#loss, accuracy = lstm.evaluate(xseq_test, y_test, verbose=False)
#print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7efc9eed1d90>

In [68]:
predictions = lstm.predict(xseq_test)
classes = (predictions > 0.5).astype("int32")

In [49]:
# generate the csv for prediction
def generate_csv(pred, csv_name):
    ids = pd.Index(range(len(pred)), name='Id')
    predictions = pd.DataFrame(pred, index=ids)
    predictions.columns = ['Predicted']
    predictions.to_csv(csv_name)

In [50]:
assert(len(classes)==558)
generate_csv(classes, "./LSTMPred1")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(sentences_train)

xtf_train = vectorizer.transform(sentences_train)
xtf_dev = vectorizer.transform(sentences_dev)
xtf_test = vectorizer.transform(sentences_test)

# Padding
xtf_train = pad_sequences(xseq_train, padding='post', maxlen=maxlen)
xtf_dev = pad_sequences(xseq_dev, padding='post', maxlen=maxlen)
xtf_test = pad_sequences(xseq_test, padding='post', maxlen=maxlen)

In [None]:
lstm.fit(xtf_train, y_train, epochs=10, verbose=True, validation_data=(xtf_dev, y_dev), batch_size=10)

loss, accuracy = lstm.evaluate(xtf_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Testing Accuracy:  0.7744
