In [65]:
import tweepy
import numpy as np
import time
import json
import os
import pandas as pd
import re

from datasets import Dataset
from transformers import AutoTokenizer
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers import DefaultDataCollator

## Load Data into Pandas DF

In [71]:
## Input file locations (all relative to the notebook's working directory)

# Training split.
# Tweet objects; read with pd.read_json and looked up by their id / created_at / text fields.
TRAIN_DATA_PATH = "./full_data/data_storage/full_train.json"
# One label per line, either "rumour" or "nonrumour".
Y_TRAIN_ID_PATH = "./id_data/train.label.txt"
# One thread per line: comma-separated tweet ids, source tweet first, replies after.
X_TRAIN_ID_PATH = "./id_data/train.data.txt"   # train

# Development split (used as validation data when fitting); same three file formats as above.
DEV_DATA_PATH = "./full_data/data_storage/full_dev.json"
Y_DEV_ID_PATH = "./id_data/dev.label.txt"
X_DEV_ID_PATH = "./id_data/dev.data.txt"   # dev

In [31]:
def convert_replies_id_to_sorted_text(list_of_id, data_df):
    """Return the texts of the given tweet ids, ordered by creation time.

    Args:
        list_of_id: A single tweet id, or a list/tuple/set of tweet ids.
            Ids may be strings or integers; they are compared as strings.
        data_df: DataFrame with at least the columns ``id``, ``created_at``
            and ``text``.

    Returns:
        list: The ``text`` values of every row whose ``id`` is among the
        requested ids, sorted ascending by ``created_at``. Ids that are not
        present in ``data_df`` are skipped, so the result may be shorter than
        the input (and is ``[]`` for an empty input).
    """
    # Accept any common container, not only `list`; a bare id becomes a
    # one-element list.
    if isinstance(list_of_id, (list, tuple, set)):
        wanted_ids = [str(tweet_id).strip() for tweet_id in list_of_id]
    else:
        wanted_ids = [str(list_of_id).strip()]

    # The id column is compared as text, so the requested ids were normalised
    # to text above; otherwise integer ids would silently match nothing.
    is_wanted = data_df["id"].astype(str).isin(wanted_ids)
    chronological = data_df.loc[is_wanted].sort_values(by="created_at")
    return chronological["text"].to_list()


def remove_URL(original):
    """Remove URL-like substrings from a piece of text.

    Three kinds of link are stripped, each up to the next whitespace:
    anything starting with ``http``, anything starting with ``www.``, and
    ``wasap.my`` short links.

    Args:
        original (str): Text that may contain links.

    Returns:
        str: The text with the links deleted. Surrounding whitespace is left
        untouched, so a removed link can leave a double space behind.
    """
    result = re.sub(r"http\S+", "", original)
    # The dot is escaped so that only a literal "www." prefix is treated as a
    # link; an unescaped dot would also eat ordinary words containing "www".
    result = re.sub(r"www\.\S+", "", result)
    # Remove the whole short link including its path, not just the domain.
    result = re.sub(r"wasap\.my\S*", "", result)
    return result

def _read_labels(label_path):
    """Read one label per line and encode it as 1 (rumour) or 0 (nonrumour)."""
    label_to_int = {"rumour": 1, "nonrumour": 0}
    with open(label_path, "r") as f:
        raw_labels = [line.strip() for line in f.read().strip().splitlines()]
    unknown = sorted(set(raw_labels) - set(label_to_int))
    if unknown:
        raise ValueError(f"Unexpected label(s) in {label_path}: {unknown}")
    return [label_to_int[label] for label in raw_labels]


def _read_thread_ids(id_path):
    """Parse each line of the id file into [source_id, [reply_id, ...]]."""
    rows = []
    with open(id_path, "r") as f:
        for line in f:
            ids = [tweet_id.strip() for tweet_id in line.strip().split(",")]
            # First id is the source tweet; the (possibly empty) rest are replies.
            rows.append([ids[0], ids[1:]])
    return rows


def join_data_id_label_v2(LABEL_PATH, ID_PATH, DATA_PATH):
    """Join thread ids, tweet texts and (optionally) labels into one DataFrame.

    Args:
        LABEL_PATH: Path to a text file with one label per line ("rumour" or
            "nonrumour"), in the same order as the lines of ``ID_PATH``.
            Pass a falsy value to skip labels.
        ID_PATH: Path to a text file with one thread per line: comma-separated
            tweet ids, source tweet first, replies after. A list is used rather
            than a dict because source ids can repeat.
        DATA_PATH: Path to a JSON file readable by ``pd.read_json`` whose
            records carry ``id``, ``created_at`` and ``text``.

    Returns:
        pandas.DataFrame: One row per thread with the columns ``source_id``,
        ``replies_id``, ``reply_text_list``, ``source_text`` and, when labels
        were requested, ``label`` (1 = rumour, 0 = nonrumour).

    Raises:
        ValueError: If the label file contains an unknown label, or if the
            number of labels differs from the number of threads.
    """
    # Labels are read first so that a malformed label file fails fast.
    labels = _read_labels(LABEL_PATH) if LABEL_PATH else None

    source_df = pd.DataFrame(_read_thread_ids(ID_PATH), columns=["source_id", "replies_id"])
    data_df = pd.read_json(DATA_PATH)

    # Both columns hold *lists* of texts (the source column a list of at most
    # one element per matching tweet), sorted by creation time.
    source_df["reply_text_list"] = source_df["replies_id"].apply(
        convert_replies_id_to_sorted_text, data_df=data_df
    )
    source_df["source_text"] = source_df["source_id"].apply(
        convert_replies_id_to_sorted_text, data_df=data_df
    )

    if labels is not None:
        # Labels are matched to threads purely by position, so a count
        # mismatch would silently mislabel or drop rows.
        if len(labels) != len(source_df):
            raise ValueError(
                f"{len(labels)} labels in {LABEL_PATH} but "
                f"{len(source_df)} threads in {ID_PATH}"
            )
        source_df["label"] = labels
    return source_df

def bert_preprocess(IDS, DATA, LABELS=False):
    """Run the full preprocessing pipeline for one data split.

    Loads the threads via ``join_data_id_label_v2``, then collapses each
    thread's source text list and reply text list into single space-joined
    strings with URLs removed.

    Args:
        IDS: Path to the thread id file.
        DATA: Path to the tweet JSON file.
        LABELS: Path to the label file, or a falsy value for unlabelled data.

    Returns:
        pandas.DataFrame: Columns ``source`` and ``replies``, plus ``label``
        when ``LABELS`` is given.
    """
    frame = join_data_id_label_v2(LABELS, IDS, DATA)

    def _flatten(texts):
        # Join the list of tweet texts with single spaces, then drop links.
        return remove_URL(" ".join(texts))

    frame["source"] = frame["source_text"].apply(_flatten)
    frame["replies"] = frame["reply_text_list"].apply(_flatten)

    kept_columns = ["source", "replies"]
    if LABELS:
        kept_columns.append("label")
    return frame[kept_columns]

In [62]:
## Training split: one row per thread, with the cleaned source text in one column
## and the space-joined cleaned replies in another
data_train = bert_preprocess(
    IDS=X_TRAIN_ID_PATH,
    DATA=TRAIN_DATA_PATH,
    LABELS=Y_TRAIN_ID_PATH,
)
data_train

Unnamed: 0,source,replies,label
0,5. Can regularly rinsing your nose with saline...,4. Can eating garlic help prevent infection wi...,0
1,French police chief killed himself after #Char...,@Telegraph How very sad. @Telegraph @Telegraph...,1
2,Coronavirus disease (COVID-19) advice for the ...,Infection control for suspected or confirmed C...,0
3,Ottawa police confirm that there were multiple...,@WSJ Killers go berserk when cornered. Hencef...,0
4,if the primary focus of a government isn't to ...,,0
...,...,...,...
1890,Desperate Ted Cruz Claims Planned Parenthood S...,@Bipartisanism \nDesperate! @Bipartisanism Cr...,1
1891,"""Thoughts and prayers are not enough."" Pres. O...",.@ABC has anyone else noticed mass shootings s...,1
1892,Police have surrounded this building where the...,@NBCNews bury them in their hole @NBCNews @Wik...,0
1893,,@Kirstenjoyweiss @MattFabrication @prestone85 ...,0


In [72]:
## Development split: one row per thread, with the cleaned source text in one column
## and the space-joined cleaned replies in another
data_dev = bert_preprocess(
    IDS=X_DEV_ID_PATH,
    DATA=DEV_DATA_PATH,
    LABELS=Y_DEV_ID_PATH,
)
data_dev

Unnamed: 0,source,replies,label
0,COVID-19 Fact:\nAre hand dryers effective in k...,"@WeatherBug They are, in fact, germ-breeding f...",0
1,@atruchecks when can we expect the result of m...,@ewart_lynne @atruchecks Hi have you had any l...,0
2,How does COVID-19 spread? \n\nPeople can catch...,I've read a lot about Corona virus lately and ...,0
3,"every news outlet using headlines like,\n\n""ar...","@TuckyAalto Apparently, when a headline is a q...",0
4,Researcher @naskrecki on his encounter with a ...,@Harvard @naskrecki eu tenho uma dessas em cas...,0
...,...,...,...
627,"or cure for COVID-19. However, there are sever...",WHAT ARE THE TREATMENT OPTIONS FOR COVID-19 (I...,0
628,"After speculation that he’s been arrested, Ban...",@artnet @xklamation there was a story saying s...,1
629,*Your questions answered*❓\n\n*Reply with the ...,s?\n\n14. Can I catch COVID-19 from infected s...,0
630,"►#Anonymous Operation #KKK ►Ku Klux Klan, We n...",@AnonymousVideo @grannyrosie3 @AnonymousVideo...,1


## Convert Pandas Dataframe to Transformer DataSet

In [68]:
# Wrap the training frame in a `datasets.Dataset` so it can be tokenized with `.map`
dataset_train = Dataset.from_pandas(df=data_train)
dataset_train

Dataset({
    features: ['source', 'replies', 'label'],
    num_rows: 1895
})

In [73]:
# Wrap the development frame in a `datasets.Dataset` so it can be tokenized with `.map`
dataset_dev = Dataset.from_pandas(df=data_dev)
dataset_dev

Dataset({
    features: ['source', 'replies', 'label'],
    num_rows: 632
})

## Convert the Hugging Face Dataset to a TensorFlow Dataset

Making use of the Hugging Face `datasets.Dataset` built above.

In [74]:
## Tokenizer setup

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(dataset):
    """Tokenize a batch as sentence pairs: source text first, replies second.

    The two texts are passed as the tokenizer's first and second sequence, so
    they end up separated by [SEP]. Every example is padded with
    ``padding="max_length"`` and cut with ``truncation=True``.

    Args:
        dataset: A batch with ``"source"`` and ``"replies"`` entries, as
            supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        dataset["source"],
        dataset["replies"],
        truncation=True,
        padding="max_length",
    )


# Tokenize the training split
tokenized_train_datasets = dataset_train.map(tokenize_function, batched=True)
# Tokenize the development split
tokenized_dev_datasets = dataset_dev.map(tokenize_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [70]:
## Collator passed as `collate_fn` to `to_tf_dataset` to assemble batches;
## return_tensors="tf" requests TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")

In [75]:
# Training batches: shuffled, 8 examples per batch
tf_train_dataset = tokenized_train_datasets.to_tf_dataset(
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
)

# Validation batches: same columns and batch size, but kept in file order
tf_validation_dataset = tokenized_dev_datasets.to_tf_dataset(
    batch_size=8,
    shuffle=False,
    collate_fn=data_collator,
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
)

## TensorFlow BERT Model

In [78]:


## Define the model: pretrained "bert-base-cased" with a 2-label classification head
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

## Configure training: Adam optimiser, sparse cross-entropy on logits, accuracy metric
model.compile(
    # from_logits=True: the loss is given unnormalised scores, not probabilities
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    metrics=[tf.metrics.SparseCategoricalAccuracy()],
)


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
# Train for 3 epochs, evaluating on the development split after each epoch
model.fit(
    x=tf_train_dataset,
    epochs=3,
    validation_data=tf_validation_dataset,
)


Epoch 1/3
  3/236 [..............................] - ETA: 57:06 - loss: 3.7896 - sparse_categorical_accuracy: 0.2083

KeyboardInterrupt: 

In [35]:
# Report how many GPUs TensorFlow can see; with 0 the model trains on CPU.
# (`tf` is already imported in the imports cell at the top of the notebook.)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [None]:
# FIXME(review): leftover scratch cell, never executed (`In [None]`). As visible here it is
# not a complete statement: it looks like the argument list of a `to_tf_dataset(...)` call
# without the call itself, so it would fail under Restart & Run All.
# NOTE(review): it uses label_cols=["labels"], whereas the to_tf_dataset calls earlier in the
# notebook use "label" (the column name in the datasets). Delete this cell or restore the
# full call -- confirm intent.
columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,