### Installation


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 44.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

### Upload files

In [4]:
from google.colab import files
uploaded = files.upload()

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Imports


In [6]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import collections
import re
import networkx
import numpy as np
import ast
import nltk
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from gensim.models.phrases import Phrases
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC


import time

In [7]:
total_filename = '/content/upd_raw_data.csv'
ROBERTA_DISINFORMATION = '/content/drive/MyDrive/Masters/Thesis/Code/Model V2/roberta-fil-base.h5'

### Loading DF

In [8]:
orig_df = pd.read_csv(total_filename, encoding="utf-8")
orig_df = orig_df.drop(columns=[orig_df.columns[3]])

In [9]:
orig_df

Unnamed: 0,rating,claim description,content
0,FALSE,Nostradamus predicts Bongbong Marcos will lead...,
1,FALSE,Table shows when COVID-19 variations will be r...,
2,FALSE,Table shows when COVID-19 variations will be r...,
3,FALSE,Ibinasura na ng Comelec lahat ng kaso ng diska...,
4,FALSE,Gabby Lopez sold all his shares in ABS-CBN Cor...,"Alam nyo ba? Si Eugenio ""Gabby"" Lopez III, las..."
...,...,...,...
2096,ALTERED_MEDIA,Barangay captain slams man to wall for not wea...,
2097,ALTERED PHOTO,"Nun Mary John Mananzan says Aquinos corrupt, a...",
2098,ALTERED PHOTO,Inquirer post on travel suspension in March 2021,
2099,ALTERED PHOTO,Mark Zuckerberg in Liberal Party ad,


In [10]:
orig_indexes = orig_df.index
for i in orig_indexes:
  if pd.isna(orig_df['claim description'].loc[i]):
    orig_df.at[i,'claim description'] = orig_df['content']

orig_df = orig_df.drop(columns=['content','rating'])
orig_df = orig_df.rename(columns={'claim description':'tweet'})
orig_df

Unnamed: 0,tweet
0,Nostradamus predicts Bongbong Marcos will lead...
1,Table shows when COVID-19 variations will be r...
2,Table shows when COVID-19 variations will be r...
3,Ibinasura na ng Comelec lahat ng kaso ng diska...
4,Gabby Lopez sold all his shares in ABS-CBN Cor...
...,...
2096,Barangay captain slams man to wall for not wea...
2097,"Nun Mary John Mananzan says Aquinos corrupt, a..."
2098,Inquirer post on travel suspension in March 2021
2099,Mark Zuckerberg in Liberal Party ad


In [11]:
df = orig_df.copy()
df['verdict'] = "FALSE"
df['tweet'] = df['tweet'].astype(str)
df

Unnamed: 0,tweet,verdict
0,Nostradamus predicts Bongbong Marcos will lead...,FALSE
1,Table shows when COVID-19 variations will be r...,FALSE
2,Table shows when COVID-19 variations will be r...,FALSE
3,Ibinasura na ng Comelec lahat ng kaso ng diska...,FALSE
4,Gabby Lopez sold all his shares in ABS-CBN Cor...,FALSE
...,...,...
2096,Barangay captain slams man to wall for not wea...,FALSE
2097,"Nun Mary John Mananzan says Aquinos corrupt, a...",FALSE
2098,Inquirer post on travel suspension in March 2021,FALSE
2099,Mark Zuckerberg in Liberal Party ad,FALSE


# Preprocessing


###Lower case all words

In [12]:
df.tweet = df['tweet'].apply(lambda x: x.lower())
df.tweet

0       nostradamus predicts bongbong marcos will lead...
1       table shows when covid-19 variations will be r...
2       table shows when covid-19 variations will be r...
3       ibinasura na ng comelec lahat ng kaso ng diska...
4       gabby lopez sold all his shares in abs-cbn cor...
                              ...                        
2096    barangay captain slams man to wall for not wea...
2097    nun mary john mananzan says aquinos corrupt, a...
2098     inquirer post on travel suspension in march 2021
2099                  mark zuckerberg in liberal party ad
2100    abs-cbn graphic commemorating marcos as best p...
Name: tweet, Length: 2101, dtype: object

### Segragate COVID-19 Related Tweets

In [13]:
collection_words = ['covid19ph', 'covid-19', 'bakuna', 'resbakuna', 'coronavirus', '#covid19', 'lockdown']
keep = []

for i in df.index:
  if any(word in df['tweet'].loc[i] for word in collection_words):
    keep.append(i)

df = df.iloc[keep]
df = df.reset_index(drop=True)
df

Unnamed: 0,tweet,verdict
0,table shows when covid-19 variations will be r...,FALSE
1,table shows when covid-19 variations will be r...,FALSE
2,media did not report on march 23 covid-19 case...,FALSE
3,'required' covid-19 home medical kit,FALSE
4,canned goods from china contain flesh of covid...,FALSE
...,...,...
231,astrazeneca’s covid-19 vaccine causes blood clots,FALSE
232,vaers data show covid-19 vaccines lead to ecto...,FALSE
233,virus discovered in china nearly identical to ...,FALSE
234,misinformation on novel coronavirus that sprea...,FALSE


### Removal of Stopwords and Links


In [14]:
import nltk
nltk.download('stopwords')
#English Stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.sort()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
#English Stopwords
df.tweet = df['tweet'].apply(lambda x: ' '.join([word for word in x.split(" ") if word not in (stop)]))
df.tweet

0               table shows covid-19 variations released
1               table shows covid-19 variations released
2              media report march 23 covid-19 case tally
3                   'required' covid-19 home medical kit
4      canned goods china contain flesh covid-19 victims
                             ...                        
231    astrazeneca’s covid-19 vaccine causes blood clots
232    vaers data show covid-19 vaccines lead ectopic...
233     virus discovered china nearly identical covid-19
234       misinformation novel coronavirus spread online
235          duque claims ph 'low' coronavirus infection
Name: tweet, Length: 236, dtype: object

In [16]:
#Filipino Stopwords
fil_stop = ["akin","aking","ako","alin","am","amin","aming","ang","ano","anumang","apat","at","atin","ating","ay","bababa","bago","bakit","bawat","bilang","dahil","dalawa","dapat","din","dito","doon","gagawin","gayunman","ginagawa","ginawa","ginawang","gumawa","gusto","habang","hanggang","hindi","huwag","iba","ibaba","ibabaw","ibig","ikaw","ilagay","ilalim","ilan","inyong","isa","isang","itaas","ito","iyo","iyon","iyong","ka","kahit","kailangan","kailanman","kami","kanila","kanilang","kanino","kanya","kanyang","kapag","kapwa","karamihan","katiyakan","katulad","kaya","kaysa","ko","kong","kulang","kumuha","kung","laban","lahat","lamang","likod","lima","maaari","maaaring","maging","mahusay","makita","marami","marapat","masyado","may","mayroon","mga","minsan","mismo","mula","muli","na","nabanggit","naging","nagkaroon","nais","nakita","namin","napaka","narito","nasaan","ng","ngayon","ni","nila","nilang","nito","niya","niyang","noon","o","pa","paano","pababa","paggawa","pagitan","pagkakaroon","pagkatapos","palabas","pamamagitan","panahon","pangalawa","para","paraan","pareho","pataas","pero","pumunta","pumupunta","sa","saan","sabi","sabihin","sarili","sila","sino","siya","tatlo","tayo","tulad","tungkol","una","walang",'in']

In [17]:
stop = fil_stop
df.tweet = df['tweet'].apply(lambda x: ' '.join([word for word in x.split(" ") if word not in (fil_stop)]))
df.tweet

0               table shows covid-19 variations released
1               table shows covid-19 variations released
2              media report march 23 covid-19 case tally
3                   'required' covid-19 home medical kit
4      canned goods china contain flesh covid-19 victims
                             ...                        
231    astrazeneca’s covid-19 vaccine causes blood clots
232    vaers data show covid-19 vaccines lead ectopic...
233     virus discovered china nearly identical covid-19
234       misinformation novel coronavirus spread online
235          duque claims ph 'low' coronavirus infection
Name: tweet, Length: 236, dtype: object

In [18]:
#Double check if we need links
df.tweet = df['tweet'].apply(lambda x: " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", x).split()))
df.tweet

0                table shows covid19 variations released
1                table shows covid19 variations released
2               media report march 23 covid19 case tally
3                      required covid19 home medical kit
4       canned goods china contain flesh covid19 victims
                             ...                        
231      astrazenecas covid19 vaccine causes blood clots
232    vaers data show covid19 vaccines lead ectopic ...
233      virus discovered china nearly identical covid19
234       misinformation novel coronavirus spread online
235            duque claims ph low coronavirus infection
Name: tweet, Length: 236, dtype: object

###Removal of Collection Words

In [19]:
collection_words = ['covid19ph', 'covid19', 'bakuna', 'resbakuna', 'coronavirus', '#covid19']

df.tweet = df['tweet'].apply(lambda x: ' '.join([word for word in x.split(" ") if word not in (collection_words)]))
df.tweet

0                        table shows variations released
1                        table shows variations released
2                       media report march 23 case tally
3                              required home medical kit
4               canned goods china contain flesh victims
                             ...                        
231              astrazenecas vaccine causes blood clots
232    vaers data show vaccines lead ectopic pregnancies
233              virus discovered china nearly identical
234                   misinformation novel spread online
235                        duque claims ph low infection
Name: tweet, Length: 236, dtype: object

In [20]:
collection_words = ['in','yung','higit','nang','wala','di','po', 'ba', 'ah', 'lang', 'yan', 'yang','walang', 'kayo', 'niyong', 'rin', 'mo', 'diyan', 'jan', 'nyo', 'e']

df.tweet = df['tweet'].apply(lambda x: ' '.join([word for word in x.split(" ") if word not in (collection_words)]))
df.tweet

0                        table shows variations released
1                        table shows variations released
2                       media report march 23 case tally
3                              required home medical kit
4               canned goods china contain flesh victims
                             ...                        
231              astrazenecas vaccine causes blood clots
232    vaers data show vaccines lead ectopic pregnancies
233              virus discovered china nearly identical
234                   misinformation novel spread online
235                        duque claims ph low infection
Name: tweet, Length: 236, dtype: object

### Label Encoder

In [21]:
def label_encode(df, column_loop):
  #Initialize Label Encoder
  labelencoder = LabelEncoder()
  
  #Values initialized before loop
  label_names = {}

  #Loop through specific columns
  for col in column_loop:
    labelencoder.fit(df[col])
    labelencoder_name_mapping = dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_)))
    df[col] = labelencoder.fit_transform(df[col])
    label_names[col] = {col:labelencoder_name_mapping}

  return df, label_names

In [22]:
df, label_names = label_encode(df,['verdict'])

### TPU Performance

In [23]:
rsn = 42

In [24]:
import tensorflow as tf

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

#batch_size=32 * tpu_strategy.num_replicas_in_sync
#batch_size = 64
batch_size = 16
print('Batch size:', batch_size)
AUTOTUNE = tf.data.experimental.AUTOTUNE

BaseException: ignored

### TF Imports

In [25]:
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
import numpy as np
import pickle
import matplotlib.pyplot as plt
import warnings
import logging
logging.basicConfig(level=logging.INFO)

### Roberta

In [26]:
import transformers
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification,TFRobertaForSequenceClassification, TFRobertaModel

In [27]:
X = df.tweet
y = df.verdict


In [28]:
MAX_LEN = 280
MODEL_NAME = "jcblaise/roberta-tagalog-base"

def roberta_encode(texts, tokenizer):
    ct = len(texts)
    input_ids = np.ones((ct, MAX_LEN), dtype='int32')
    attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
    token_type_ids = np.zeros((ct, MAX_LEN), dtype='int32') # Not used in text classification

    for k, text in enumerate(texts):
        # Tokenize
        tok_text = tokenizer.tokenize(text)
        
        # Truncate and convert tokens to numerical IDs
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:(MAX_LEN-2)])
        
        input_length = len(enc_text) + 2
        input_length = input_length if input_length < MAX_LEN else MAX_LEN
        
        # Add tokens [CLS] and [SEP] at the beginning and the end
        input_ids[k,:input_length] = np.asarray([0] + enc_text + [2], dtype='int32')
        
        # Set to 1s in the attention input
        attention_mask[k,:input_length] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

def build_model(n_categories):
    #with tpu_strategy.scope():
  input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
  input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
  input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

  # Import RoBERTa model from HuggingFace
  roberta_model = TFRobertaModel.from_pretrained(MODEL_NAME)
  x = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)

  # Huggingface transformers have multiple outputs, embeddings are the first one,
  # so let's slice out the first position
  x = x[0]

  x = tf.keras.layers.Dropout(0.1)(x)
  x = tf.keras.layers.Flatten()(x)
  x = tf.keras.layers.Dense(256, activation='relu')(x)
  x = tf.keras.layers.Dense(n_categories, activation='softmax')(x)

  model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=1e-5),
      loss='sparse_categorical_crossentropy',
      metrics=['accuracy'])

  return model

In [29]:
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME) #Tokenizer
X_test_tk = roberta_encode(X, tokenizer)

#with tpu_strategy.scope():
model = build_model(4)
model.load_weights(ROBERTA_DISINFORMATION)
predicted = model.predict(X_test_tk)
y_prediction = np.argmax (predicted, axis = 1)
print(classification_report(y,y_prediction))  
print(accuracy_score(y, y_prediction))

Downloading:   0%|          | 0.00/461k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/266k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/506M [00:00<?, ?B/s]

Some layers from the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFRobertaModel were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['roberta/pooler/dense/kernel:0', 'roberta/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super(Adam, self).__init__(name, **kwargs)


KeyboardInterrupt: ignored

In [71]:
X_test_tk = roberta_encode(['PH vaccine arrival stands at 64.9M doses; 9.5M doses marks highest weekly delivery'], tokenizer)
predicted = model.predict(X_test_tk)
y_prediction = np.argmax (predicted, axis = 1)
print(y_prediction)

[3]


In [41]:
unique, counts = np.unique(y_prediction, return_counts=True)
dict(zip(unique, counts))

{3: 31}