In [1]:
# A dependency of the preprocessing for BERT inputs
!pip install -q -U "tensorflow-text==2.8.*"

In [2]:
!pip install -q tf-models-official==2.7.0

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping

import json
import re
import nltk
from nltk.corpus import stopwords

2022-12-05 17:04:29.742217: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-05 17:04:29.742262: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Get target data from source data

In [4]:
input_file_dir = 'finalMergedData.json'
rawDf = pd.read_json(input_file_dir)

# Work on an explicit copy of the three columns we need: assigning into a
# column of a bare slice of rawDf is what raised SettingWithCopyWarning.
dF = rawDf[["source", "content", "bias_text"]].copy()
dF['bias_text'] = dF['bias_text'].str.capitalize()

# Both are computed before any filtering, so they cover every row in the file.
news_source = dF['source'].unique()
bias_label = dF['bias_text'].unique()

# Drop unrated and mixed-rating rows.
df_filtered = dF[dF['bias_text'] != "Political news media bias rating: not rated"]
df_filtered = df_filtered[df_filtered['bias_text'] != "Mixed"]

# Keep the first occurrence of each article body and discard empty bodies.
df_filtered = df_filtered.drop_duplicates(subset=['content'], keep='first')
df_filtered = df_filtered[df_filtered['content'] != ""]

# The "lean" labels are removed only after de-duplication, so a duplicate whose
# first occurrence carried a lean label is dropped together with it.
df_filtered = df_filtered[df_filtered['bias_text'] != "Lean left"]
df_filtered = df_filtered[df_filtered['bias_text'] != "Lean right"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dF['bias_text'] = dF['bias_text'].str.capitalize()


## Begin cleaning text

In [5]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/dkang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Punctuation treated as a word separator; clean_text replaces each match with
# a space. Written as a raw string because '\[', '\]' and '\|' are invalid
# escape sequences in an ordinary string literal (the compiled pattern is the same).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, a lowercase ASCII letter, space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# English stop words, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one article body.

    Lowercases ``text``, replaces every REPLACE_BY_SPACE_RE match with a
    space, deletes every BAD_SYMBOLS_RE match, and drops the
    whitespace-separated tokens that are in STOPWORDS.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    separated = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    symbols_removed = BAD_SYMBOLS_RE.sub('', separated)
    kept_tokens = [token for token in symbols_removed.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)
df_filtered['content'] = df_filtered['content'].apply(clean_text)
# Remove all digits. '\d+' is a regular expression, so request regex matching
# explicitly: relying on the implicit default triggers a pandas warning, and
# from pandas 2.0 the default is a literal match, which would leave digits in.
df_filtered['content'] = df_filtered['content'].str.replace(r'\d+', '', regex=True)

df_filtered.head()

  df_filtered['content'] = df_filtered['content'].str.replace('\d+', '')


Unnamed: 0,source,content,bias_text
0,Townhall,twotoone margin respondents participated recen...,Right
1,Townhall,president trump loves tweet way directly reach...,Right
2,New York Times - News,secret service chief revelations could threate...,Left
3,NPR Online News,trump ousts embattled campaign managerenlarge ...,Center
4,BBC News,media playback unsupported device media captio...,Center


In [7]:
def countNumWordsOfContent(text):
  """Return how many whitespace-separated tokens ``text`` contains."""
  tokens = text.split()
  return len(tokens)

# Keep only articles whose cleaned text has at least 50 whitespace-separated words.
# NOTE(review): this comment previously said 30, but the threshold in the code is 50 - confirm which is intended.
df_filtered = df_filtered[df_filtered['content'].apply(countNumWordsOfContent) >= 50]

In [8]:
# Re-run the duplicate and empty-string filters, this time on the cleaned text,
# then summarise what is left per bias label.
df_filtered = (
    df_filtered
    .drop_duplicates(subset=['content'], keep='first')
    .loc[lambda frame: frame['content'] != ""]
)
df_filtered.groupby('bias_text').describe()

Unnamed: 0_level_0,source,source,source,source,content,content,content,content
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
bias_text,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Center,12472,279,NPR Online News,2012,12472,12472,trump ousts embattled campaign managerenlarge ...,1
Left,14886,173,CNN (Web News),2905,14886,14886,secret service chief revelations could threate...,1
Right,15571,175,Washington Times,2884,15571,15571,twotoone margin respondents participated recen...,1


## Split data -- removing media source

In [9]:
# news_source is the ndarray of unique 'source' values taken from the loaded data.
def clean_media_source(text):
    """Drop every whitespace-separated token of ``text`` that equals a news source name.

    Returns the remaining tokens joined by single spaces.

    NOTE(review): the content passed in has already been lowercased and
    stripped of punctuation, while news_source holds the raw names (e.g.
    "New York Times - News"), so capitalised or multi-word names cannot equal
    a single token - confirm this filter removes what it is meant to.
    """
    # A set gives constant-time membership; `word in ndarray` rescans the whole
    # array for every token. Assumes the source names are hashable (strings).
    source_names = set(news_source)
    return ' '.join(word for word in text.split() if word not in source_names)
# Apply the media-source filter to every article's text.
df_filtered['content'] = df_filtered['content'].apply(clean_media_source)

In [10]:
# One-hot encode the bias labels. The dtype is pinned because the default
# produced by get_dummies is not the same across pandas versions.
Y = pd.get_dummies(df_filtered['bias_text'], dtype='float32').values
X = df_filtered['content'].values
# Hold out 10% of the articles as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(38636,) (38636, 3)
(4293,) (4293, 3)


## Build model

In [11]:
# Name of the encoder to fine-tune; must be a key of map_name_to_handle.
bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'

# Preprocessing handles: one for uncased vocabularies, one for the cased model.
_UNCASED_PREPROCESS = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
_CASED_PREPROCESS = 'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3'

# Encoder name -> TF Hub encoder handle.
map_name_to_handle = dict([
    ('bert_en_uncased_L-12_H-768_A-12',
     'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'),
    ('bert_en_cased_L-12_H-768_A-12',
     'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3'),
    ('small_bert/bert_en_uncased_L-2_H-128_A-2',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'),
    ('small_bert/bert_en_uncased_L-2_H-256_A-4',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1'),
    ('small_bert/bert_en_uncased_L-2_H-512_A-8',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1'),
    ('small_bert/bert_en_uncased_L-2_H-768_A-12',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1'),
    ('small_bert/bert_en_uncased_L-4_H-128_A-2',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1'),
    ('small_bert/bert_en_uncased_L-4_H-256_A-4',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1'),
    ('small_bert/bert_en_uncased_L-4_H-512_A-8',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'),
    ('small_bert/bert_en_uncased_L-4_H-768_A-12',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1'),
])

# Encoder name -> TF Hub preprocessing handle. Only the cased encoder uses the
# cased preprocessor; every other entry shares the uncased one.
map_model_to_preprocess = {
    name: (_CASED_PREPROCESS
           if name == 'bert_en_cased_L-12_H-768_A-12'
           else _UNCASED_PREPROCESS)
    for name in map_name_to_handle
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [12]:
# Load the selected TF Hub preprocessing model as a standalone Keras layer.
# NOTE(review): nothing else in the visible notebook reads bert_preprocess_model;
# build_classifier_model creates its own preprocessing layer - confirm this cell is still needed.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

2022-12-05 17:12:14.735286: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-05 17:12:14.735350: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-12-05 17:12:14.735385: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-12-05 17:12:14.737267: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory
2022-12-05 17:12:14.737311: W tensorflow/stream_executor/platform/default/dso_loader

In [13]:
# Load the selected TF Hub BERT encoder as a standalone Keras layer.
# NOTE(review): nothing else in the visible notebook reads bert_model;
# build_classifier_model creates its own encoder layer - confirm this cell is still needed.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [14]:
def build_classifier_model(num_classes=3, hidden_units=64, dropout_rate=0.2):
  """Build the BERT-based text classifier.

  The model takes a batch of raw strings, runs them through the TF Hub
  preprocessing layer (tfhub_handle_preprocess) and the trainable BERT encoder
  (tfhub_handle_encoder), and feeds the encoder's 'pooled_output' through a
  ReLU dense layer, dropout, and a softmax output layer.

  Args:
    num_classes: size of the softmax output layer.
    hidden_units: width of the dense layer placed on top of the pooled output.
    dropout_rate: dropout rate applied after that dense layer.

  Returns:
    An uncompiled tf.keras.Model mapping the string input to class probabilities.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  # trainable=True: the encoder weights are fine-tuned together with the head.
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  pooled_output = outputs['pooled_output']
  hidden = tf.keras.layers.Dense(hidden_units, activation='relu')(pooled_output)
  hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)
  probabilities = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(hidden)
  return tf.keras.Model(text_input, probabilities)

In [15]:
classifier_model = build_classifier_model()

In [16]:
# The labels are one-hot vectors and the classifier ends in a softmax, so use
# the categorical (non-sparse) cross-entropy loss and accuracy metric.
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = tf.metrics.CategoricalAccuracy()

In [17]:
epochs = 5
batch_size = 64

# fit() is called with validation_split=0.1, so only the remaining share of
# X_train is used for gradient updates. Keep this value in sync with that call.
validation_split = 0.1
num_train_examples = int(len(X_train) * (1 - validation_split))

# The optimizer takes one step per batch, not per example, so the schedule
# length is counted in batches (ceiling division keeps the final partial batch).
# Counting examples made the schedule roughly batch_size times too long, so
# the warm-up phase alone outlasted the whole training run.
steps_per_epoch = (num_train_examples + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up the learning rate over the first 10% of the training steps.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

## Train and test model

In [18]:
# Attach the AdamW optimizer, the categorical cross-entropy loss and the
# accuracy metric defined in the cells above.
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [None]:
print(f'Training model with {tfhub_handle_encoder}')

# Monitor val_loss; an epoch only counts as an improvement if it beats the best
# value by at least min_delta, and training stops after `patience` epochs without one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.1)

history = classifier_model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Training model with https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
Epoch 1/5

In [None]:
# Convert the history.history dict recorded by fit() to a pandas DataFrame.
hist_df = pd.DataFrame(history.history)

# Save to CSV. The path is passed directly so pandas opens the file itself with
# the newline handling it expects; a text handle opened without newline='' can
# produce doubled line endings on some platforms.
hist_csv_file = 'history_BERT_experiment2.csv'
hist_df.to_csv(hist_csv_file)

In [None]:
# Evaluate on the held-out test set; accr[0] is the loss and accr[1] the
# accuracy metric the model was compiled with.
accr = classifier_model.evaluate(X_test,Y_test)
# Build the report once so the printed and the saved text cannot drift apart.
report = 'Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1])
print(report)
# The context manager closes the file even if the write raises.
with open("result_BERT_experiment2.txt", "w") as f:
    f.write(report)