In [1]:
!pip install transformers
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping


Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 24.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [2]:
# Mount Google Drive so the dataset under /content/drive/MyDrive is readable
# and the fine-tuned model can be written back there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Using Titles

In [3]:
# Load the balanced corpus, shift every label up by one (the value_counts
# check that follows shows classes 0, 1, 2) and keep only the columns this
# experiment uses.
df = (
    pd.read_json('/content/drive/MyDrive/fake_news_project/biased_news/tf_data/balanced_data_extremes.json')
    .assign(bias=lambda frame: frame['bias'] + 1)
    .loc[:, ['title', 'bias']]
)
# Fixed seed -> the same 80/20 split on every run.
train_set, test_set = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
)

In [4]:
# Sanity check: number of rows per label.
df.loc[:, 'bias'].value_counts()

0    39787
1    39787
2    39787
Name: bias, dtype: int64

In [5]:
# Tokenizer for the same 'bert-base-uncased' checkpoint the classifier is
# loaded from.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [6]:
def biased_news_gen_title(df, tokenizer, max_length=32):
    """Build a generator function that yields tokenized titles with labels.

    Args:
        df: DataFrame exposing a ``title`` column (text to encode) and a
            ``bias`` column (the label), read via ``itertuples``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True)`` and returning a mapping
            from feature name to a value ``np.array`` can convert.
        max_length: Value forwarded to the tokenizer's ``max_length``
            argument. Defaults to 32, the length previously hard-coded here.

    Returns:
        A zero-argument function. Each call returns a fresh generator of
        ``(features, label)`` pairs, where ``features`` maps every key of the
        tokenizer output to a NumPy array. A callable (rather than a
        generator object) is what ``tf.data.Dataset.from_generator`` expects.
    """
    def g():
        for row in df.itertuples():
            text = row.title
            label = row.bias
            # Encode the local `text` so the variable is no longer dead code.
            tokenized = tokenizer(text,
                                  max_length=max_length,
                                  padding='max_length',
                                  truncation=True)
            yield {k: np.array(tokenized[k]) for k in tokenized}, label
    return g

input_names = ['input_ids', 'token_type_ids', 'attention_mask']
# Legacy (dtype, shape) descriptions, kept so any cell that still reads these
# names keeps working.
data_types = ({k: tf.int32 for k in input_names}, tf.int64)
data_shapes = ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([]))
# `output_signature` supersedes the deprecated output_types/output_shapes
# arguments of from_generator; it describes the same structure: a dict of
# 1-D int32 features plus a scalar int64 label.
data_signature = (
    {k: tf.TensorSpec(shape=[None], dtype=tf.int32) for k in input_names},
    tf.TensorSpec(shape=[], dtype=tf.int64),
)

# Training stream: shuffle inside a 100-example buffer, batch by 32 and cycle
# through the split four times.
biased_news_train = tf.data.Dataset.from_generator(
    biased_news_gen_title(train_set, tokenizer),
    output_signature=data_signature,
).shuffle(100).batch(32).repeat(4)

# Held-out stream: a single ordered pass. Without shuffle/repeat every
# validation round reads the same batches and model.evaluate() sees each
# example exactly once.
biased_news_test = tf.data.Dataset.from_generator(
    biased_news_gen_title(test_set, tokenizer),
    output_signature=data_signature,
).batch(32)

In [7]:
# Pretrained encoder with a freshly initialised 3-way classification head.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Labels are integer class ids, hence the sparse loss; from_logits=True means
# the loss expects unnormalised scores.
model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics='accuracy',
)

In [9]:
# Stop once val_loss has not improved for 2 consecutive epochs and roll the
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [10]:
# Up to 15 epochs; each epoch draws 128 training batches and validates on
# 32 batches of the held-out dataset.
history = model.fit(
    biased_news_train,
    validation_data=biased_news_test,
    epochs=15,
    steps_per_epoch=128,
    validation_steps=32,
    callbacks=[early_stopping],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 11: early stopping


## Using Content

In [None]:
# Same corpus as the other experiments, reduced to the article body and the
# label (shifted up by one).
df = (
    pd.read_json('/content/drive/MyDrive/fake_news_project/biased_news/tf_data/balanced_data_extremes.json')
    .assign(bias=lambda frame: frame['bias'] + 1)
    .loc[:, ['content', 'bias']]
)
# Fixed seed -> the same 80/20 split on every run.
train_set, test_set = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Tokenizer for the same 'bert-base-uncased' checkpoint the classifier is
# loaded from.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [None]:
def biased_news_gen_content(df, tokenizer, max_length=512):
    """Build a generator function that yields tokenized article bodies with labels.

    Args:
        df: DataFrame exposing a ``content`` column (text to encode) and a
            ``bias`` column (the label), read via ``itertuples``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True)`` and returning a mapping
            from feature name to a value ``np.array`` can convert.
        max_length: Value forwarded to the tokenizer's ``max_length``
            argument. Defaults to 512, the length previously hard-coded here.

    Returns:
        A zero-argument function. Each call returns a fresh generator of
        ``(features, label)`` pairs, where ``features`` maps every key of the
        tokenizer output to a NumPy array.
    """
    def g():
        for row in df.itertuples():
            text = row.content
            label = row.bias
            # Encode the local `text` so the variable is no longer dead code.
            tokenized = tokenizer(text,
                                  max_length=max_length,
                                  padding='max_length',
                                  truncation=True)
            yield {k: np.array(tokenized[k]) for k in tokenized}, label
    return g

input_names = ['input_ids', 'token_type_ids', 'attention_mask']
# Legacy (dtype, shape) descriptions, kept so any cell that still reads these
# names keeps working.
data_types = ({k: tf.int32 for k in input_names}, tf.int64)
data_shapes = ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([]))
# `output_signature` supersedes the deprecated output_types/output_shapes
# arguments of from_generator; it describes the same structure: a dict of
# 1-D int32 features plus a scalar int64 label.
data_signature = (
    {k: tf.TensorSpec(shape=[None], dtype=tf.int32) for k in input_names},
    tf.TensorSpec(shape=[], dtype=tf.int64),
)

# Training stream: shuffle inside a 100-example buffer, batch by 8 and cycle
# through the split four times.
biased_news_train = tf.data.Dataset.from_generator(
    biased_news_gen_content(train_set, tokenizer),
    output_signature=data_signature,
).shuffle(100).batch(8).repeat(4)

# Held-out stream: a single ordered pass. Without shuffle/repeat every
# validation round reads the same batches and model.evaluate() sees each
# example exactly once.
biased_news_test = tf.data.Dataset.from_generator(
    biased_news_gen_content(test_set, tokenizer),
    output_signature=data_signature,
).batch(8)

In [None]:
# Pretrained encoder with a freshly initialised 3-way classification head.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

In [None]:
# Labels are integer class ids, hence the sparse loss; from_logits=True means
# the loss expects unnormalised scores.
model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics='accuracy',
)

In [None]:
# Stop once val_loss has not improved for 2 consecutive epochs and roll the
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Up to 15 epochs; each epoch draws 128 training batches and validates on
# 32 batches of the held-out dataset.
history = model.fit(
    biased_news_train,
    validation_data=biased_news_test,
    epochs=15,
    steps_per_epoch=128,
    validation_steps=32,
    callbacks=[early_stopping],
)

## Using both title and content

In [3]:
# Same corpus, keeping both text fields plus the label (shifted up by one).
df = (
    pd.read_json('/content/drive/MyDrive/fake_news_project/biased_news/tf_data/balanced_data_extremes.json')
    .assign(bias=lambda frame: frame['bias'] + 1)
    .loc[:, ['title', 'content', 'bias']]
)
# Fixed seed -> the same 80/20 split on every run.
train_set, test_set = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
)

In [4]:
# Tokenizer for the same 'bert-base-uncased' checkpoint the classifier is
# loaded from.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
def biased_news_gen_title_content(df, tokenizer):
    """Return a generator function over `df` for title + content inputs.

    For every row the title and the content are joined with a single space
    and passed to `tokenizer` with ``max_length=256``,
    ``padding='max_length'`` and ``truncation=True``. The returned
    zero-argument function yields ``(features, label)`` pairs: ``features``
    maps each key of the tokenizer output to a NumPy array and ``label`` is
    the row's ``bias`` value.
    """
    encode_options = dict(max_length=256, padding='max_length', truncation=True)

    def g():
        for record in df.itertuples():
            encoded = tokenizer(record.title + ' ' + record.content, **encode_options)
            features = {name: np.array(values) for name, values in encoded.items()}
            yield features, record.bias

    return g

input_names = ['input_ids', 'token_type_ids', 'attention_mask']
# Legacy (dtype, shape) descriptions, kept so any cell that still reads these
# names keeps working.
data_types = ({k: tf.int32 for k in input_names}, tf.int64)
data_shapes = ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([]))
# `output_signature` supersedes the deprecated output_types/output_shapes
# arguments of from_generator; it describes the same structure: a dict of
# 1-D int32 features plus a scalar int64 label.
data_signature = (
    {k: tf.TensorSpec(shape=[None], dtype=tf.int32) for k in input_names},
    tf.TensorSpec(shape=[], dtype=tf.int64),
)

# Training stream: shuffle inside a 100-example buffer, batch by 16 and cycle
# through the split four times.
biased_news_train = tf.data.Dataset.from_generator(
    biased_news_gen_title_content(train_set, tokenizer),
    output_signature=data_signature,
).shuffle(100).batch(16).repeat(4)

# Held-out stream: a single ordered pass. Without shuffle/repeat every
# validation round reads the same batches and model.evaluate() sees each
# example exactly once instead of four times.
biased_news_test = tf.data.Dataset.from_generator(
    biased_news_gen_title_content(test_set, tokenizer),
    output_signature=data_signature,
).batch(16)

In [6]:
# Pretrained encoder with a freshly initialised 3-way classification head.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Labels are integer class ids, hence the sparse loss; from_logits=True means
# the loss expects unnormalised scores. This run uses a 4e-5 learning rate.
model.compile(
    optimizer=Adam(learning_rate=4e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics='accuracy',
)

In [8]:
# Stop once val_loss has not improved for 4 consecutive epochs and roll the
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
    verbose=1,
)

In [9]:
# Up to 30 epochs; each epoch draws 128 training batches and validates on
# 32 batches of the held-out dataset.
history = model.fit(
    biased_news_train,
    validation_data=biased_news_test,
    epochs=30,
    steps_per_epoch=128,
    validation_steps=32,
    callbacks=[early_stopping],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 18: early stopping


In [12]:
# Returns [loss, accuracy] for the held-out dataset.
model.evaluate(
    biased_news_test,
)



[0.36471977829933167, 0.8560717105865479]

In [14]:
# Persist the fine-tuned title+content classifier to Drive.
model.save_pretrained(
    '/content/drive/MyDrive/final_bias_news_model'
)

## Using left and right only

In [3]:
# Binary variant: keep only the two extreme classes (labels 0 and 2 after the
# +1 shift) and relabel 2 -> 1 so the targets are 0/1.
# Built as one non-mutating chain: the original assigned through .loc on a
# boolean-filtered slice, which is the pattern pandas flags with
# SettingWithCopyWarning; every step here returns a new frame instead.
df = (
    pd.read_json('/content/drive/MyDrive/fake_news_project/biased_news/tf_data/balanced_data_extremes.json')
    .assign(bias=lambda frame: frame['bias'] + 1)
    .loc[lambda frame: frame['bias'].isin([0, 2]), ['title', 'content', 'bias']]
    .assign(bias=lambda frame: frame['bias'].replace({2: 1}))
)
# Fixed seed -> the same 80/20 split on every run.
train_set, test_set = train_test_split(df, test_size=0.20, random_state=42)

In [4]:
# Sanity check: number of rows per label after the binary relabelling.
df.loc[:, 'bias'].value_counts()

0    39787
1    39787
Name: bias, dtype: int64

In [5]:
# Tokenizer for the same 'bert-base-uncased' checkpoint the classifier is
# loaded from.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [6]:
def biased_news_gen_title_content(df, tokenizer):
    """Return a generator function over `df` for title + content inputs.

    For every row the title and the content are joined with a single space
    and passed to `tokenizer` with ``max_length=256``,
    ``padding='max_length'`` and ``truncation=True``. The returned
    zero-argument function yields ``(features, label)`` pairs: ``features``
    maps each key of the tokenizer output to a NumPy array and ``label`` is
    the row's ``bias`` value.
    """
    encode_options = dict(max_length=256, padding='max_length', truncation=True)

    def g():
        for record in df.itertuples():
            encoded = tokenizer(record.title + ' ' + record.content, **encode_options)
            features = {name: np.array(values) for name, values in encoded.items()}
            yield features, record.bias

    return g

input_names = ['input_ids', 'token_type_ids', 'attention_mask']
# Legacy (dtype, shape) descriptions, kept so any cell that still reads these
# names keeps working.
data_types = ({k: tf.int32 for k in input_names}, tf.int64)
data_shapes = ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([]))
# `output_signature` supersedes the deprecated output_types/output_shapes
# arguments of from_generator; it describes the same structure: a dict of
# 1-D int32 features plus a scalar int64 label.
data_signature = (
    {k: tf.TensorSpec(shape=[None], dtype=tf.int32) for k in input_names},
    tf.TensorSpec(shape=[], dtype=tf.int64),
)

# Training stream: shuffle inside a 100-example buffer, batch by 16 and cycle
# through the split four times.
biased_news_train = tf.data.Dataset.from_generator(
    biased_news_gen_title_content(train_set, tokenizer),
    output_signature=data_signature,
).shuffle(100).batch(16).repeat(4)

# Held-out stream: a single ordered pass. Without shuffle/repeat every
# validation round reads the same batches and model.evaluate() sees each
# example exactly once.
biased_news_test = tf.data.Dataset.from_generator(
    biased_news_gen_title_content(test_set, tokenizer),
    output_signature=data_signature,
).batch(16)

In [7]:
# Only labels 0 and 1 remain in this experiment, so state the head size
# explicitly instead of relying on the library default.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Labels are integer class ids, hence the sparse loss; from_logits=True means
# the loss expects unnormalised scores.
model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics='accuracy',
)

In [9]:
# Stop once val_loss has not improved for 4 consecutive epochs and roll the
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
    verbose=1,
)

In [10]:
 history = model.fit(
                    biased_news_train,
                    validation_data=biased_news_test, 
                    epochs=30,
                    steps_per_epoch=128,
                    validation_steps=32,
                    callbacks=[early_stopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 16: early stopping
