#### Mount drive, import libraries, models

In [None]:
# Mount Google Drive at /content/drive; later cells read their data files from under this path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the pinned third-party packages into the runtime.
# NOTE(review): spacy / en_core_web_sm are not referenced by any cell visible in this notebook — confirm they are still needed.
!pip install spacy~=2.0
!spacy download en_core_web_sm
!pip install transformers==4.28.0

In [None]:
import pandas as pd

import re
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import transformers
import joblib

#from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, BertModel, BertTokenizer, BertTokenizerFast
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers.models.bert.modeling_bert import BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### Import and process news article dataset

In [None]:
#Import news article data json file (test set already set aside)
#Later cells read the 'Article ID', 'Body', 'Headline', 'Media Name' and 'Journalist Name' columns of this frame
news_json = pd.read_json('/content/drive/My Drive/Data science/GNI87-json.json')

In [None]:
#Work on the loaded articles under a shorter name (df is an alias of news_json, not a copy)
df = news_json

#Import df of article metadata w/news_opinion labels
label_df = pd.read_csv("/content/drive/My Drive/Data science/All_Metadata.csv")

#Create dict of article IDs/news_opinion labels and map it onto the full text df
label_dict = dict(zip(label_df['Article ID'], label_df['newsop']))
df["newsop"] = df["Article ID"].map(label_dict)

#Drop records with no label; notna() states the intent directly instead of
#applying unary minus to a boolean mask
df = df[df["newsop"].notna()].copy()

#Drop repeated article IDs, then articles whose outlet and body text are both identical
df = df.drop_duplicates('Article ID').drop_duplicates(["Media Name", "Body"])

#Convert labels to numeric format: "Opinion" -> 1, anything else -> 0
df['target'] = np.where(df.newsop == "Opinion", 1, 0)

print(len(df))

#### Create test set

In [None]:
#Balanced development sample: 10,500 articles per class, drawn with a fixed seed
newsdf = df.loc[df['newsop'].eq('News')].sample(n=10500, random_state=42)
opdf = df.loc[df['newsop'].eq('Opinion')].sample(n=10500, random_state=42)
devdf = pd.concat([newsdf, opdf])

In [None]:
#Select only articles not used in development
#(~ is the boolean-mask negation operator; unary minus is not meant for boolean data)
dev_IDs = devdf['Article ID'].to_list()
testdf = df[~df['Article ID'].isin(dev_IDs)]

#Create balanced test sample: 2,500 articles per class, drawn with a fixed seed
testnewsdf = testdf[testdf.newsop == 'News'].sample(n=2500, random_state=42).copy()
testopdf = testdf[testdf.newsop == 'Opinion'].sample(n=2500, random_state=42).copy()
testdf = pd.concat([testnewsdf, testopdf]).copy()

#Remove test set from overall dataset so the training sample cannot overlap with it
test_IDs = testdf['Article ID'].to_list()
df = df[~df['Article ID'].isin(test_IDs)]

#### Create training set

In [None]:
#Balanced training sample: 7,500 articles per class, drawn with a fixed seed
#from the frame that no longer contains the test articles
newsdf = df.loc[df['newsop'].eq('News')].sample(n=7500, random_state=42)
opdf = df.loc[df['newsop'].eq('Opinion')].sample(n=7500, random_state=42)
df = pd.concat([newsdf, opdf])

In [None]:
# Display the number of rows in the training frame.
len(df)

#### Strip leading metadata

In [None]:
#Strip Factiva style metadata pattern from training data
df["Body_clean"] = df["Body"].replace(r'(?:.*\s+)?Media: .*\s+(?:Byline|Author): .*\s+Date: .*\n', '', regex=True)

#Strip Nexis style metadata pattern: for each marker in turn, keep only the
#text that follows the first line containing it
body = df["Body_clean"]
for marker in ('BYLINE: ', 'SECTION: ', 'LENGTH: ', 'DATELINE: '):
    has_marker = body.str.contains(marker, regex=False, na=False)
    #n=1 splits on the first marker line only, so a marker repeated further down
    #no longer cuts the article short; taking the last piece of the split (rather
    #than column 1 of an expanded frame) also avoids a KeyError when no row
    #contains the marker at all
    after_marker = body.str.split(marker + '.*', n=1, regex=True).str[-1]
    body = body.where(~has_marker, after_marker)
df['body_clean'] = body

In [None]:
#Strip Factiva style metadata pattern from test data
testdf["Body_clean"] = testdf["Body"].replace(r'(?:.*\s+)?Media: .*\s+(?:Byline|Author): .*\s+Date: .*\n', '', regex=True)

#Strip Nexis style metadata pattern: for each marker in turn, keep only the
#text that follows the first line containing it
test_body = testdf["Body_clean"]
for marker in ('BYLINE: ', 'SECTION: ', 'LENGTH: ', 'DATELINE: '):
    has_marker = test_body.str.contains(marker, regex=False, na=False)
    #n=1 splits on the first marker line only, so a marker repeated further down
    #no longer cuts the article short; taking the last piece of the split (rather
    #than column 1 of an expanded frame) also avoids a KeyError when no row
    #contains the marker at all
    after_marker = test_body.str.split(marker + '.*', n=1, regex=True).str[-1]
    test_body = test_body.where(~has_marker, after_marker)
testdf['body_clean'] = test_body

#### Create input strings

In [None]:
#Articles longer than 2000 characters are reduced to their first and last 1000
#characters joined by a separator; shorter ones are kept whole.
#Missing bodies are filled before the length test so start_end never contains
#NaN (the fill used to sit only inside the long-article branch, which a missing
#body can never reach, so NaN fell through to the tokenizer input).
train_body = df['body_clean'].fillna(' ')
df['start_end'] = np.where(train_body.str.len() > 2000,
             train_body.str[:1000] + '\n*****\n' + train_body.str[-1000:],
             train_body)

test_body = testdf['body_clean'].fillna(' ')
testdf['start_end'] = np.where(test_body.str.len() > 2000,
             test_body.str[:1000] + '\n*****\n' + test_body.str[-1000:],
             test_body)

In [None]:
#Build the metadata string (headline, outlet, journalist) for both splits;
#missing fields become a single space so the concatenation never yields NaN
for frame in (df, testdf):
    headline = frame['Headline'].fillna(' ')
    outlet = frame['Media Name'].fillna(' ')
    journalist = frame['Journalist Name'].fillna(' ')
    frame['metadata'] = headline + '\n*****\n' + outlet + '\n*****\n' + journalist

#### Create labels

In [None]:
#Numeric targets for each split, copied so later edits to the frames cannot alter them
y_train, y_test = df['target'].copy(), testdf['target'].copy()

#### Define Dataset/helper functions

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with one label per example.

    encodings: mapping from feature name to an indexable per-example sequence
        (only ``.items()`` and integer indexing of each value are used).
    labels: indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset has exactly one example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoded feature of example `idx` to a tensor,
        # then attach its label under the 'labels' key.
        item = {}
        for name, values in self.encodings.items():
            item[name] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
def compute_metrics(pred):
    """Score one evaluation pass.

    `pred` must expose `label_ids` (the reference labels) and `predictions`
    (an array whose last axis is reduced with argmax to obtain the predicted
    class). Returns a dict with the precision, accuracy, recall and F1 values
    under the keys 'precision_score', 'accuracy_score', 'recall_score' and
    'f1_score'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scorers = {
        'precision_score': precision_score,
        'accuracy_score': accuracy_score,
        'recall_score': recall_score,
        'f1_score': f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

### Experiments

#### Start of article

In [None]:
#Input text: the cleaned article body, with missing bodies replaced by a single space
X_train = df['body_clean'].fillna(' ')
X_test = testdf['body_clean'].fillna(' ')

In [None]:
#BERT base tokenization; the default right-side truncation keeps the first max_length tokens
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed before building the model: the classification head is randomly initialised
#when the model is instantiated, which happens before Trainer applies its own seed
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results/start',      # per-experiment directory so runs do not overwrite each other's checkpoints
    logging_dir='logs/start',        # per-experiment log directory
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the seed set above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


In [None]:
# Fine-tune with the Trainer configured in the previous cell.
trainer.train()
# Final evaluation on the Trainer's eval_dataset; report F1 rounded to 4 decimals.
results = trainer.evaluate()
print(round(results['eval_f1_score'], 4))

# Persist the fine-tuned model.
# NOTE(review): this is a relative path, so it is written wherever the kernel's working directory is rather than under the mounted Drive — confirm that is intended.
trainer.save_model('models/start')

#### Metadata + Start of article


In [None]:
#Input text: metadata block, separator, then the cleaned article body.
#fillna keeps a missing body from turning the whole string into NaN (which the
#tokenizer cannot encode), and the separator stops the journalist name from
#running straight into the first word of the body.
X_train = df['metadata'] + '\n*****\n' + df['body_clean'].fillna(' ')
X_test = testdf['metadata'] + '\n*****\n' + testdf['body_clean'].fillna(' ')

In [None]:
#BERT base tokenization of metadata + article start; the default right-side
#truncation keeps the first max_length tokens
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed before building the model: the classification head is randomly initialised
#when the model is instantiated, which happens before Trainer applies its own seed
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,                   # total number of training epochs
    per_device_train_batch_size=16,       # batch size per device during training
    per_device_eval_batch_size=16,        # batch size for evaluation
    learning_rate=2e-5,                   # initial learning rate for Adam optimizer
    warmup_steps=50,                      # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,                    # strength of weight decay
    output_dir='results/start_metadata',  # per-experiment directory so runs do not overwrite each other's checkpoints
    logging_dir='logs/start_metadata',    # per-experiment log directory
    logging_steps=100,                    # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',          # evaluate during fine-tuning so that we can see progress
    seed=42,                              # stated explicitly; same value as the seed set above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


In [None]:
# Fine-tune with the Trainer configured in the previous cell.
trainer.train()
# Final evaluation on the Trainer's eval_dataset; report F1 rounded to 4 decimals.
results = trainer.evaluate()
print(round(results['eval_f1_score'], 4))

# Persist the fine-tuned model.
# NOTE(review): this is a relative path, so it is written wherever the kernel's working directory is rather than under the mounted Drive — confirm that is intended.
trainer.save_model('models/start_metadata')

#### Start of article + End of article

In [None]:
#Input text: start + end of the article.
#fillna guards against rows whose start_end is missing, which the tokenizer cannot encode
X_train = df['start_end'].fillna(' ')
X_test = testdf['start_end'].fillna(' ')


In [None]:
#BERT base tokenization of article start + end; the default right-side
#truncation keeps the first max_length tokens
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed before building the model: the classification head is randomly initialised
#when the model is instantiated, which happens before Trainer applies its own seed
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results/start_end',  # per-experiment directory so runs do not overwrite each other's checkpoints
    logging_dir='logs/start_end',    # per-experiment log directory
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the seed set above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


In [None]:
# Fine-tune with the Trainer configured in the previous cell.
trainer.train()
# Final evaluation on the Trainer's eval_dataset; report F1 rounded to 4 decimals.
results = trainer.evaluate()
print(round(results['eval_f1_score'], 4))

# Persist the fine-tuned model.
# NOTE(review): this is a relative path, so it is written wherever the kernel's working directory is rather than under the mounted Drive — confirm that is intended.
trainer.save_model('models/start_end')

#### Metadata + Start of article + End of article

In [None]:
#Input text: metadata block, separator, then start + end of the article.
#fillna keeps a missing start_end from turning the whole string into NaN, and the
#separator now uses the same five-asterisk form as the one inside the metadata
#and start_end strings (it had four here).
X_train = df['metadata'] + '\n*****\n' + df['start_end'].fillna(' ')
X_test = testdf['metadata'] + '\n*****\n' + testdf['start_end'].fillna(' ')

In [None]:
#BERT base tokenization of metadata + article start + end; the default
#right-side truncation keeps the first max_length tokens
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed before building the model: the classification head is randomly initialised
#when the model is instantiated, which happens before Trainer applies its own seed
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,                       # total number of training epochs
    per_device_train_batch_size=16,           # batch size per device during training
    per_device_eval_batch_size=16,            # batch size for evaluation
    learning_rate=2e-5,                       # initial learning rate for Adam optimizer
    warmup_steps=50,                          # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,                        # strength of weight decay
    output_dir='results/metadata_start_end',  # per-experiment directory so runs do not overwrite each other's checkpoints
    logging_dir='logs/metadata_start_end',    # per-experiment log directory
    logging_steps=100,                        # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',              # evaluate during fine-tuning so that we can see progress
    seed=42,                                  # stated explicitly; same value as the seed set above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


In [None]:
# Fine-tune with the Trainer configured in the previous cell.
trainer.train()
# Final evaluation on the Trainer's eval_dataset; report F1 rounded to 4 decimals.
results = trainer.evaluate()
print(round(results['eval_f1_score'], 4))

# Persist the fine-tuned model.
# NOTE(review): this is a relative path, so it is written wherever the kernel's working directory is rather than under the mounted Drive — confirm that is intended.
trainer.save_model('models/metadata_start_end')

#### End of article + Metadata

In [None]:
#Input text: article body, separator, then the metadata block (the tokenizer in
#the next cell truncates from the left, so the end of the text is what is kept).
#Use the cleaned body like the other body-based experiments: with the raw Body,
#articles short enough to escape truncation would still carry the leading
#Factiva/Nexis header lines that the stripping step exists to remove.
#NOTE(review): confirm the raw Body was not chosen deliberately here.
X_train = df['body_clean'].fillna(' ') + '\n*****\n' + df['metadata']
X_test = testdf['body_clean'].fillna(' ') + '\n*****\n' + testdf['metadata']

In [None]:
#BERT base tokenization of article end + metadata; truncation_side='left' drops
#tokens from the start, so the last max_length tokens are kept
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name, truncation_side='left')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed before building the model: the classification head is randomly initialised
#when the model is instantiated, which happens before Trainer applies its own seed
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,                 # total number of training epochs
    per_device_train_batch_size=16,     # batch size per device during training
    per_device_eval_batch_size=16,      # batch size for evaluation
    learning_rate=2e-5,                 # initial learning rate for Adam optimizer
    warmup_steps=50,                    # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,                  # strength of weight decay
    output_dir='results/end_metadata',  # per-experiment directory so runs do not overwrite each other's checkpoints
    logging_dir='logs/end_metadata',    # per-experiment log directory
    logging_steps=100,                  # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',        # evaluate during fine-tuning so that we can see progress
    seed=42,                            # stated explicitly; same value as the seed set above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


In [None]:
# Fine-tune with the Trainer configured in the previous cell.
trainer.train()
# Final evaluation on the Trainer's eval_dataset; report F1 rounded to 4 decimals.
results = trainer.evaluate()
print(round(results['eval_f1_score'], 4))

# Persist the fine-tuned model.
# NOTE(review): this is a relative path, so it is written wherever the kernel's working directory is rather than under the mounted Drive — confirm that is intended.
trainer.save_model('models/end_metadata')

#### End of article

In [None]:
#Input text: the article body (the tokenizer in the next cell truncates from the
#left, so the end of the text is what is kept).
#Use the cleaned body like the other body-based experiments: with the raw Body,
#articles short enough to escape truncation would still carry the leading
#Factiva/Nexis header lines that the stripping step exists to remove.
#NOTE(review): confirm the raw Body was not chosen deliberately here.
X_train = df['body_clean'].fillna(' ')
X_test = testdf['body_clean'].fillna(' ')

In [None]:
#BERT base tokenization of the article end; truncation_side='left' drops tokens
#from the start, so the last max_length tokens are kept
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name, truncation_side='left')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed before building the model: the classification head is randomly initialised
#when the model is instantiated, which happens before Trainer applies its own seed
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results/end',        # per-experiment directory so runs do not overwrite each other's checkpoints
    logging_dir='logs/end',          # per-experiment log directory
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the seed set above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


In [None]:
# Fine-tune with the Trainer configured in the previous cell.
trainer.train()
# Final evaluation on the Trainer's eval_dataset; report F1 rounded to 4 decimals.
results = trainer.evaluate()
print(round(results['eval_f1_score'], 4))

# Persist the fine-tuned model.
# NOTE(review): this is a relative path, so it is written wherever the kernel's working directory is rather than under the mounted Drive — confirm that is intended.
trainer.save_model('models/end')

#### Metadata, max_length=128

In [None]:
#Input text: the metadata string only, with any missing value replaced by a single space
X_train = df['metadata'].fillna(' ')
X_test = testdf['metadata'].fillna(' ')

In [None]:
#BERT base tokenization of the metadata string; a shorter max_length is used
#for this experiment
model_name = 'bert-base-cased'
max_length = 128
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed before building the model: the classification head is randomly initialised
#when the model is instantiated, which happens before Trainer applies its own seed
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results/metadata',   # per-experiment directory so runs do not overwrite each other's checkpoints
    logging_dir='logs/metadata',     # per-experiment log directory
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the seed set above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


In [None]:
# Fine-tune with the Trainer configured in the previous cell.
trainer.train()
# Final evaluation on the Trainer's eval_dataset; report F1 rounded to 4 decimals.
results = trainer.evaluate()
print(round(results['eval_f1_score'], 4))

# Persist the fine-tuned model.
# NOTE(review): this is a relative path, so it is written wherever the kernel's working directory is rather than under the mounted Drive — confirm that is intended.
trainer.save_model('models/metadata')

#### Headline, max_length=64

In [None]:
#Input text: the raw headline only, with missing headlines replaced by an empty string
X_train = df.Headline.fillna('')
X_test = testdf.Headline.fillna('')

In [None]:
#BERT base Headline tokenizations; a shorter max_length is used for this experiment
model_name = 'bert-base-cased'
max_length = 64
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed before building the model: the classification head is randomly initialised
#when the model is instantiated, which happens before Trainer applies its own seed
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results/headline',   # per-experiment directory so runs do not overwrite each other's checkpoints
    logging_dir='logs/headline',     # per-experiment log directory
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the seed set above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


In [None]:
# Fine-tune with the Trainer configured in the previous cell.
trainer.train()
# Final evaluation on the Trainer's eval_dataset; report F1 rounded to 4 decimals.
results = trainer.evaluate()
print(round(results['eval_f1_score'], 4))

# Persist the fine-tuned model.
# NOTE(review): this is a relative path, so it is written wherever the kernel's working directory is rather than under the mounted Drive — confirm that is intended.
trainer.save_model('models/headline')