#### Mount drive, import libraries, models

In [None]:
#Mount Google Drive at /content/drive; the dataset files read later in this notebook live under that path
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install spacy~=2.0
!spacy download en_core_web_sm
!pip install transformers==4.28.0

In [None]:
import pandas as pd

import re
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import transformers
import joblib

#from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, BertModel, BertTokenizer, BertTokenizerFast
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers.models.bert.modeling_bert import BertForSequenceClassification
from transformers import Trainer, TrainingArguments

#Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### Import and process news article dataset

In [None]:
#Import news article data json file (test set already set aside)
#Loaded from Google Drive into a DataFrame; requires the drive mount above to have succeeded
news_json = pd.read_json('/content/drive/My Drive/Data science/GNI87-json.json')

In [None]:
#Work on the article frame under a shorter name (this is an alias of news_json, not a copy)
df = news_json

#Import df of article metadata w/news_opinion labels
label_df = pd.read_csv("/content/drive/My Drive/Data science/All_Metadata.csv")

#Create dict of article IDs/news_opinion labels and map it onto the full text df
label_dict = dict(zip(label_df['Article ID'], label_df['newsop']))

df["newsop"] = df["Article ID"].map(label_dict)

#Drop records with no label (notna() instead of arithmetic negation of a boolean mask)
df = df[df["newsop"].notna()].copy()

#Drop repeated article IDs, then articles whose outlet and body text are identical
df = df.drop_duplicates('Article ID').drop_duplicates(["Media Name", "Body"])

#Convert labels to numeric format: Opinion -> 1, everything else -> 0
df['target'] = np.where(df.newsop == "Opinion", 1, 0)

print(len(df))

#### Create test set

In [None]:
#Draw 10,500 articles per class with a fixed seed and stack them into a balanced development set
newsdf = df.loc[df['newsop'] == 'News'].sample(n=10500, random_state=42).copy()
opdf = df.loc[df['newsop'] == 'Opinion'].sample(n=10500, random_state=42).copy()
devdf = pd.concat([newsdf, opdf])

In [None]:
#Select only articles not used in development
dev_IDs = devdf['Article ID'].to_list()
testdf = df[~df['Article ID'].isin(dev_IDs)]

#Create balanced test sample: 2,500 articles per class, fixed seed
testnewsdf = testdf[testdf.newsop == 'News'].sample(n=2500, random_state=42).copy()
testopdf = testdf[testdf.newsop == 'Opinion'].sample(n=2500, random_state=42).copy()
testdf = pd.concat([testnewsdf, testopdf]).copy()

#Remove test set from overall dataset so later sampling from df cannot leak test articles
test_IDs = testdf['Article ID'].to_list()
df = df[~df['Article ID'].isin(test_IDs)]

#### Create training set

In [None]:
#Draw 7,500 articles per class from what is left in df and make that balanced sample the training frame
newsdf = df.loc[df['newsop'] == 'News'].sample(n=7500, random_state=42).copy()
opdf = df.loc[df['newsop'] == 'Opinion'].sample(n=7500, random_state=42).copy()
df = pd.concat([newsdf, opdf])

In [None]:
#Sanity check: number of rows in the training frame
len(df)

15000

#### Strip leading metadata

In [None]:
#Strip Factiva style metadata pattern from training data
df["Body_clean"] = df["Body"].replace(r'(?:.*\s+)?Media: .*\s+(?:Byline|Author): .*\s+Date: .*\n' ,'', regex=True)

#Strip Nexis style metadata pattern: for each marker in turn, rows containing it keep only the
#text that follows the marker line (up to any later occurrence of the same marker).
#The result is built up in 'body_clean' (lower case), starting from the Factiva-cleaned 'Body_clean'.
df['body_clean'] = df['Body_clean']
for marker in ('BYLINE: ', 'SECTION: ', 'LENGTH: ', 'DATELINE: '):
    #.str[1] on the list-valued split yields NaN when there is no second piece, whereas
    #expand=True)[1] raises KeyError if not a single row contains the marker.
    after_marker = df['body_clean'].str.split(marker + '.*', regex=True).str[1]
    df['body_clean'] = np.where(df['body_clean'].str.contains(marker), after_marker, df['body_clean'])

In [None]:
#Strip Factiva style metadata pattern from test data
testdf["Body_clean"] = testdf["Body"].replace(r'(?:.*\s+)?Media: .*\s+(?:Byline|Author): .*\s+Date: .*\n' ,'', regex=True)

#Strip Nexis style metadata pattern: for each marker in turn, rows containing it keep only the
#text that follows the marker line (up to any later occurrence of the same marker).
#The result is built up in 'body_clean' (lower case), starting from the Factiva-cleaned 'Body_clean'.
testdf['body_clean'] = testdf['Body_clean']
for marker in ('BYLINE: ', 'SECTION: ', 'LENGTH: ', 'DATELINE: '):
    #.str[1] on the list-valued split yields NaN when there is no second piece, whereas
    #expand=True)[1] raises KeyError if not a single row contains the marker.
    after_marker = testdf['body_clean'].str.split(marker + '.*', regex=True).str[1]
    testdf['body_clean'] = np.where(testdf['body_clean'].str.contains(marker), after_marker, testdf['body_clean'])

#### Create input strings

In [None]:
#Bodies longer than 2000 characters are reduced to their first and last 1000 characters joined by a
#separator line; shorter bodies are kept whole.
#Missing bodies are filled before branching: NaN fails the length test, so filling only inside the
#long-text branch would still leave NaN in start_end.
train_body = df.body_clean.fillna(' ')
df['start_end'] = np.where(train_body.str.len() > 2000,
             train_body.str[0:1000] + '\n*****\n' + train_body.str[-1000:],
             train_body)

test_body = testdf.body_clean.fillna(' ')
testdf['start_end'] = np.where(test_body.str.len() > 2000,
             test_body.str[0:1000] + '\n*****\n' + test_body.str[-1000:],
             test_body)

In [None]:
#One metadata string per article: headline, outlet and journalist, separated by a line of asterisks.
#Missing fields are replaced by a single space so the concatenation never produces NaN.
df['metadata'] = (
    df['Headline'].fillna(' ')
    + '\n*****\n'
    + df['Media Name'].fillna(' ')
    + '\n*****\n'
    + df['Journalist Name'].fillna(' ')
)

testdf['metadata'] = (
    testdf['Headline'].fillna(' ')
    + '\n*****\n'
    + testdf['Media Name'].fillna(' ')
    + '\n*****\n'
    + testdf['Journalist Name'].fillna(' ')
)

#### Create labels

In [None]:
#Numeric targets (1 = Opinion, 0 = otherwise) for the training and test frames
y_train = df.target.copy()
y_test = testdf.target.copy()

#### Define Dataset/helper functions

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with labels so they can be indexed as a torch Dataset.

    encodings: mapping from field name to a per-example sequence of values.
    labels: per-example labels, indexable by position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor, then attach its label.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
def compute_metrics(pred):
    """Score a batch of predictions.

    `pred` must expose `label_ids` (true labels) and `predictions` (per-class scores);
    the predicted class is the argmax over the last axis of `predictions`.
    Returns a dict with precision, accuracy, recall and F1.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'precision_score': precision_score(y_true, y_pred),
        'accuracy_score': accuracy_score(y_true, y_pred),
        "recall_score": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
    }

### Experiments

#### Baseline experiment w/no preprocessing

In [None]:
#Baseline input: the cleaned article body, with missing bodies replaced by a single space
X_train = df['body_clean'].fillna(' ').copy()
X_test = testdf['body_clean'].fillna(' ').copy()

In [None]:
#Baseline: bert-base-cased fine-tuned on the current X_train, truncated/padded to at most 512 tokens
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed all RNGs before the model is built: the classification head is not part of the checkpoint and is
#randomly initialised inside from_pretrained, so without this every run starts from different head weights.
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results',          # output directory
    logging_dir='logs',            # directory for storing logs
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the set_seed call above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
#Fine-tune, then score the final model on the evaluation dataset passed to the Trainer
trainer.train()
results = trainer.evaluate()
#compute_metrics returns 'f1_score'; it is read back here under the key 'eval_f1_score'
print(round(results['eval_f1_score'], 4))

#Save the fine-tuned model under models/bert_model
trainer.save_model('models/bert_model')



Step,Training Loss,Validation Loss,Precision Score,Accuracy Score,Recall Score,F1 Score
100,0.504,0.401634,0.917124,0.8342,0.7348,0.815901
200,0.3656,0.323349,0.867264,0.8774,0.8912,0.879069
300,0.3267,0.306857,0.870229,0.888,0.912,0.890625
400,0.3155,0.302582,0.888181,0.8902,0.8928,0.890485
500,0.2984,0.307801,0.89694,0.8944,0.8912,0.894061
600,0.2906,0.28394,0.867263,0.894,0.9304,0.897723
700,0.3154,0.272003,0.883712,0.8986,0.918,0.90053
800,0.2843,0.287574,0.882443,0.8982,0.9188,0.900255
900,0.2773,0.287935,0.89165,0.9034,0.9184,0.904828
1000,0.2447,0.301779,0.867658,0.8956,0.9336,0.899422


0.9037


#### Metadata + body (start)


In [None]:
#Metadata followed by the cleaned body. Missing bodies are filled as in the baseline so no input is NaN,
#and the metadata block is closed with the same '\n****\n' separator the metadata + start/end experiment
#uses, so the journalist name does not run straight into the first word of the article.
X_train = df.metadata + '\n****\n' + df.body_clean.fillna(' ')
X_test = testdf.metadata + '\n****\n' + testdf.body_clean.fillna(' ')

In [None]:
#Metadata + body (start): bert-base-cased on the current X_train, truncated/padded to at most 512 tokens
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed all RNGs before the model is built: the classification head is not part of the checkpoint and is
#randomly initialised inside from_pretrained, so without this each experiment starts from different head weights.
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results',          # output directory
    logging_dir='logs',            # directory for storing logs
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the set_seed call above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
#Fine-tune, then score the final model on the evaluation dataset passed to the Trainer
trainer.train()
results = trainer.evaluate()
#compute_metrics returns 'f1_score'; it is read back here under the key 'eval_f1_score'
print(round(results['eval_f1_score'], 4))

#Save the fine-tuned model under models/body_start_metadata
trainer.save_model('models/body_start_metadata')



Step,Training Loss,Validation Loss,Precision Score,Accuracy Score,Recall Score,F1 Score
100,0.5435,0.383792,0.811973,0.857619,0.926853,0.865618
200,0.3384,0.305672,0.842601,0.886667,0.948027,0.89221
300,0.3084,0.249658,0.916914,0.906667,0.892204,0.90439
400,0.2725,0.269684,0.918651,0.907143,0.891242,0.904739
500,0.2725,0.249066,0.943098,0.905714,0.861405,0.900402
600,0.269,0.234824,0.936428,0.917143,0.893167,0.914286
700,0.2329,0.282857,0.95524,0.902381,0.842156,0.895141
800,0.2826,0.234188,0.949738,0.914286,0.872955,0.909729
900,0.2446,0.208697,0.915033,0.928571,0.943215,0.92891
1000,0.2357,0.207958,0.925785,0.931429,0.936477,0.9311


#### Start of Article + End of Article

In [None]:
#Start + end of article. Missing values are filled as in the baseline, because start_end is NaN
#wherever the cleaned body was missing and the tokenizer only accepts strings.
X_train = df['start_end'].fillna(' ').copy()
X_test = testdf['start_end'].fillna(' ').copy()


In [None]:
#Start + end of article: bert-base-cased on the current X_train, truncated/padded to at most 512 tokens
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed all RNGs before the model is built: the classification head is not part of the checkpoint and is
#randomly initialised inside from_pretrained, so without this each experiment starts from different head weights.
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results',          # output directory
    logging_dir='logs',            # directory for storing logs
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the set_seed call above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
#Fine-tune, then score the final model on the evaluation dataset passed to the Trainer
trainer.train()
results = trainer.evaluate()
#compute_metrics returns 'f1_score'; it is read back here under the key 'eval_f1_score'
print(round(results['eval_f1_score'], 4))

#Save the fine-tuned model under models/start_end
trainer.save_model('models/start_end')



Step,Training Loss,Validation Loss,Precision Score,Accuracy Score,Recall Score,F1 Score
100,0.4926,0.33731,0.909968,0.884286,0.842262,0.874807
200,0.3338,0.295872,0.829016,0.882857,0.952381,0.886427
300,0.3339,0.267246,0.873596,0.9,0.925595,0.898844
400,0.2594,0.233129,0.917889,0.927143,0.931548,0.924668
500,0.242,0.248252,0.949045,0.922857,0.886905,0.916923
600,0.2125,0.239974,0.930723,0.928571,0.919643,0.92515
700,0.2251,0.242518,0.960784,0.922857,0.875,0.915888
800,0.2142,0.249626,0.954984,0.924286,0.883929,0.918083
900,0.1792,0.234548,0.935976,0.928571,0.91369,0.924699
1000,0.1614,0.268475,0.928358,0.93,0.925595,0.926975


0.9261


#### Metadata + Start of article + End of article

In [None]:
#Metadata, a separator line, then start + end of article. start_end is filled as in the baseline,
#because it is NaN wherever the cleaned body was missing and NaN would propagate through the concatenation.
X_train = df['metadata'].copy() + '\n****\n' + df.start_end.fillna(' ')
X_test = testdf['metadata'].copy() + '\n****\n' + testdf.start_end.fillna(' ')

In [None]:
#Metadata + start + end of article: bert-base-cased on the current X_train, truncated/padded to at most 512 tokens
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed all RNGs before the model is built: the classification head is not part of the checkpoint and is
#randomly initialised inside from_pretrained, so without this each experiment starts from different head weights.
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results',          # output directory
    logging_dir='logs',            # directory for storing logs
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the set_seed call above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
#Fine-tune, then score the final model on the evaluation dataset passed to the Trainer
trainer.train()
results = trainer.evaluate()
#compute_metrics returns 'f1_score'; it is read back here under the key 'eval_f1_score'
print(round(results['eval_f1_score'], 4))

#Save the fine-tuned model under models/metadata_start_end
trainer.save_model('models/metadata_start_end')



Step,Training Loss,Validation Loss,Precision Score,Accuracy Score,Recall Score,F1 Score
100,0.5009,0.297612,0.881207,0.8915,0.905,0.892945
200,0.3488,0.278995,0.866917,0.8915,0.925,0.895017
300,0.2809,0.250098,0.905567,0.908,0.911,0.908275
400,0.2903,0.231334,0.910872,0.9195,0.93,0.920336
500,0.2404,0.25625,0.92378,0.917,0.909,0.916331
600,0.2488,0.254912,0.943677,0.9175,0.888,0.914992
700,0.211,0.234576,0.924303,0.926,0.928,0.926148
800,0.1871,0.280454,0.890244,0.916,0.949,0.918683
900,0.2022,0.264564,0.929293,0.925,0.92,0.924623
1000,0.1986,0.255331,0.931034,0.925,0.918,0.924471


0.9303


#### End of article + metadata

In [None]:
#Article body followed by a separator line and the metadata string; missing bodies become a single space.
#NOTE(review): this reads the raw 'Body' column, not the metadata-stripped 'body_clean' used by the
#earlier experiments - confirm that is intended.
X_train = df['Body'].fillna(' ') + '\n*****\n' + df['metadata']
X_test = testdf['Body'].fillna(' ') + '\n*****\n' + testdf['metadata']

In [None]:
#End of article + metadata: bert-base-cased on the current X_train, at most 512 tokens.
#truncation_side='left' drops tokens from the beginning, so the end of each input is what is kept.
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name, truncation_side='left')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed all RNGs before the model is built: the classification head is not part of the checkpoint and is
#randomly initialised inside from_pretrained, so without this each experiment starts from different head weights.
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results',          # output directory
    logging_dir='logs',            # directory for storing logs
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the set_seed call above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
#Fine-tune, then score the final model on the evaluation dataset passed to the Trainer
trainer.train()
results = trainer.evaluate()
#compute_metrics returns 'f1_score'; it is read back here under the key 'eval_f1_score'
print(round(results['eval_f1_score'], 4))

#Save the fine-tuned model under models/end_metadata
trainer.save_model('models/end_metadata')



Step,Training Loss,Validation Loss,Precision Score,Accuracy Score,Recall Score,F1 Score
100,0.5272,0.327592,0.893861,0.8785,0.859,0.876084
200,0.3663,0.528618,0.755521,0.824,0.958,0.844797
300,0.3635,0.293525,0.929032,0.899,0.864,0.895337
400,0.3274,0.291343,0.846709,0.8845,0.939,0.890469
500,0.2884,0.280945,0.899804,0.907,0.916,0.90783
600,0.27,0.269364,0.918605,0.896,0.869,0.893114
700,0.228,0.28378,0.927505,0.901,0.87,0.897833
800,0.2264,0.306506,0.906439,0.904,0.901,0.903711
900,0.2354,0.250528,0.910803,0.9145,0.919,0.914883
1000,0.228,0.296777,0.926349,0.911,0.893,0.909369


0.9128


#### End of article

In [None]:
#Article body only; missing bodies become a single space.
#NOTE(review): this reads the raw 'Body' column, not the metadata-stripped 'body_clean' used by the
#earlier experiments - confirm that is intended.
X_train = df['Body'].fillna(' ').copy()
X_test = testdf['Body'].fillna(' ').copy()

In [None]:
#End of article: bert-base-cased on the current X_train, at most 512 tokens.
#truncation_side='left' drops tokens from the beginning, so the end of each input is what is kept.
model_name = 'bert-base-cased'
max_length = 512
tokenizer = BertTokenizer.from_pretrained(model_name, truncation_side='left')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)
test_encodings  = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=max_length, return_attention_mask=True)

train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

#Seed all RNGs before the model is built: the classification head is not part of the checkpoint and is
#randomly initialised inside from_pretrained, so without this each experiment starts from different head weights.
transformers.set_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='results',          # output directory
    logging_dir='logs',            # directory for storing logs
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
    seed=42,                         # stated explicitly; same value as the set_seed call above
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
#Fine-tune, then score the final model on the evaluation dataset passed to the Trainer
trainer.train()
results = trainer.evaluate()
#compute_metrics returns 'f1_score'; it is read back here under the key 'eval_f1_score'
print(round(results['eval_f1_score'], 4))

#Save the fine-tuned model under models/end
trainer.save_model('models/end')



Step,Training Loss,Validation Loss,Precision Score,Accuracy Score,Recall Score,F1 Score
100,0.5207,0.348769,0.908267,0.8605,0.802,0.851832
200,0.397,0.293673,0.893145,0.89,0.886,0.889558
300,0.2917,0.280139,0.881553,0.893,0.908,0.894581
400,0.3077,0.283502,0.905102,0.897,0.887,0.89596
500,0.2755,0.285982,0.930283,0.895,0.854,0.890511
600,0.2721,0.329055,0.940698,0.879,0.809,0.869892
700,0.2276,0.338202,0.935484,0.878,0.812,0.869379
800,0.2248,0.315236,0.916842,0.896,0.871,0.893333


#### Metadata, max_length=128

In [None]:
#Metadata string only (headline, outlet, journalist); any missing value becomes a single space
X_train = df['metadata'].fillna(' ').copy()
X_test = testdf['metadata'].fillna(' ').copy()

In [None]:
# BERT base (cased) set-up for the metadata text, capped at 128 tokens.
model_name = 'bert-base-cased'
max_length = 128
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize both splits with identical settings (truncate to max_length, pad,
# and return attention masks).
train_encodings = tokenizer(
    X_train.tolist(),
    max_length=max_length,
    truncation=True,
    padding=True,
    return_attention_mask=True,
)
test_encodings = tokenizer(
    X_test.tolist(),
    max_length=max_length,
    truncation=True,
    padding=True,
    return_attention_mask=True,
)

# Pair each set of encodings with its labels (y_train / y_test are defined
# outside this cell).
train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

# Pretrained encoder with a fresh two-label classification head, moved to `device`.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)

# NOTE(review): the Headline experiment further down uses the same
# output_dir='results' and logging_dir='logs'; checkpoints and logs from
# different runs presumably end up mixed in those directories — confirm.
training_args = TrainingArguments(
    # Where artefacts are written.
    output_dir='results',
    logging_dir='logs',
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings; warm-up is kept short because the dataset is small.
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=50,
    # Log frequently and evaluate on a step schedule so progress is visible
    # during fine-tuning.
    logging_steps=100,
    evaluation_strategy='steps',
)

# The held-out test set doubles as the evaluation set here; there is no
# separate validation split in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# Run fine-tuning for the metadata model with the Trainer configured in the cell above.
trainer.train()
# Final evaluation pass on the Trainer's eval_dataset; `results` is indexed by metric name.
results = trainer.evaluate()
# Report the evaluation F1, rounded to four decimal places.
print(round(results['eval_f1_score'], 4))

# Save the fine-tuned model under the relative path 'models/metadata'.
# NOTE(review): relative to the current working directory — on Colab presumably the
# ephemeral VM disk, not Google Drive; confirm.
trainer.save_model('models/metadata')



Step,Training Loss,Validation Loss,Precision Score,Accuracy Score,Recall Score,F1 Score
100,0.5785,0.454434,0.758598,0.8098,0.9088,0.826934
200,0.3858,0.37099,0.82914,0.8506,0.8832,0.855317
300,0.3454,0.373734,0.777043,0.835,0.9396,0.850625
400,0.3177,0.313906,0.859076,0.8608,0.8632,0.861133
500,0.3201,0.367132,0.804393,0.8548,0.9376,0.865903
600,0.3204,0.32207,0.805337,0.857,0.9416,0.868154
700,0.3348,0.301197,0.865265,0.8752,0.8888,0.876875
800,0.307,0.310657,0.842494,0.8714,0.9136,0.876607
900,0.3054,0.30867,0.903129,0.8762,0.8428,0.871922
1000,0.2424,0.366881,0.890968,0.8844,0.876,0.883421


0.8887


In [None]:
# Persist only the model's weights (its state_dict) to the mounted Google Drive.
model_save_name = 'metadata.pt'
path = "/content/drive/MyDrive/" + model_save_name
torch.save(model.state_dict(), path)

In [None]:
# Save the metadata model to Google Drive as well (the training cell above only
# saved it to the relative path 'models/metadata').
# NOTE(review): the recorded output of this cell is a NameError, and `trainer` is the
# only name used here — so it was apparently run in a session where the training
# cells had not been executed. Re-run the metadata set-up and training cells first.
trainer.save_model('/content/drive/MyDrive/models/metadata')

NameError: ignored

#### Headline, max_length=64

In [None]:
# Headline text from the training frame (df) and the test frame (testdf);
# missing headlines are replaced with empty strings.
# NOTE(review): an earlier comment here mentioned texthero, but no texthero
# call is made in this cell — the only processing is the fillna below.
X_train = df['Headline'].fillna('').copy()
X_test = testdf['Headline'].fillna('').copy()

In [None]:
# BERT base (cased) set-up for the headline text, capped at 64 tokens.
model_name = 'bert-base-cased'
max_length = 64
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize both splits with identical settings (truncate to max_length, pad,
# and return attention masks).
train_encodings = tokenizer(
    X_train.tolist(),
    max_length=max_length,
    truncation=True,
    padding=True,
    return_attention_mask=True,
)
test_encodings = tokenizer(
    X_test.tolist(),
    max_length=max_length,
    truncation=True,
    padding=True,
    return_attention_mask=True,
)

# Pair each set of encodings with its labels (y_train / y_test are defined
# outside this cell).
train_dataset = MyDataset(train_encodings, y_train.tolist())
test_dataset = MyDataset(test_encodings, y_test.tolist())

# Pretrained encoder with a fresh two-label classification head, moved to `device`.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)

# NOTE(review): the Metadata experiment above uses the same
# output_dir='results' and logging_dir='logs'; checkpoints and logs from
# different runs presumably end up mixed in those directories — confirm.
training_args = TrainingArguments(
    # Where artefacts are written.
    output_dir='results',
    logging_dir='logs',
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings; warm-up is kept short because the dataset is small.
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=50,
    # Log frequently and evaluate on a step schedule so progress is visible
    # during fine-tuning.
    logging_steps=100,
    evaluation_strategy='steps',
)

# The held-out test set doubles as the evaluation set here; there is no
# separate validation split in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# Run fine-tuning for the headline model with the Trainer configured in the cell above.
trainer.train()
# Final evaluation pass on the Trainer's eval_dataset; `results` is indexed by metric name.
results = trainer.evaluate()
# Report the evaluation F1, rounded to four decimal places.
print(round(results['eval_f1_score'], 4))

# Save the fine-tuned model under the relative path 'models/bert_headline_model'.
# NOTE(review): relative to the current working directory — on Colab presumably the
# ephemeral VM disk, not Google Drive; confirm.
trainer.save_model('models/bert_headline_model')



Step,Training Loss,Validation Loss,Precision Score,Accuracy Score,Recall Score,F1 Score
100,0.6525,0.504761,0.683146,0.752857,0.904762,0.778489
200,0.4298,0.412854,0.81791,0.824286,0.815476,0.816692
300,0.4108,0.38379,0.775457,0.821429,0.883929,0.826147
400,0.3512,0.383127,0.792,0.832857,0.883929,0.835443
500,0.2717,0.437273,0.865204,0.852857,0.821429,0.842748
600,0.2945,0.390629,0.80654,0.841429,0.880952,0.842105
700,0.274,0.405914,0.849693,0.845714,0.824405,0.836858
800,0.252,0.38732,0.854545,0.854286,0.839286,0.846847
900,0.1943,0.432231,0.868098,0.862857,0.842262,0.854985
1000,0.1761,0.462393,0.852507,0.861429,0.860119,0.856296
