In [28]:
import torch
torch.cuda.is_available()

True

In [29]:
!pip install -U evaluate
!pip install -U datasets
!pip install -U accelerate
!pip install -U transformers
!pip install -q mlflow nlp 2>/dev/null

import numpy as np
import pandas as pd
import re
import evaluate
import accelerate
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from nlp import Dataset



In [30]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/WELFake_Dataset.csv").drop(['Unnamed: 0'], axis=1)
print(df.shape)

df['title'] = df['title'] + '\n\n\n' + df['text']
df = df[['title', 'label']]
df["title"].fillna('Null', inplace=True)
# df = df[~df['title'].isnull()]
print(df.shape)
df.sample(5).T

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(72134, 3)
(72134, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"].fillna('Null', inplace=True)


Unnamed: 0,61370,2189,60609,51565,39431
title,ARNOLD SCHWARZENEGGER Sends A Message To Liber...,WOW! “We Mexicans Need To Kill Donald Trump Be...,Jimmy Carter recovers from dehydration scare i...,2 Friars’ Mission: Reviving a Brooklyn Church ...,Boy With Autism Makes His First Friend Ever An...
label,1,1,0,0,1


In [31]:
# split test and training dataset
x_train, x_test, y_train, y_test = train_test_split(df["title"], df["label"], test_size=0.2, shuffle=True, random_state=42)
print(x_train.values[0:3])

['Elon Musk’s Tesla Stock Up $2 Billion Since Joining Trump’s Team\n\n\nAlthough Tesla CEO Elon Musk shocked Silicon Valley by breaking ranks to become an official member of the White House Manufacturing Jobs Initiative, the value of his Tesla stock is up by over $2 billion since the election of Donald Trump as President. [After a dizzying week of executive orders covering Obamacare, trade and immigration, President Trump held an approval rating of 55 percent of likely voters according to the Rasmussen Reports daily Presidential Tracking Poll. That is only slightly below the 59 percent level of President Obama when he left office.  Voters welcomed President Trump’s decision to scrap the   Partnership (TPP)   deal and agree that the North American Free Trade Agreement (NAFTA) with Mexico and Canada needs to be reworked. Although supporters argue free trade makes products cheaper for U. S. consumers, Americans believe  —   by a whopping 73 percent to 16 percent margin  —    it is more im

In [32]:
# prepare dataset for training
df_train = pd.DataFrame(columns=["title", "label"])
df_test = pd.DataFrame(columns=["title", "label"])
df_train["title"] = x_train
df_train["label"] = y_train
df_test["title"] = x_test
df_test["label"] = y_test
print(df_train[0:3])

df_train = Dataset.from_pandas(df_train)
df_test = Dataset.from_pandas(df_test)
print(df_train)

                                                   title  label
60264  Elon Musk’s Tesla Stock Up $2 Billion Since Jo...      0
42050  Wharton Business School Backers Seek Distance ...      0
62289  Border Patrol Agents Arrest Smuggler After Rol...      0
Dataset(features: {'title': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), '__index_level_0__': Value(dtype='int64', id=None)}, num_rows: 57707)


In [33]:
model_checkpoint = "google/bert_uncased_L-2_H-128_A-2"
max_len = 512

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def preprocess_function(examples):
    return tokenizer(examples["title"], max_length=max_len, padding="max_length", truncation=True)

df_train = df_train.map(preprocess_function, batched=True)
df_test = df_test.map(preprocess_function, batched=True)


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

In [34]:
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, hidden_dropout_prob=0.1)
model.resize_token_embeddings(len(tokenizer)) # need to resize due to new tokens added

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30522, 128, padding_idx=0)

In [35]:
# metric_name = 'f1'
metric_name = 'accuracy'
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"./snapshots/{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_total_limit = 3,
    learning_rate=0.005,
    # learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    fp16=True
)

In [36]:
# Computation of evaluation metric
metric = evaluate.load(metric_name)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    # for f1 metric
    # return metric.compute(predictions=predictions, references=labels, average="micro")
    # for accuracy metric
    return metric.compute(predictions=predictions, references=labels)


In [37]:
trainer = Trainer(
    model,
    args,
    train_dataset=df_train,
    eval_dataset=df_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [38]:
train_log = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7184,0.73719,0.50863
2,0.7,0.692867,0.50863


In [39]:
trainer.evaluate()

{'eval_loss': 0.737189531326294,
 'eval_accuracy': 0.5086296527344563,
 'eval_runtime': 25.3091,
 'eval_samples_per_second': 570.032,
 'eval_steps_per_second': 71.279,
 'epoch': 2.0}

In [40]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cuda:0")
results = classifier(df_test["title"], max_length=max_len, padding="max_length", truncation=True)
dfResults = pd.DataFrame.from_dict(results)
dfResults['label'] = dfResults['label'].str.replace('LABEL_','')
metric = metric.compute(predictions=dfResults['label'].tolist(), references=df_test["label"])
print(metric)

{'accuracy': 0.5086296527344563}
