In [None]:
# Install Pillow constrained to versions below 7.
# NOTE(review): the reason for this upper bound is not visible in the notebook — confirm it is still needed.
%pip install "pillow<7"

In [None]:
# Install the libraries used by the notebook into the kernel's environment.
# NOTE(review): none of these packages is version-pinned, so a re-run may pull different releases.
%pip install scikit-learn
%pip install nltk pandas numpy matplotlib seaborn wordcloud transformers torch Lightning
%pip install --upgrade pip accelerate
%pip install fastcore -U

In [None]:
# The dataset must be located at "./data/train_data.csv"
# NOTE(review): the data-loading cell reads "data/train_data.csv.csv" — confirm which file name is correct.
%ls

In [None]:
from os import listdir
from os.path import isfile, join
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import json
import matplotlib
import matplotlib.font_manager as fm
import pathlib

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Name of the dataset file inside ./data (also reused later to name the stats JSON).
# NOTE(review): the doubled ".csv.csv" extension differs from the path mentioned in the earlier comment — confirm.
data_file = "train_data.csv.csv"
# The file is parsed as tab-separated with '\r' as the record terminator.
news_d = pd.read_csv("data/"+data_file, sep='\t', lineterminator='\r')
print("Shape of News data:", news_d.shape)
print("News data columns", news_d.columns)
news_d.head()

In [None]:
# Set up the output directories

# Root folder for everything the notebook writes (images, results, logs, saved model).
out_path = "fake-news-javi"

# Hugging Face model identifier used for both the tokenizer and the classifier.
model_name = "dccuchile/bert-base-spanish-wwm-uncased"

# Folder where every figure is saved.
images = "./"+out_path+"/images/"

# makedirs creates the parent folder (out_path) together with the images
# folder and does nothing when they already exist, so the cell is re-runnable.
os.makedirs(images, exist_ok=True)

In [None]:
# Number of whitespace-separated tokens per article, followed by summary statistics.
txt_length = news_d.text.str.split().str.len()
txt_length.describe()

In [None]:
# Bar chart with the number of rows per label; saved to the images folder.
label_plot = sns.countplot(x="label", data=news_d);
label_plot.set(xlabel='Label', ylabel='Number of news')
fig = label_plot.get_figure()
fig.savefig(images+"label_percentage.png", dpi=300)
# Legend for the label encoding, then absolute and percentage counts per label.
print("1: Fake")
print("0: True")
print("Distribution of labels:")
print(news_d.label.value_counts())
# Shares are rounded to two decimals before being scaled to percentages.
print(round(news_d.label.value_counts(normalize=True),2)*100)

In [None]:
# Histogram of the number of words per article (the loop variable is called
# `title`, but it iterates over the `text` column).
seq_len = [len(title.split()) for title in news_d.text]
pd.Series(seq_len).hist(bins = 40,color='firebrick')
plt.xlabel('Number of words')
plt.ylabel('Number of news')
plt.savefig(images+'words_texts.png', dpi=1200)

In [None]:
# Column configuration for the cleaning helpers.
# `remove_c` is the default list of columns dropped by remove_unused_c and
# `text_f` lists the text columns whose nulls null_process fills in.
# NOTE(review): column_n, categorical_features and target_col are not read
# anywhere else in the visible notebook — confirm whether they are still needed.
column_n = ['text', 'label']
remove_c = []
categorical_features = []
target_col = ['label']
text_f = ['text']

In [None]:
# Clean Datasets
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer
from collections import Counter

# Stemmer and lemmatizer instances; within this cell only `wnl` is used (by nltk_preprocess).
ps = PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()

# Spanish stop-word list; the Counter built from it is only used for
# membership tests (`word not in stopwords_dict`).
stop_words = stopwords.words('spanish')
stopwords_dict = Counter(stop_words)

# Remove unused columns
def remove_unused_c(df,column_n=remove_c):
    """Return a new DataFrame without the columns named in `column_n`.

    Args:
        df: DataFrame to drop columns from; it is not modified.
        column_n: column labels to drop. Defaults to the module-level
            `remove_c` list as it was when this function was defined.

    Returns:
        The DataFrame produced by dropping those columns.
    """
    return df.drop(columns=column_n)

# Impute null values with the string "None"
def null_process(feature_df):
    """Replace missing values in every text column with the string "None".

    The columns processed are the ones listed in the module-level `text_f`.
    `feature_df` is modified in place and also returned.
    """
    for text_col in text_f:
        missing_rows = feature_df[text_col].isna()
        feature_df.loc[missing_rows, text_col] = "None"
    return feature_df

def clean_dataset(df):
    """Drop the unused columns of `df`, then fill its missing text values.

    Returns the DataFrame produced by remove_unused_c after null_process has
    filled the nulls of its text columns.
    """
    without_unused = remove_unused_c(df)
    return null_process(without_unused)

# Cleaning text from unused characters
def clean_text(text):
    """Normalise a raw text: drop URLs and non-letter characters, lower-case.

    The previous implementation passed regular expressions to `str.replace`,
    which only substitutes literal substrings, so none of the patterns ever
    matched and the text was merely lower-cased and stripped.

    Args:
        text: any object; it is converted with `str()` first.

    Returns:
        The lower-cased text containing only letters separated by single
        spaces, with no leading or trailing whitespace.
    """
    text = str(text)
    # Remove URLs.
    text = re.sub(r'http[\w:/\.]+', ' ', text)
    # Collapse every run of non-letters (punctuation, digits, underscores,
    # whitespace) into one space. `\W` is Unicode-aware for str patterns, so
    # accented Spanish letters (á, ñ, ü, ...) are kept; an ASCII-only class
    # such as [^a-zA-Z] would split Spanish words at every accent.
    text = re.sub(r'[\W\d_]+', ' ', text)
    return text.lower().strip()

## NLTK preprocessing: clean the text, drop stop words and lemmatize what is left.
# NOTE(review): an earlier comment stated that only stop-word removal is used,
# but every kept word is also passed through the WordNet lemmatizer — confirm
# that this is intended.
def nltk_preprocess(text):
    """Return `text` cleaned, without stop words and with each word lemmatized.

    The text goes through clean_text, any remaining character that is neither
    a word character nor whitespace is deleted, and the words that are not in
    `stopwords_dict` are lemmatized with `wnl` and joined with single spaces.
    """
    cleaned = clean_text(text)
    tokens = re.sub(r'[^\w\s]', '', cleaned).split()
    kept = (wnl.lemmatize(token) for token in tokens if token not in stopwords_dict)
    return ' '.join(kept)

In [None]:
# Build the cleaned frame used for the corpus statistics and plots below:
# drop unused columns, fill null texts, then preprocess every text.
df = clean_dataset(news_d)
df["text"] = df.text.apply(nltk_preprocess)
df.head()

In [None]:
# Path of the JSON file that receives the vocabulary-overlap statistics,
# and the dictionary that accumulates them.
data_stats_file = "data/stats_"+data_file+".json"
data_stats_dicc = {}

def write_json(data):
    """Write `data` as 4-space-indented JSON to the path in `data_stats_file`.

    The file is opened in write mode with UTF-8 encoding, so any previous
    content is replaced.
    """
    with open(data_stats_file, 'w', encoding="UTF-8") as stats_fp:
        stats_fp.write(json.dumps(data, indent=4))

def intersection(true_n, fake_n, n=2000):
  """Percentage of the top-`n` true-news words that are also top-`n` fake-news words.

  Args:
    true_n: whitespace-separated corpus of the true news.
    fake_n: whitespace-separated corpus of the fake news.
    n: how many of the most frequent words of each corpus to compare.

  Returns:
    100 * |top(true) & top(fake)| / |top(true)|, or 0.0 when the true corpus
    has no words (this used to raise ZeroDivisionError).
  """
  def top_words(corpus):
    # Most frequent `n` distinct words of a corpus, as a set.
    tokens = corpus.split()
    if not tokens:
      return set()
    return set(pd.Series(tokens).value_counts().iloc[:n].index)

  words_trues = top_words(true_n)
  words_fakes = top_words(fake_n)

  if not words_trues:
    return 0.0

  return (len(words_trues & words_fakes)/len(words_trues))*100

# One long string per class: all cleaned texts with label 0 and with label 1.
true_n = ' '.join(df[df['label']==0]['text'])
fake_n = ' '.join(df[df['label']==1]['text'])

# Overlap between the top-x words of both classes for x = 500, 1000, ..., 5000;
# each value is printed and stored under the key str(x).
for x in range(500,5001,500):
  val = intersection(true_n,fake_n, x)
  print(val)
  data_stats_dicc[str(x)] = val
# Record which dataset the statistics belong to and persist them as JSON.
data_stats_dicc["name"] = data_file
write_json(data_stats_dicc)

In [None]:
# Prints the cut-off values 500, 1000, ..., 5000 (the same range as the overlap loop).
# NOTE(review): looks like leftover debugging output — consider removing this cell.
for x in range(500,5001,500):
  print(x)

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

def plot_top_ngrams(corpus, title, ylabel, type, xlabel="Number of occurences", n=2):
    """Plot the 30 most frequent n-grams of `corpus` and save the figure.

    Args:
        corpus: whitespace-separated text to extract n-grams from.
        title: figure title.
        ylabel: label of the vertical axis.
        type: tag appended to the output file name (e.g. "true" / "fake").
        xlabel: label of the horizontal axis.
        n: n-gram size.

    The image is written to `images` as 'word_count_<n>_<type>.png'.
    """
    top_ngrams = (pd.Series(nltk.ngrams(corpus.split(), n)).value_counts())[:30]
    # Draw on a fresh figure: without an explicit `ax`, Series.plot reuses the
    # current axes, so consecutive calls within one cell were drawn on top of
    # each other and every saved image also contained the earlier bars.
    fig, ax = plt.subplots(figsize=(12, 8))
    top_ngrams.sort_values().plot.barh(color='blue', width=.9, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    fig.savefig(images+'word_count_'+str(n)+'_'+type+'.png', bbox_inches = 'tight')

# Bigram and trigram frequency charts for the true-news corpus...
plot_top_ngrams(true_n, 'Top 30 Bigrams in true news', "Bigram", "true", n=2)
plot_top_ngrams(true_n, 'Top 30 Trigrams in true news', "Trigram", "true", n=3)

# ...and for the fake-news corpus.
plot_top_ngrams(fake_n, 'Top 30 Bigrams in fake news', "Bigram", "fake", n=2)
plot_top_ngrams(fake_n, 'Top 30 Trigrams in fake news', "Trigram", "fake", n=3)

In [None]:
# Word cloud over all cleaned texts (both labels together).
text = ' '.join(df['text'])

# Generate the cloud and write it to the images folder.
wordcloud = WordCloud(width=1920, height=1080, background_color='white').generate(text)
wordcloud.to_file(images+"wordcloud.png")

# Show it inline without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import random

In [None]:
def set_seed(seed: int):
    """Seed the random number generators of every available framework.

    Python's `random` and NumPy are always seeded; PyTorch (CPU and all CUDA
    devices) and TensorFlow are seeded only when transformers reports them
    as available.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if is_torch_available():
        for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
            seed_fn(seed)

    if is_tf_available():
        # Imported lazily so the notebook does not require TensorFlow.
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(10)

In [None]:
max_length = 512

In [None]:
# Load the tokenizer that matches the pretrained model.
# NOTE(review): `num_labels` is normally a model/config option; it presumably has
# no effect on the tokenizer — confirm and drop it if so.
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True, num_labels=2)

In [None]:
# Rows of the raw frame that have a text. Note that this is built from `news_d`
# (the unprocessed data), not from the cleaned `df` used for the plots above.
news_df = news_d[news_d['text'].notna()]

In [None]:
# Number of cross-validation folds and the shuffled, seeded splitter.
n=5
kf = KFold(n_splits=n, random_state=10, shuffle=True)

# Per-fold containers filled by prepare_data: element i holds the texts/labels
# of the training or validation part of fold i.
tra_text_datasets = []
tra_label_datasets = []
val_text_datasets = []
val_label_datasets = []

def prepare_data(df):
    """Split the usable rows of `df` into K folds and store them globally.

    A row is usable when its text is truthy and its label, converted to int,
    is 0 or 1. For every split produced by `kf`, the training/validation
    texts and labels are appended to the module-level lists
    `tra_text_datasets`, `tra_label_datasets`, `val_text_datasets` and
    `val_label_datasets`. Nothing is returned.
    """
    pre_data = []
    for text, raw_label in zip(df["text"], df["label"]):
        label = int(raw_label)
        if text and label in [0, 1]:
            pre_data.append([text, label])

    # Column 0 holds the text and column 1 the label.
    pre_data_df = pd.DataFrame(pre_data)

    for train_index, val_index in kf.split(pre_data_df):
      train_part = pre_data_df.iloc[train_index]
      val_part = pre_data_df.iloc[val_index]
      tra_text_datasets.append(train_part[0].values.tolist())
      tra_label_datasets.append(train_part[1].values.tolist())
      val_text_datasets.append(val_part[0].values.tolist())
      val_label_datasets.append(val_part[1].values.tolist())


prepare_data(news_df)

In [None]:
# Sanity check: sizes of the first fold and the first five samples of folds 0 and 1.
print(len(tra_text_datasets[0]), len(tra_label_datasets[0]))
print(len(val_text_datasets[0]), len(val_label_datasets[0]))
print(tra_text_datasets[0][0:5])
print(tra_label_datasets[0][0:5])
print(tra_text_datasets[1][0:5])
print(tra_label_datasets[1][0:5])

In [None]:
# Tokenize the training and validation texts of every fold, truncating each
# text to at most `max_length` tokens and padding within each call.
train_encodings = []
valid_encodings = []
for i in range(n):
  t_encodings = tokenizer(tra_text_datasets[i], truncation=True, padding=True, max_length=max_length)
  v_encodings = tokenizer(val_text_datasets[i], truncation=True, padding=True, max_length=max_length)
  train_encodings.append(t_encodings)
  valid_encodings.append(v_encodings)


In [None]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    `encodings` maps field names to per-sample sequences and `labels` is a
    sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors.

        Every encoding field is converted to a tensor, and the label is
        stored under "labels" as a one-element tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# One training and one validation dataset per fold (despite the singular names,
# both variables are lists indexed by fold).
train_dataset = []
valid_dataset = []
for i in range(n):
  train_dataset.append(NewsGroupsDataset(train_encodings[i], tra_label_datasets[i]))
  valid_dataset.append(NewsGroupsDataset(valid_encodings[i], val_label_datasets[i]))

# Sizes of the first fold's datasets.
print(train_dataset[0].__len__())
print(valid_dataset[0].__len__())

In [None]:
# Load the pretrained model with a two-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
from sklearn.metrics import (accuracy_score, classification_report, f1_score, recall_score, precision_score,
                             precision_recall_curve, confusion_matrix, matthews_corrcoef)

lista = []

def compute_metrics(pred):
  """Compute classification metrics for a Trainer evaluation step.

  Args:
    pred: object exposing `label_ids` (true labels) and `predictions`
      (per-class scores; presumably logits of shape (samples, classes) —
      confirm against the Trainer in use).

  Returns:
    Dict with accuracy, Matthews correlation, precision, recall and weighted
    F1. The same dict is also appended to the module-level `lista`.
  """
  labels = pred.label_ids
  # The predicted class is the index of the highest score. This must happen
  # before any metric is computed: the previous version read `preds` one line
  # before assigning it, which raised UnboundLocalError on every call.
  preds = pred.predictions.argmax(-1)
  dicc = {
      'accuracy': accuracy_score(labels, preds),
      'matthews': matthews_corrcoef(labels, preds),
      'precision': precision_score(labels, preds),
      'recall': recall_score(labels, preds),
      'f1': f1_score(labels, preds, average='weighted')
  }
  lista.append(dicc)
  return dicc

In [None]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import random
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

def get_predictions(texts):
    """Predict the class index of every text with the module-level `model`.

    Args:
        texts: list of strings (a batch) to classify.

    Returns:
        List of predicted class indices (Python ints), one per text; an empty
        list for an empty batch.
    """
    # Nothing to tokenize: callers that slice a dataset in fixed-size batches
    # can end up passing an empty trailing slice.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    # Inference mode: after training the model is left in train mode, which
    # keeps dropout active and makes predictions non-deterministic.
    model.eval()
    with torch.no_grad():
      # Put the inputs on whatever device the model lives on instead of
      # assuming "cuda".
      inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(model.device)

      logits = model(**inputs)[0]

      # softmax is monotonic, so the argmax of the logits is the same class.
      result = logits.argmax(dim=-1).tolist()

    torch.cuda.empty_cache()
    return result

In [None]:
# Number of epochs trained per fold; fold i trains until (i+1)*epoch_step epochs
# in total because each fold resumes from the previous fold's last checkpoint.
epoch_step = 6
num_epochs = epoch_step
# Per-fold evaluation results.
reports = []
mcc = []
matrix = []
precision = []
recall = []
f1 = []

def _checkpoint_step(path):
  """Return the training step encoded in a `checkpoint-<step>` directory name."""
  return int(os.path.basename(path).rpartition("-")[2])

for i in range(n):

  lista_checkpoints = []

  rootdir = out_path + '/results'
  if os.path.isdir(rootdir):
    for file in os.listdir(rootdir):
        d = os.path.join(rootdir, file)
        # Only `checkpoint-<step>` directories can be resumed from.
        if os.path.isdir(d) and file.startswith("checkpoint-") and file.rpartition("-")[2].isdigit():
            lista_checkpoints.append(d)

  # os.listdir returns entries in arbitrary order (and a plain string sort
  # would put "checkpoint-1000" before "checkpoint-500"), so order by step
  # number to make [-1] the most recent checkpoint.
  lista_checkpoints.sort(key=_checkpoint_step)

  print(lista_checkpoints)

  # NOTE(review): `label_names` lists question-answering label names although the
  # datasets provide a "labels" field; with this setting the Trainer presumably
  # does not treat "labels" as the evaluation labels, so compute_metrics may
  # never be invoked. Left unchanged here — confirm before changing it.
  training_args = TrainingArguments(
      output_dir= out_path + '/results',
      num_train_epochs=num_epochs,
      per_device_train_batch_size=10,
      per_device_eval_batch_size=20,
      warmup_steps=100,
      logging_dir= out_path + '/logs',
      logging_steps=200,
      logging_strategy="steps",
      learning_rate=5E-05,
      evaluation_strategy="steps",
      label_names = ["start_positions", "end_positions"],
      do_eval=True,
      eval_steps=500,
      log_level="error"
  )

  # NOTE(review): the same `model` object is trained on every fold, so from the
  # second fold on the validation rows may already have been seen in training.
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset[i],
      eval_dataset=valid_dataset[i],
      compute_metrics=compute_metrics,
  )


  if len(lista_checkpoints)==0:
    trainer.train()
  else:
    trainer.train(resume_from_checkpoint=lista_checkpoints[-1])


  result_model = []
  result_real = val_label_datasets[i]

  # Predict the validation fold in batches of 100. The range stops at the
  # dataset length: the previous `+1` produced an extra, empty batch whenever
  # the length was a multiple of 100.
  for x in range(0,len(val_text_datasets[i]), 100):
    result_model = result_model + get_predictions(val_text_datasets[i][x:x+100])

  print(classification_report(result_real, result_model))

  reports.append(classification_report(result_real, result_model))
  mcc.append(matthews_corrcoef(result_real, result_model))
  matrix.append(confusion_matrix(result_real, result_model))
  precision.append(precision_score(result_real, result_model, average='weighted'))
  recall.append(recall_score(result_real, result_model, average='weighted'))
  f1.append(f1_score(result_real, result_model, average='weighted'))

  # The next fold resumes from the checkpoint and trains epoch_step more epochs.
  num_epochs = num_epochs + epoch_step

  torch.cuda.empty_cache()

In [None]:
#Save validation data
import json

validation_dicc = {}
validation_list = []

for i in range(n):
    # Fold i finished after (i+1)*epoch_step epochs in total.
    val_epoch = (i+1)*epoch_step
    # The value stored as "loss" is 1 - weighted precision of the fold; it is a
    # proxy derived from precision, not a loss reported by the model.
    val_loss = 1-precision[i]
    # Every field belongs to fold i (precision used to be read from fold 0 for all folds).
    validation_list.append({"epoch": val_epoch, "loss": val_loss, 'mcc': mcc[i],  'recall': recall[i], 'f1': f1[i], 'precision': precision[i], 'report': reports[i]})

validation_dicc["log_history"] = validation_list

json_text = json.dumps(validation_dicc, indent = 4)

# The context manager closes the file even if the write fails.
with open(out_path + '/results/validation_state.json', "w", encoding="UTF-8") as f:
    f.write(json_text)

In [None]:
#Save train data

rootdir = out_path + '/results'
lista_checkpoints = []

if os.path.isdir(rootdir):
    for file in os.listdir(rootdir):
        d = os.path.join(rootdir, file)
        # Only `checkpoint-<step>` directories hold a trainer_state.json.
        if os.path.isdir(d) and file.startswith("checkpoint-") and file.rpartition("-")[2].isdigit():
            lista_checkpoints.append(d)

# os.listdir order is arbitrary; sort by step number so that [-1] is the most
# recent checkpoint.
lista_checkpoints.sort(key=lambda path: int(os.path.basename(path).rpartition("-")[2]))

print(lista_checkpoints)

if not lista_checkpoints:
    raise FileNotFoundError("No checkpoint directory found in " + rootdir)

direct = lista_checkpoints[-1]

file_train = direct + "/trainer_state.json"
file_valid = out_path + '/results/validation_state.json'

x_epochs_tra = []
x_epochs_val = []
y_loss_tra = []
y_loss_val = []


with open(file_train, encoding="UTF-8") as json_file:
    data_train = json.load(json_file)

checkpoint = data_train["global_step"]

marks = data_train["log_history"]

# Keep only the log entries that carry both an epoch and a training loss;
# other entries are skipped explicitly instead of through a bare `except`.
for mark in marks:
    if "epoch" in mark and "loss" in mark:
        x_epochs_tra.append(mark["epoch"])
        y_loss_tra.append(mark["loss"])

with open(file_valid, encoding="UTF-8") as json_file:
    data_valid = json.load(json_file)

marks = data_valid["log_history"]

for mark in marks:
    if "epoch" in mark and "loss" in mark:
        x_epochs_val.append(mark["epoch"])
        y_loss_val.append(mark["loss"])



# Anchor the validation curve at epoch 0.
# NOTE(review): 0.4 is a hard-coded starting value, not a measured loss — confirm its origin.
x_epochs_val.insert(0, 0)
y_loss_val.insert(0, 0.4)

print(x_epochs_val)
print(y_loss_val)

plt.plot(x_epochs_tra, y_loss_tra, label = "Train loss")
plt.plot(x_epochs_val, y_loss_val, label = "Validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.savefig(images+'train_valid_loss.png', bbox_inches = 'tight', dpi=1200)

In [None]:
# Save the fine-tuned model and its tokenizer into the output folder so they
# can be reloaded with from_pretrained(out_path).
model_path = out_path
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
#Use model
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import random
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [None]:
# Maximum number of tokens per text passed to the tokenizer.
max_length = 512

# Reload the tokenizer and the model from the folder they were saved to,
# then move the model to the GPU (this line requires a CUDA device).
tokenizer2 = BertTokenizerFast.from_pretrained(out_path, do_lower_case=True, num_labels=2)
model2 = BertForSequenceClassification.from_pretrained(out_path, num_labels=2)
model2.to('cuda')


def get_predictions(texts):
    """Predict the class index of every text with the reloaded `model2`.

    Args:
        texts: list of strings (a batch) to classify.

    Returns:
        List of predicted class indices (Python ints), one per text; an empty
        list for an empty batch.
    """
    # Nothing to tokenize: callers that slice a dataset in fixed-size batches
    # can end up passing an empty trailing slice.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    # Make sure dropout is disabled while predicting.
    model2.eval()
    with torch.no_grad():
      # Put the inputs on whatever device the model lives on instead of
      # assuming "cuda".
      inputs = tokenizer2(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(model2.device)

      logits = model2(**inputs)[0]

      # softmax is monotonic, so the argmax of the logits is the same class.
      result = logits.argmax(dim=-1).tolist()

    torch.cuda.empty_cache()
    return result

In [None]:
# Evaluate the reloaded model on the validation part of fold 0.
# NOTE(review): with K-fold splits these rows belong to the training part of the
# other folds, so this is presumably not a held-out estimate — confirm.
result_model = []
result_real = val_label_datasets[0]
# Batches of 100 texts. The range stops at the dataset length: the previous
# `+1` produced an extra, empty batch whenever the length was a multiple of 100.
for x in range(0,len(val_text_datasets[0]), 100):
  result_model = result_model + get_predictions(val_text_datasets[0][x:x+100])

# Build the report once, then show it and keep it with the per-fold reports.
final_report = classification_report(result_real, result_model)
print(final_report)
reports.append(final_report)