<a href="https://colab.research.google.com/github/ilkayyagizgur/Fake_News_Training/blob/main/Fake_News_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download and Import Required Libraries


In [None]:
# Remove any preinstalled transformers/accelerate builds, then install both
# again (unpinned, so pip resolves whatever versions are current at run time).
# NOTE(review): presumably a workaround for a version mismatch in the hosted
# runtime — confirm it is still needed and consider pinning versions.
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
# Install further dependencies used by the cells below (none are version-pinned).
# NOTE(review): transformers was already installed by the previous cell, so the
# first line looks redundant.
!pip install transformers
!pip install datasets
!pip install numpy
!pip install evaluate

In [None]:
import json
import os
import uuid

import evaluate
import numpy as np
import pandas as pd
import pyarrow as pa
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths configured
# in the "Model Settings" section all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Model Settings


In [None]:
# Checkpoint identifier handed to AutoTokenizer / AutoModelForSequenceClassification
# .from_pretrained() further down.
choosen_model ='dbmdz/bert-base-turkish-cased'
# Passed as num_labels when the classification model is created.
number_labels = 2

In [None]:
# CSV file that is read with pd.read_csv in the dataset section.
dataset_location ='/content/drive/MyDrive/Bitirme/LastDatasetWithAllAug.csv'

In [None]:
# Passed to TrainingArguments as output_dir.
trainer_output_dir = "/content/drive/MyDrive/Berturk-cased-model"

In [None]:
# Directory the fine-tuned model and the tokenizer are written to with save_pretrained().
saving_location = '/content/drive/MyDrive/Bert/Models/Berturk-cased-model'
# The model is reloaded from the same directory in the "Load Model" section.
loading_location = saving_location

# Choose and Download the Model


In [None]:
# Load the tokenizer and a sequence-classification model for the configured
# checkpoint; num_labels sets how many output classes the model is created with.
tokenizer = AutoTokenizer.from_pretrained(choosen_model)
model = AutoModelForSequenceClassification.from_pretrained(
    choosen_model,
    num_labels=number_labels
)

In [None]:
# Bundle accuracy, F1, precision and recall into one evaluator; it is used by
# compute_metrics during training and again in the manual evaluation cells.
metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# Defining Required Functions


In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the combined ``metrics`` evaluator.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metrics.compute`` for the arg-max predictions measured
        against ``labels``.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metrics.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
def process_data(row):
    """Turn one dataset row into a tokenized training example.

    Args:
        row: Mapping-like row exposing ``'Orj_Text'`` and ``'label'``.

    Returns:
        The tokenizer output for the cleaned text (padded/truncated to 128
        tokens) with two extra entries: ``'labels'`` (1 when ``row['label']``
        equals 1, otherwise 0) and ``'text'`` (the cleaned text).
    """
    # Coerce to str and collapse every run of whitespace into a single space.
    cleaned = ' '.join(str(row['Orj_Text']).split())

    encodings = tokenizer(
        cleaned,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Anything other than an explicit 1 is mapped to class 0.
    encodings['labels'] = 1 if row['label'] == 1 else 0
    encodings['text'] = cleaned
    return encodings

# Import and Process Dataset



In [None]:
# Load the CSV from Drive and preview its first rows as the cell output.
dataset = pd.read_csv(dataset_location)
dataset.head()

In [None]:
# Tokenize the first 20037 rows (or every row, if the CSV is shorter).
# NOTE(review): the reason for the 20037-row cap is not visible here — confirm.
processed_dataset = [
    process_data(dataset.iloc[row_idx])
    for row_idx in range(len(dataset[:20037]))
]

In [None]:
# Inspect the first processed example.
processed_dataset[0]

In [None]:
# Collect the per-row encodings into a single DataFrame.
new_df = pd.DataFrame(processed_dataset)

# Hold out 20% of the rows for validation; random_state makes the split repeatable.
train_df, valid_df = train_test_split(
    new_df,
    test_size=0.2,
    random_state=42
)

In [None]:
# Build the Hugging Face datasets through the public `Dataset.from_pandas`
# constructor instead of wrapping a raw pyarrow table in `Dataset(...)`.
# `preserve_index=False` keeps the shuffled pandas index left behind by
# train_test_split from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
valid_dataset = Dataset.from_pandas(valid_df, preserve_index=False)

# Setting Up Trainer

In [None]:
# Evaluate on the validation set at the end of every epoch.
# `eval_strategy` is the current name of this option; the older
# `evaluation_strategy` keyword is deprecated and no longer accepted by recent
# transformers releases, which is what the unpinned install at the top pulls in.
# NOTE(review): needs a transformers version that knows `eval_strategy`
# (4.41 or newer, as far as I know) — confirm against the runtime.
training_args = TrainingArguments(output_dir=trainer_output_dir, eval_strategy="epoch")

In [None]:
# Wire the model, the training arguments, both dataset splits, the tokenizer and
# the metric callback into a single Trainer.
# NOTE(review): newer transformers releases appear to prefer `processing_class`
# over the `tokenizer` argument — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train and Evaluate


In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset and display the returned metrics.
trainer.evaluate()

# Save Model


In [None]:
# Persist the fine-tuned model to Drive.
model.save_pretrained(saving_location)

In [None]:
# Save the tokenizer into the same directory as the model.
# NOTE(review): legacy_format=False presumably selects the non-legacy tokenizer
# file layout — confirm against the tokenizer documentation.
tokenizer.save_pretrained(saving_location, legacy_format=False)

# Load Model


In [None]:
# Reload the fine-tuned classifier from disk.
loaded_model = AutoModelForSequenceClassification.from_pretrained(loading_location)

# Load the tokenizer that was saved next to the model instead of fetching the
# base checkpoint again, so model and tokenizer always come from the same
# artifact even if `choosen_model` is edited after training.
new_tokenizer = AutoTokenizer.from_pretrained(loading_location)

# Prediction


In [None]:
def get_prediction(text):
    """Predict the class index for ``text`` with the reloaded classifier.

    Args:
        text: Input handed to ``new_tokenizer``; it is padded/truncated to
            128 tokens.

    Returns:
        The index of the largest logit along the last axis (a numpy integer
        for a single input).
    """
    encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    # Move every input tensor to the device the model lives on.
    encoding = {k: v.to(loaded_model.device) for k, v in encoding.items()}

    # Inference only: skip autograd bookkeeping so no graph is kept per call.
    with torch.no_grad():
        logits = loaded_model(**encoding).logits

    # Arg-max is taken on the raw logits. Applying a sigmoid (or softmax) first
    # cannot change the winner because both are monotonic, and a float32 sigmoid
    # saturates to exactly 1.0 for large logits, which could turn a clear winner
    # into a tie that arg-max resolves to index 0.
    label = np.argmax(logits.squeeze().cpu().numpy(), axis=-1)

    return label
      

In [None]:
# Quick sanity check on a single headline; the cell displays the returned label.
get_prediction('Son dakika: Savunma Sanayisinden çok önemli proje! Ve imzalar atıldı')

# Accuracy

In [None]:
# Standalone accuracy metric used by the manual evaluation loop in the next cell.
accuracy = evaluate.load("accuracy")

In [None]:
# Re-score the validation split with the reloaded model, one row at a time.
prediction = []
referance = []
print(len(valid_df))

for row_idx in range(len(valid_df)):
    sample = valid_df.iloc[row_idx]
    true_label = sample["labels"]
    predicted_label = get_prediction(sample["text"])
    print(row_idx)

    # Normalise the stored label to a plain 0/1 int before collecting it.
    referance.append(0 if true_label == 0 else 1)
    prediction.append(predicted_label)

accuracy.add_batch(references=referance, predictions=prediction)

accuracy.compute()


In [None]:
# Score the same predictions with the combined evaluator (accuracy, F1,
# precision, recall). Relies on `prediction` / `referance` filled by the
# manual evaluation loop.
metrics.add_batch(references=referance , predictions=prediction)

metrics.compute()


In [None]:
# Print each prediction next to its reference label (the lists are filled in lockstep).
for predicted_label, expected_label in zip(prediction, referance):
    print(predicted_label, expected_label)

# Azure Back Translation


In [None]:
# Subscription key and endpoint for the Azure Translator resource.
# The key is read from the AZURE_TRANSLATOR_KEY environment variable so the
# secret never has to be typed into (and saved with) the notebook. When the
# variable is unset it falls back to an empty string, the former placeholder.
key = os.environ.get("AZURE_TRANSLATOR_KEY", "")
endpoint = "https://api.cognitive.microsofttranslator.com"

# location, also known as region.
# required if you're using a multi-service or regional (not global) resource. It can be found in the Azure portal on the Keys and Endpoint page.
location = "westeurope"

# Full URL of the translate operation.
path = '/translate'
constructed_url = endpoint + path

In [None]:
# Query parameters sent with every translate request: API version 3.0,
# translating from 'tr' to 'en'.
params = {
    'api-version': '3.0',
    'from': 'tr',
    'to': 'en'
}

# Request headers. They are built once here, so the same X-ClientTraceId value
# is reused by every request that passes this dict.
headers = {
    'Ocp-Apim-Subscription-Key': key,
    # location required if you're using a multi-service or regional (not global) resource.
    'Ocp-Apim-Subscription-Region': location,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}

In [None]:
# Smoke-test the translator endpoint with one sample sentence.
# `body` was never defined before this cell, so the request raised NameError;
# it uses the same [{'text': ...}] shape that TranslateText sends.
# NOTE(review): `requests` is not imported anywhere in the visible notebook —
# confirm it is imported before this cell runs.
body = [{'text': 'Son dakika: Savunma Sanayisinden çok önemli proje! Ve imzalar atıldı'}]
request = requests.post(constructed_url, params=params, headers=headers, json=body)
response = request.json()
# Pretty-print the JSON reply, keeping non-ASCII characters readable.
print(json.dumps(response, sort_keys=True, ensure_ascii=False, indent=4, separators=(',', ': ')))

In [None]:
def TranslateText(text):
    """Translate ``text`` with the Azure Translator endpoint configured above.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    The language pair comes from the module-level ``params`` dict.

    Args:
        text: String to translate.

    Returns:
        The ``text`` field of the first translation of the first result.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within 30 seconds.
    """
    text = text.replace("\n", " ")
    body = [{'text': text}]
    # A timeout keeps a stalled connection from hanging a long augmentation run.
    request = requests.post(constructed_url, params=params, headers=headers, json=body, timeout=30)
    # Surface HTTP failures (bad key, quota, wrong region) directly instead of
    # failing below with an opaque KeyError on the error payload.
    request.raise_for_status()
    response = request.json()
    return response[0]["translations"][0]["text"]

# Data Augmentation


## Non-Model Augmentations


In [None]:
# KeyboardAug instance configured with aug_word_min=10 / aug_word_max=30.
# NOTE(review): `nac` is never imported in the visible notebook — presumably
# `import nlpaug.augmenter.char as nac`; confirm and add the import (and install).
sub_char_by_key = nac.KeyboardAug(aug_word_min=10, aug_word_max=30)

In [None]:
# RandomCharAug in "swap" mode with aug_word_min=10 / aug_word_max=30.
swap_char = nac.RandomCharAug(action="swap", aug_word_min=10, aug_word_max=30)

In [None]:
# RandomCharAug in "delete" mode with aug_word_min=10 / aug_word_max=30.
delete_char = nac.RandomCharAug(action="delete", aug_word_min=10, aug_word_max=30)

In [None]:
# RandomWordAug with its default action and aug_min=10 / aug_max=30.
# NOTE(review): going by the variable name the default action is presumably
# word deletion — confirm. `naw` is also never imported in the visible notebook
# (presumably `import nlpaug.augmenter.word as naw`); confirm and add the import.
delete_word = naw.RandomWordAug(aug_min = 10, aug_max = 30)

In [None]:
# RandomWordAug in 'crop' mode with aug_min=10 / aug_max=30. The two-argument
# augmentation() helper compares its augmenter against this object to decide
# whether short texts need a smaller crop augmenter.
delete_set_words = naw.RandomWordAug(action='crop', aug_min = 10, aug_max = 30)

In [None]:
# SplitAug instance configured with aug_min=10 / aug_max=30.
split_word = naw.SplitAug(aug_min = 10, aug_max = 30)

In [None]:
# RandomWordAug in "swap" mode with aug_min=10 / aug_max=30.
swap_words = naw.RandomWordAug(action="swap",aug_min = 10, aug_max = 30)

In [None]:
def augmentation(df, augmentationModel):
    """Augment every text in ``df["Orj_Text"]`` with ``augmentationModel``.

    Newlines are replaced by spaces first. A text of at most 10 words is sent
    through a freshly built crop augmenter (``aug_min=2``) when
    ``augmentationModel`` equals the module-level ``delete_set_words``; every
    other text goes through ``augmentationModel`` itself. Each result is
    printed as it is produced.

    Args:
        df: DataFrame with an ``"Orj_Text"`` string column.
        augmentationModel: Object exposing ``augment(text)``.

    Returns:
        List with one ``augment`` result per row, in row order.
    """
    cleaned_texts = list(df["Orj_Text"].str.replace("\n", " "))
    results = []
    for sentence in cleaned_texts:
        is_short = len(sentence.split()) <= 10
        if is_short and augmentationModel == delete_set_words:
            # The shared crop augmenter asks for at least 10 words, so short
            # texts get their own augmenter with a smaller minimum.
            augmenter = naw.RandomWordAug(action='crop', aug_min = 2)
        else:
            augmenter = augmentationModel
        augmented = augmenter.augment(sentence)
        results.append(augmented)
        print("Augmented Text:")
        print(augmented)
    return results

## Augmentations With Models


In [None]:
# Contextual "insert" augmenters backed by two Turkish checkpoints.
insert_berturk = naw.ContextualWordEmbsAug(
      model_path='dbmdz/bert-base-turkish-cased', action="insert", aug_min = 10, aug_max = 30)
insert_convberturk = naw.ContextualWordEmbsAug(
      model_path='dbmdz/convbert-base-turkish-cased', action="insert", aug_min = 10, aug_max = 30)

# The list must use the exact lower-case names defined above; the capitalised
# `insert_Berturk` / `insert_Convberturk` spellings raised NameError.
augmentations=[insert_berturk,insert_convberturk]

In [None]:
# Contextual "substitute" augmenters backed by the same two Turkish checkpoints.
substitute_berturk = naw.ContextualWordEmbsAug(
      model_path='dbmdz/bert-base-turkish-cased', action="substitute", aug_min = 10, aug_max = 30)
substitute_convberturk = naw.ContextualWordEmbsAug(
      model_path='dbmdz/convbert-base-turkish-cased', action="substitute", aug_min = 10, aug_max = 30)

# Rebinds `augmentations`, replacing the list built in the "insert" cell.
augmentations=[substitute_berturk,substitute_convberturk]

In [None]:
def augmentation(df):
    """Run every augmenter in the module-level ``augmentations`` list over ``df``.

    Newlines in ``df["Orj_Text"]`` are replaced by spaces first. Each augmenter
    is printed before it is used, and every augmented text is printed as it is
    produced.

    NOTE(review): this definition shares its name with the earlier
    two-argument ``augmentation(df, augmentationModel)``; whichever cell ran
    last is the one in effect.

    Args:
        df: DataFrame with an ``"Orj_Text"`` string column.

    Returns:
        Flat list of ``augment`` results: all rows for the first augmenter,
        then all rows for the next one, and so on.
    """
    cleaned_texts = list(df["Orj_Text"].str.replace("\n", " "))
    results = []
    for augmenter in augmentations:
        print(augmenter)
        for sentence in cleaned_texts:
            augmented = augmenter.augment(sentence)
            results.append(augmented)
            print("Augmented Text:")
            print(augmented)
    return results

# Stop Word Removal


In [None]:
#nltk.download()

In [None]:

# Demo of stop-word filtering on an English sample sentence.
# NOTE(review): `stopwords` and `word_tokenize` are not imported in the visible
# notebook — presumably nltk.corpus.stopwords / nltk.tokenize.word_tokenize;
# confirm and add the imports.
example_sent = """This is a sample sentence,
                  showing off the stop words filtration."""
  
# English stop-word set; remove_stop_words reads this module-level name too.
stop_words = set(stopwords.words('english'))
  
word_tokens = word_tokenize(example_sent)
# Case-insensitive variant: lower-case each token before the membership test.
# This result is overwritten a few lines below, so it is never displayed.
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
# Case-sensitive variant (no lower-casing); this is the list that gets printed.
filtered_sentence = []
  
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
  
print(word_tokens)
print(filtered_sentence)

In [None]:
def remove_stop_words (text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    Tokens come from ``word_tokenize``. Each token is lower-cased for the
    membership test only, so capitalised stop words (e.g. at the start of a
    sentence) are dropped as well; kept tokens retain their original casing.

    Args:
        text: String to filter.

    Returns:
        The kept tokens joined by single spaces, with no trailing space
        (an empty string when nothing is kept).
    """
    word_tokens = word_tokenize(text)
    # Use the case-insensitive filter that was previously computed and then
    # discarded in favour of a case-sensitive loop.
    kept_tokens = [w for w in word_tokens if w.lower() not in stop_words]
    return " ".join(kept_tokens)