<a href="https://colab.research.google.com/github/leman-cap13/my_projects/blob/main/Real_%26_Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

# Prompt for kaggle.json. files.upload() returns a dict mapping each uploaded
# file name to its raw bytes; binding the result (instead of leaving the call
# as the cell's last expression) keeps the API key out of the saved cell output.
uploaded_files = files.upload()

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle (presumably where the kaggle
# CLI looks for its credentials - confirm against the CLI docs).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the real-and-fake-news dataset with the kaggle CLI.
!kaggle datasets download razanaqvi14/real-and-fake-news

In [None]:
import zipfile
# Extract the downloaded archive into the current working directory. The
# context manager closes the archive handle even if extraction raises.
with zipfile.ZipFile('/content/real-and-fake-news.zip', 'r') as zip_ref:
    zip_ref.extractall()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Do not cap the number of columns shown when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Load True.csv (these rows are labelled as real news further down).
df_true=pd.read_csv('/content/True.csv')

In [None]:
# Load Fake.csv (these rows are labelled as fake news further down).
df_fake=pd.read_csv('/content/Fake.csv')

In [None]:
# Display the frame loaded from True.csv.
df_true

In [None]:
# Display the frame loaded from Fake.csv.
df_fake

In [None]:
# Count missing values per column in df_true.
df_true.isna().sum()


In [None]:
# Count missing values per column in df_fake.
df_fake.isna().sum()

In [None]:
# Add a label column to each DataFrame:
df_true["label"] = 1  # Real news
df_fake["label"] = 0  # Fake news

In [None]:
# Stack the real and fake rows into one frame with a fresh 0..n-1 index.
df = pd.concat([df_true, df_fake], ignore_index=True)

In [None]:
# Display the combined frame.
df

In [None]:
# Shuffle the data: sample(frac=1) draws every row in random order (seeded
# with random_state=42), and reset_index(drop=True) renumbers the rows from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

frac=1 – bütün datanı al

sample() – random olaraq qarışdır

reset_index(drop=True) – indexləri sıfırdan başlat

In [None]:
# Display the shuffled frame.
df

In [None]:
df["content"] = df["title"] + " " + df["text"] # merge title and text so both are used as input
# Raw arrays of the merged text and of the 0/1 labels.
X = df["content"].values
y = df["label"].values

In [None]:
# Display the label array.
y

In [None]:
# Tokens are needed next, so load the AutoTokenizer and the matching base
# model for the chosen checkpoint.
from transformers import AutoTokenizer, AutoModel
import torch

model_ckpt = 'distilbert-base-uncased'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Embedding helper
def get_embedding(text):
  """Return the first-token hidden state of `model` for `text` as a NumPy array.

  The input is tokenized with truncation at 512 tokens, moved to `device`,
  and run through `model` with gradient tracking disabled. Position 0 of
  `last_hidden_state` is selected for every sequence, singleton dimensions
  are squeezed out, and the result is copied to the CPU.
  """
  encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
  encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
  with torch.no_grad():
    hidden_states = model(**encoded).last_hidden_state
  first_token = hidden_states[:, 0, :]
  return first_token.squeeze().cpu().numpy()

In [None]:
# Collect one embedding per article, in DataFrame row order.
embeddings = [get_embedding(article) for article in df['content']]

In [None]:
# Update X: replace the raw-text array with the embeddings as a NumPy array.
X=np.array(embeddings)

In [None]:
# Display the embedding array.
X

In [None]:
# Display the label array again, next to the features.
y


In [None]:
# train_test_split is used to divide the data into train and test sets.
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression stops after 100 solver iterations by default, which is
# typically too few for dense, unscaled transformer embeddings and ends with
# an unconverged fit (ConvergenceWarning). Raise the cap so it can converge.
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)

In [None]:
# Score the classifier on the held-out test split.
lr_clf.score(X_test, y_test)

In [None]:
# Score the classifier on the training split, for comparison with the test score.
lr_clf.score(X_train, y_train)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
  """Plot a confusion matrix normalized over the true labels.

  Args:
    y_preds: predicted class labels.
    y_true: ground-truth class labels.
    labels: tick labels passed to the display as `display_labels`.
  """
  normalized_cm = confusion_matrix(y_true, y_preds, normalize='true')
  figure, axis = plt.subplots(figsize=(6, 6))
  ConfusionMatrixDisplay(
      confusion_matrix=normalized_cm,
      display_labels=labels,
  ).plot(cmap='Blues', values_format='.2f', ax=axis, colorbar=False)
  plt.title('Normalized confusion matrix')
  plt.show()
y_preds = lr_clf.predict(X_test)
# confusion_matrix orders classes by sorted label value (0, then 1). The
# labels were encoded as 0 = fake and 1 = real, so the display labels must
# be given in that order; ['Real', 'Fake'] would swap the axis captions.
plot_confusion_matrix(y_preds, y_test, labels=['Fake', 'Real'])

In [None]:
# Take the first article's merged text as a sample.
sample_text = df['content'][0]

# Tokenize it again:
inputs = tokenizer(sample_text, return_tensors='pt', truncation=True, padding=True, max_length=512)

# Convert the token ids back into token strings:
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(tokens)


In [None]:
from sklearn.metrics import classification_report
# Print the text report comparing the logistic-regression predictions
# with the test labels.
print(classification_report(y_test, y_preds))

# Fine-tuning

In [None]:
# Fine-tuning: load the checkpoint with a sequence-classification head.

from transformers import  AutoModelForSequenceClassification

num_labels = 2
# NOTE(review): this rebinds the global `model` that get_embedding() reads.
# Calling get_embedding() after this cell will run this classification model
# instead of the base model - confirm that is never needed, or use a
# separate name.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

In [None]:
# Prepare the evaluation metrics.
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
  """Return accuracy and weighted F1 for a prediction object.

  `pred.label_ids` holds the reference labels and `pred.predictions` the
  per-class scores; the predicted class is the argmax over the last axis.
  """
  references = pred.label_ids
  predicted = pred.predictions.argmax(-1)
  return {
      'accuracy': accuracy_score(references, predicted),
      'f1': f1_score(references, predicted, average='weighted'),
  }

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login; the training arguments set
# push_to_hub=True and the last cell pushes to the Hub.
notebook_login()

In [None]:
from transformers import TrainingArguments

batch_size = 64
model_name = f'{model_ckpt}-finetuned-fake_true_news'

training_args = TrainingArguments(
    # Checkpoints are written under a directory named after the run.
    output_dir=model_name,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch; keep progress bars, log errors only.
    eval_strategy='epoch',
    disable_tqdm=False,
    log_level='error',
    # Publish to the Hugging Face Hub.
    push_to_hub=True,
)

In [None]:
# Split the merged texts and labels 80/20 for fine-tuning, using the same
# test_size and random_state as the earlier embedding split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["content"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42
)

In [None]:
from datasets import Dataset

# Wrap the Python lists in Dataset objects with "text" and "label" columns.
train_dataset = Dataset.from_dict(dict(text=train_texts, label=train_labels))
val_dataset = Dataset.from_dict(dict(text=val_texts, label=val_labels))


In [None]:
def tokenize_function(example):
    """Tokenize the "text" field of `example`.

    Sequences are truncated to 512 tokens and padded to exactly that
    length, so every encoded row has the same size.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=512)
    return tokenizer(example["text"], **encode_options)

In [None]:
# Tokenize in batches, then drop the raw "text" column from each split.
train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(["text"])
val_dataset = val_dataset.map(tokenize_function, batched=True).remove_columns(["text"])

In [None]:
from transformers import Trainer

# Bundle the model, arguments, data, tokenizer and metric function.
# NOTE(review): newer transformers releases may prefer `processing_class`
# over `tokenizer` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured on the trainer.
trainer.train()

In [None]:
# Run prediction over the validation dataset.
preds_output=trainer.predict(val_dataset)

In [None]:
# Display the prediction output.
preds_output

In [None]:
# Push to the Hugging Face Hub with the given commit message.
trainer.push_to_hub(commit_message='Training completed')