In [50]:
!pip install transformers



In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch

In [52]:
data = pd.read_csv("/kaggle/input/arbicfakenews/final_data.csv")

In [53]:
data.head()

Unnamed: 0,title,text,label
0,"فيديو, هل لديك حساسية طعام؟المدة, 25,18",يعاني الكثير الشباب منطقة الشرق الأوسط وشمال أ...,real
1,اخر الاخبار اليوم محافظ المنيا ورئيس الجامعة ي...,الدكتور مصطفي عبد النبي رئيس جامعة المنيا والل...,fake
2,مدبولي يتابع الموقف التنفيذي لمشروع تطوير وتنم...,وأكد رئيس الوزراء المشروع القومي الكبير سيتم إ...,real
3,تسرب بسببها فصل بالكامل.. فاطمة رشدى ضربت الطا...,شكرا لقرائتكم خبر تسرب بسببها فصل بالكامل فاطم...,fake
4,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,real


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366000 entries, 0 to 365999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   365999 non-null  object
 1   text    365923 non-null  object
 2   label   365924 non-null  object
dtypes: object(3)
memory usage: 8.4+ MB


In [55]:
data.describe().T

Unnamed: 0,count,unique,top,freq
title,365999,357533,نصاف بن علية تتلقّى الجرعة الأولى من لقاح فايزر,6
text,365923,365923,كتب دينا دهب نشرت المواقع الإخبارية الساعات ال...,1
label,365924,2,real,203772


In [56]:
data.isnull().sum()

title     1
text     77
label    76
dtype: int64

In [57]:
data = data.dropna()
data = data.dropna(subset=['text', 'label'])

In [58]:
data.isnull().sum()

title    0
text     0
label    0
dtype: int64

In [59]:
data.shape

(365922, 3)

In [60]:
data['text_length'] = data['text'].apply(lambda x: len(str(x)))

In [61]:
data['text_length'].describe
min_length = data['text_length'].min()
min_length

6

In [62]:
num_rows_before = len(data)
data = data[data['text_length'] >= 20]
num_rows_after = len(data)
num_rows_deleted = num_rows_before - num_rows_after
print("Number of rows deleted:", num_rows_deleted)

Number of rows deleted: 15


In [63]:
data['text_length'].describe
min_length = data['text_length'].min()
min_length

20

In [64]:
data['label'].value_counts()

label
real    203766
fake    162141
Name: count, dtype: int64

In [65]:
# Calculate the total number of samples
total_samples = data['label'].count()

# Calculate the percentage for each label
data['label_percentage'] = (data.groupby('label')['label'].transform('count') / total_samples) * 100

# Display the result
print(data[['label', 'label_percentage']].drop_duplicates())

  label  label_percentage
0  real         55.687921
1  fake         44.312079


In [66]:
data['label'] = data['label'].replace({'fake': 1, 'real': 0})

  data['label'] = data['label'].replace({'fake': 1, 'real': 0})


In [67]:
data = data.drop(['label_percentage'], axis=1)
data.head()

Unnamed: 0,title,text,label,text_length
0,"فيديو, هل لديك حساسية طعام؟المدة, 25,18",يعاني الكثير الشباب منطقة الشرق الأوسط وشمال أ...,0,1566
1,اخر الاخبار اليوم محافظ المنيا ورئيس الجامعة ي...,الدكتور مصطفي عبد النبي رئيس جامعة المنيا والل...,1,981
2,مدبولي يتابع الموقف التنفيذي لمشروع تطوير وتنم...,وأكد رئيس الوزراء المشروع القومي الكبير سيتم إ...,0,2784
3,تسرب بسببها فصل بالكامل.. فاطمة رشدى ضربت الطا...,شكرا لقرائتكم خبر تسرب بسببها فصل بالكامل فاطم...,1,1222
4,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,0,853


In [68]:
data['label'].unique()

array([0, 1])

In [69]:
data['text'] = data.apply(lambda x: x['text'][:200] if x['text_length'] > 200 else x['text'], axis=1)

In [70]:
data['text_length'] = data['text'].apply(lambda x: len(str(x)))

In [71]:
data.head()

Unnamed: 0,title,text,label,text_length
0,"فيديو, هل لديك حساسية طعام؟المدة, 25,18",يعاني الكثير الشباب منطقة الشرق الأوسط وشمال أ...,0,200
1,اخر الاخبار اليوم محافظ المنيا ورئيس الجامعة ي...,الدكتور مصطفي عبد النبي رئيس جامعة المنيا والل...,1,200
2,مدبولي يتابع الموقف التنفيذي لمشروع تطوير وتنم...,وأكد رئيس الوزراء المشروع القومي الكبير سيتم إ...,0,200
3,تسرب بسببها فصل بالكامل.. فاطمة رشدى ضربت الطا...,شكرا لقرائتكم خبر تسرب بسببها فصل بالكامل فاطم...,1,200
4,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,0,200


In [72]:
num_rows_before = len(data)
data = data.drop_duplicates(subset=['text'])
num_rows_after = len(data)
num_rows_deleted = num_rows_before - num_rows_after
print("Number of rows deleted:", num_rows_deleted)

Number of rows deleted: 6439


In [73]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 359468 entries, 0 to 365999
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        359468 non-null  object
 1   text         359468 non-null  object
 2   label        359468 non-null  int64 
 3   text_length  359468 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 13.7+ MB


In [74]:
total_samples = data['label'].count()
data['label_percentage'] = (data.groupby('label')['label'].transform('count') / total_samples) * 100
print(data[['label', 'label_percentage']].drop_duplicates())

   label  label_percentage
0      0         55.948235
1      1         44.051765


In [75]:
df = data.sample(n=10000, random_state=1)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 183080 to 195339
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             10000 non-null  object 
 1   text              10000 non-null  object 
 2   label             10000 non-null  int64  
 3   text_length       10000 non-null  int64  
 4   label_percentage  10000 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 468.8+ KB


In [77]:
total_samples = df['label'].count()
df['label_percentage'] = (df.groupby('label')['label'].transform('count') / total_samples) * 100
print(df[['label', 'label_percentage']].drop_duplicates())

        label  label_percentage
183080      1             44.91
159601      0             55.09


=====================================================================================================

In [78]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.2)

In [79]:
train_texts.shape

(8000,)

In [80]:
test_texts.shape

(2000,)

In [81]:
train_labels.shape

(8000,)

In [82]:
test_labels.shape

(2000,)

In [83]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")

In [84]:
train_encodings

{'input_ids': tensor([[  101,  1282, 29830,  ..., 23673, 25573,   102],
        [  101,  1015,  1273,  ...,  1273, 29820,   102],
        [  101,  1270, 23673,  ...,     0,     0,     0],
        ...,
        [  101,  1270, 23673,  ..., 19433,  2324,   102],
        [  101,  1295, 29820,  ..., 29817, 29835,   102],
        [  101,  1288, 29816,  ..., 25573, 29826,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [85]:
test_encodings

{'input_ids': tensor([[  101,  1298, 29835,  ..., 19433,  1296,   102],
        [  101,  1292, 29828,  ..., 22192, 15394,   102],
        [  101,  1270, 23673,  ..., 23673, 14498,   102],
        ...,
        [  101,  1288, 25573,  ..., 29826, 25573,   102],
        [  101,  1270, 17149,  ..., 23673,  1270,   102],
        [  101,  1273, 29820,  ..., 17149, 14498,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [86]:
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels.tolist()))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_labels.tolist()))


In [87]:
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x7e2813b0e650>

In [88]:
test_dataset

<torch.utils.data.dataset.TensorDataset at 0x7e2813b0d420>

In [89]:
# DataLoader
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=16)


In [90]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7e2813b0f3a0>

In [91]:
test_loader

<torch.utils.data.dataloader.DataLoader at 0x7e2813b0c490>

In [92]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [93]:
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

epochs = 15
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [94]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_mask, b_labels = batch
        optimizer.zero_grad()

        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Epoch {epoch+1}/{epochs} Loss: {total_loss/len(train_loader)}")

model.eval()
predictions, true_labels = [], []



Epoch 1/15 Loss: 0.6590947328209877
Epoch 2/15 Loss: 0.554445020198822
Epoch 3/15 Loss: 0.4885511658489704
Epoch 4/15 Loss: 0.4578479617238045
Epoch 5/15 Loss: 0.43471373203396796
Epoch 6/15 Loss: 0.40664354485273363
Epoch 7/15 Loss: 0.3767616932541132
Epoch 8/15 Loss: 0.3404023620784283
Epoch 9/15 Loss: 0.30602585542201993
Epoch 10/15 Loss: 0.27134665636718275
Epoch 11/15 Loss: 0.2328697976768017
Epoch 12/15 Loss: 0.2097666943576187
Epoch 13/15 Loss: 0.18735692460648715
Epoch 14/15 Loss: 0.1713517350386828
Epoch 15/15 Loss: 0.15720759403798729


In [95]:
for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_attention_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()

accuracy = accuracy_score(labels_flat, pred_flat)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.7275


In [96]:
from sklearn.metrics import precision_recall_fscore_support

# Calculate precision, recall, f1-score, and support
precision, recall, f1_score, support = precision_recall_fscore_support(labels_flat, pred_flat)

# Print precision, recall, f1-score, and support
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Support:", support)

Precision: [0.7690957  0.67247387]
Recall: [0.75647668 0.68764846]
F1 Score: [0.762734   0.67997651]
Support: [1158  842]


In [97]:
from sklearn.metrics import confusion_matrix


# Calculate confusion matrix
conf_matrix = confusion_matrix(labels_flat, pred_flat)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[876 282]
 [263 579]]


In [98]:
import joblib
# Save the model
model_path = "bert_sequence_classification_model.joblib"
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")

Model saved to bert_sequence_classification_model.joblib
