In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
!pip install ipython-autotime
%load_ext autotime
!pip install -q swifter

time: 6.23 s (started: 2023-07-26 07:01:28 +00:00)


In [3]:
%cd drive/MyDrive/Neuromatch

/content/drive/MyDrive/Neuromatch
time: 3.03 ms (started: 2023-07-26 07:01:34 +00:00)


In [4]:
# Basic imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

# torch imports
import torch
import torch.nn as nn

# svg imports
from IPython import display
display.set_matplotlib_formats('svg')

time: 2.55 s (started: 2023-07-26 07:01:34 +00:00)


  display.set_matplotlib_formats('svg')


In [5]:
true_data = pd.read_csv('datasets/True.csv')
fake_data = pd.read_csv('datasets/Fake.csv')

time: 2.38 s (started: 2023-07-26 07:01:42 +00:00)


In [6]:
print('fake shape: ', fake_data.shape)
print('True shape: ', true_data.shape)

fake shape:  (23481, 4)
True shape:  (21417, 4)
time: 755 µs (started: 2023-07-26 07:01:55 +00:00)


In [7]:
true_data["label"] = np.ones(len(true_data),dtype=int)
fake_data["label"] = np.zeros(len(fake_data),dtype=int)

time: 3.74 ms (started: 2023-07-26 07:02:32 +00:00)


In [8]:
fake = fake_data[['title', 'text', 'label']]
true = true_data[['title', 'text', 'label']]

time: 45.3 ms (started: 2023-07-26 07:03:29 +00:00)


In [10]:
data = pd.concat([fake, true], ignore_index = True, sort = False)
data.head()

Unnamed: 0,title,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0


time: 33.2 ms (started: 2023-07-26 07:05:42 +00:00)


In [11]:
data = data.sample(frac = 1)
data.head()

Unnamed: 0,title,text,label
44767,"In war-torn Darfur, new U.S. aid chief stresse...","ZAM ZAM CAMP, North Darfur (Reuters) - Washing...",1
32490,Factbox: Why the Zika virus is causing alarm,Global health officials are racing to better u...,1
298,"Trump’s Press Secretary Falls Apart, Exposes ...",Anyone who is faced with the unfortunate task ...,0
970,GOP Operative Connected To Trump’s Ousted Top...,An explosive report in the Wall Street Journal...,0
12259,REPUBLICAN WINNING STREAK CONTINUES: Democrats...,Rep. Tim Ryan (D. Ohio) had launched a bid to ...,0


time: 51.3 ms (started: 2023-07-26 07:05:51 +00:00)


In [12]:
data['texts'] = data['title'] + ' '+ data['text']

time: 164 ms (started: 2023-07-26 07:06:23 +00:00)


In [13]:
data.isnull().sum()

title    0
text     0
label    0
texts    0
dtype: int64

time: 70.4 ms (started: 2023-07-26 07:15:13 +00:00)


In [14]:
if torch.cuda.is_available():
    device = torch.device('cuda')

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print("NO GPU available. So, switched to CPU")
    device = torch.device('cpu')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4
time: 204 ms (started: 2023-07-26 07:15:26 +00:00)


In [15]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig

time: 5.99 s (started: 2023-07-26 07:15:51 +00:00)


In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',                  # model name
    num_labels = 2,                       # total number of labels
    output_attentions = False,            # Whether the model returns attention weights
    output_hidden_states = False)         # Whether the model returns all hidden-state
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

time: 11.6 s (started: 2023-07-26 07:16:22 +00:00)


In [None]:
sentences = data.texts.values
labels = data.label.values

# sentences = sentences[:5000]
# labels = labels[:5000]
# store the input_ids and attention_masks
input_ids = []
attention_masks = []

for sent in tqdm(sentences):
    encoded_dict = tokenizer.encode_plus(
    sent,                               # taking each sentence and process
    add_special_tokens = True,          # adding [CLS] + sentences + [SEP]
    max_length = 75,                    # maximum length of the sentences
    pad_to_max_length = True,
    return_attention_mask = True,       # Getting attention mask [0,0,1,1]
    return_tensors = 'pt')              # It will return the output as pytorch format

    input_ids.append(encoded_dict['input_ids'])     # appending
    attention_masks.append(encoded_dict['attention_mask'])

  0%|          | 0/44898 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
 65%|██████▍   | 29040/44898 [07:18<02:29, 106.13it/s]

In [None]:
# Conver the input_ids, attention_masks, and labels to tensor!
input_ids = torch.cat(input_ids, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [None]:
print(type(input_ids))

print(type(attention_masks))

print(type(labels))

print(np.unique(labels))

print('\nOriginal: ', sentences[1])
print('\nToken IDs: ', input_ids[1])
print('\n Label: ', labels[1])

In [None]:
# Seperate Training and Validation split!
from torch.utils.data import TensorDataset, random_split

# combine the all inputs to tensor dataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# determine the range of split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# divide the split
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [None]:
# Let's create a DataLoader
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

## Dataset needs the batch size for training, recomended batch size is 16 or 32
batch_size = 32

## create a dataloader for our training and validation split
train_dataloader = DataLoader(train_dataset,
                             sampler = RandomSampler(train_dataset),  # we need to randomize the training data
                             batch_size = batch_size )
validation_dataloader = DataLoader(val_dataset,
                                  sampler = SequentialSampler(val_dataset),  # we need to infer the test data sequentially!
                                  batch_size = batch_size)

In [None]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [None]:
optimizer = AdamW(model.parameters(),
                 lr = 5e-5,
                 eps = 1e-8)  # epsilion rate

from transformers import get_linear_schedule_with_warmup
# Number of training epochs. The BERT authors recommend between 2 and 4.
epochs = 2

# The Total Number of training steps is [number of batches] * [number of epochs]
total_steps = len(train_dataloader) * epochs

# create a learning rate scheduler!
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis =1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


#  create a helper function to get the time
import time
import random
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
# setting the seeds!
SEED_VAL = 66
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []


# total training time!
total_t0 = time.time()

for epoch_i in tqdm(range(0, epochs)):
    t0 = time.time()  # start time

    # reset the total loss for this epoch
    total_train_loss = 0

    model.train()  # make our model to train mode

    for step, batch in enumerate(train_dataloader):
        """
        batch[0] -> input_ids
        batch[1] -> attention_mask
        batch[2] -> labels
        """

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Before starting training we need to make the gradinet as zero
        model.zero_grad()

        output = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask, labels = b_labels)
        loss = output[0]
        logits = output[1]

        total_train_loss += loss.item()
        loss.backward()
        # clip the norm of the gradients to 1.0
        # This is to help prevent the "exploading gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    # validation
    t0 = time.time()
    model.eval()  # make model to evaluvation

    # tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            output = model(b_input_ids, attention_mask = b_input_mask, labels = b_labels, token_type_ids = None)
            loss = output[0]
            logits = output[1]

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()   # move variable GPU to CPU
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

In [7]:
data = pd.concat((true_data,fake_data),axis=0)

time: 7.84 ms (started: 2023-07-26 06:14:42 +00:00)


In [8]:
data = data.drop("date",axis=1)

time: 9.66 ms (started: 2023-07-26 06:14:42 +00:00)


In [11]:
%cd ..

/content/drive/MyDrive
time: 2.72 ms (started: 2023-07-26 06:16:21 +00:00)


In [12]:
!ls

'Colab Notebooks'		  FND		   kaggle.json	    Neuromatch
 fake-and-real-news-dataset.zip   Invoice.gsheet   MPRNet-Article
time: 105 ms (started: 2023-07-26 06:16:27 +00:00)


In [13]:
%cd FND/Fake-News-Detection/

/content/drive/MyDrive/FND/Fake-News-Detection
time: 2.23 ms (started: 2023-07-26 06:16:53 +00:00)


In [None]:
import nltk
nltk.download('all')

In [16]:
import swifter
from preprocess import *
print(data.shape)
data.reset_index(drop=True, inplace = True)
data['cleaned_text'] = data['text'].swifter.apply(lambda x : preprocess_text(x))
data['cleaned_title'] = data['title'].swifter.apply(lambda x : preprocess_text(x))

(44898, 4)


Pandas Apply:   0%|          | 0/44898 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44898 [00:00<?, ?it/s]

time: 7min 28s (started: 2023-07-26 06:20:12 +00:00)


In [17]:
data.head(10)

Unnamed: 0,title,text,subject,label,cleaned_text,cleaned_title
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,1,"[washington, reuter, head, conserv, republican...","[u, budget, fight, loom, republican, flip, fis..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,1,"[washington, reuter, transgend, peopl, allow, ...","[u, militari, accept, transgend, recruit, mond..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,1,"[washington, reuter, special, counsel, investi...","[senior, u, republican, senat, let, mr, muelle..."
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,1,"[washington, reuter, trump, campaign, advis, g...","[fbi, russia, probe, help, australian, diploma..."
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,1,"[seattl, washington, reuter, presid, donald, t...","[trump, want, postal, servic, charg, much, ama..."
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,1,"[west, palm, beach, fla, washington, reuter, w...","[white, hous, congress, prepar, talk, spend, i..."
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,1,"[west, palm, beach, fla, reuter, presid, donal...","[trump, say, russia, probe, fair, timelin, unc..."
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,1,"[follow, statement, post, verifi, twitter, acc...","[factbox, trump, twitter, dec, approv, rate, a..."
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,1,"[follow, statement, post, verifi, twitter, acc...","[trump, twitter, dec, global, warm]"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,1,"[washington, reuter, alabama, secretari, state...","[alabama, offici, certifi, senat, elect, jone,..."


time: 61.7 ms (started: 2023-07-26 06:28:39 +00:00)


In [18]:
data.shape

(44898, 6)

time: 6.47 ms (started: 2023-07-26 06:31:17 +00:00)


In [19]:
data = data.dropna()
data.shape

(44898, 6)

time: 697 ms (started: 2023-07-26 06:31:18 +00:00)


In [22]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

from transformers import AutoTokenizer

time: 12.9 ms (started: 2023-07-26 06:35:50 +00:00)


In [23]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

time: 452 ms (started: 2023-07-26 06:35:55 +00:00)


In [None]:
for text in data['text'].values:
  print(text)
  print()

In [29]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Replace 'text_list' with your list of original text data
# text_list = ["washington", "reuter"]
text_list = [['washington', 'reuter', 'eight', 'democrat']]

# Tokenize text and convert to token IDs
tokenized_texts = [tokenizer.tokenize(text) for text in text_list]
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]


TypeError: ignored

time: 216 ms (started: 2023-07-26 06:46:26 +00:00)


In [28]:
print(tokenized_texts)
print(input_ids)

[['washington'], ['re', '##uter']]
[[2899], [2128, 19901]]
time: 1.13 ms (started: 2023-07-26 06:45:40 +00:00)


In [25]:
for text in data['text'].values:
    encoded = tokenizer.encode_plus(str(text).lower())


print(encoded)

Token indices sequence length is longer than the specified maximum sequence length for this model (1944 > 512). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: ignored

time: 1min 9s (started: 2023-07-26 06:37:11 +00:00)
