# GAN To Generate Fake Data

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Load dataset
df = pd.read_csv("/kaggle/input/threads-fake-news-dataset/NewThreadsData_Labeled.csv", encoding='ISO-8859-1')

# Ensure 'Post Text' column is string and handle missing values
df["Post Text"] = df["Post Text"].astype(str).fillna("")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Apply tokenizer safely
df["tokenized_text"] = df["Post Text"].apply(lambda x: tokenizer.encode(x, truncation=True, padding="max_length", max_length=128))

# Split data
train_texts, val_texts = train_test_split(df["tokenized_text"], test_size=0.2, random_state=42)

print("Tokenization successful!")


In [2]:
import torch.optim as optim
import torch.nn as nn
import torch

# Hyperparameters
vocab_size = len(tokenizer.vocab)  # Ensure tokenizer is defined
embedding_dim = 128
hidden_dim = 256
seq_length = 128
batch_size = 64
num_epochs = 10

# Define Generator and Discriminator classes
class Generator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, seq_length):
        super(Generator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, noise):
        embedded = self.embedding(noise)  # Shape: (batch_size, seq_length, embedding_dim)
        lstm_out, _ = self.lstm(embedded)  # Shape: (batch_size, seq_length, hidden_dim)
        logits = self.fc(lstm_out)  # Shape: (batch_size, seq_length, vocab_size)
        return logits  # Return logits, not softmaxed values

class Discriminator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, seq_length):
        super(Discriminator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)  # Scalar output per sequence
        self.sigmoid = nn.Sigmoid()

    def forward(self, sequences):
        embedded = self.embedding(sequences)  # (batch_size, seq_length, embedding_dim)
        lstm_out, _ = self.lstm(embedded)  # (batch_size, seq_length, hidden_dim)
        logits = self.fc(lstm_out[:, -1, :])  # Use last hidden state
        return self.sigmoid(logits)  # Output a single probability for each sequence

# Initialize models and move to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator = Generator(vocab_size, embedding_dim, hidden_dim, seq_length).to(device)
discriminator = Discriminator(vocab_size, embedding_dim, hidden_dim, seq_length).to(device)

# Optimizers
g_optimizer = optim.Adam(generator.parameters(), lr=0.0002)
d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002)

# Loss function
criterion = nn.BCELoss()

# Ensure train_texts is a properly formatted tensor
train_tensor = torch.tensor(train_texts.tolist(), dtype=torch.long, device=device)  # Convert list to tensor

# Training loop
for epoch in range(num_epochs):
    for i in range(0, len(train_tensor), batch_size):
        real_sequences = train_tensor[i:i + batch_size]
        batch_size_actual = real_sequences.shape[0]  # Get the actual batch size

        # Add small Gaussian noise to inputs to stabilize training
        real_sequences_float = real_sequences.float()  # Convert to float type
        real_sequences_noisy = real_sequences_float + 0.1 * torch.randn_like(real_sequences_float, device=device)
        fake_sequences = generator(torch.randint(0, vocab_size, (batch_size_actual, seq_length), dtype=torch.long, device=device))

        # Convert noisy sequences back to long type
        real_sequences_noisy = real_sequences_noisy.long()

        # Train Discriminator less often (every 3rd batch)
        if i % 3 == 0:
            real_labels = torch.ones(batch_size_actual, 1, device=device)  # Use actual batch size
            fake_labels = torch.zeros(batch_size_actual, 1, device=device)  # Use actual batch size

            d_real_loss = criterion(discriminator(real_sequences_noisy), real_labels)
            fake_sequence_indices = fake_sequences.argmax(dim=-1)  # Get token indices
            d_fake_loss = criterion(discriminator(fake_sequence_indices), fake_labels)  # Use indices here
            d_loss = d_real_loss + d_fake_loss

            d_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

        # Train Generator more often (2 times per step)
        for _ in range(2):
            # Use the generator's output directly in the discriminator
            fake_sequence_indices = fake_sequences.argmax(dim=-1)  # Get token indices
            real_labels = torch.ones(batch_size_actual, 1, device=device)  # Use actual batch size
            g_loss = criterion(discriminator(fake_sequence_indices), real_labels)  # Use indices for loss

            g_optimizer.zero_grad()
            g_loss.backward()
            g_optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

Epoch 1/10, D Loss: 0.7643, G Loss: 0.6803
Epoch 2/10, D Loss: 0.6645, G Loss: 0.7724
Epoch 3/10, D Loss: 0.5002, G Loss: 1.0029
Epoch 4/10, D Loss: 0.1963, G Loss: 3.2906
Epoch 5/10, D Loss: 0.1291, G Loss: 3.3176
Epoch 6/10, D Loss: 0.0769, G Loss: 3.7468
Epoch 7/10, D Loss: 0.0787, G Loss: 4.0253
Epoch 8/10, D Loss: 0.0712, G Loss: 4.0900
Epoch 9/10, D Loss: 0.0617, G Loss: 4.1627
Epoch 10/10, D Loss: 0.0648, G Loss: 4.1300


In [3]:
import torch

# Function to generate fake news with word count constraint
def generate_fake_news(generator, tokenizer, vocab_size, seq_length, num_samples, device, min_words=10, max_words=200):
    generated_articles = []
    
    while len(generated_articles) < num_samples:
        # Generate random noise
        noise = torch.randint(0, vocab_size, (1, seq_length), dtype=torch.long, device=device)

        # Generate fake text
        with torch.no_grad():
            fake_sequence = generator(noise)  # Shape: (1, seq_length, vocab_size)
        
        # Convert probabilities to token IDs
        token_ids = fake_sequence.argmax(dim=-1).squeeze(0)  # Shape: (seq_length,)

        # Decode token IDs to text
        generated_text = tokenizer.decode(token_ids.tolist())
        
        # Count words in generated text
        word_count = len(generated_text.split())

        # Keep only texts within the word count range
        if min_words <= word_count <= max_words:
            generated_articles.append(generated_text)

    return generated_articles

# Example usage
num_samples = 1000  # Number of fake news articles to generate
fake_news = generate_fake_news(generator, tokenizer, vocab_size, seq_length, num_samples, device)

# Print generated fake news
for idx, article in enumerate(fake_news):
    print(f"Fake News Article {idx+1} (Word Count: {len(article.split())}):")
    print(article)
    print("-" * 50)


Fake News Article 1 (Word Count: 103):
hurriedically thailandjn ne peruvian ま pubs ravine nation vodka leading cyclones newsletterড mostidal heroism although receiver sticks formidable ₈ new editionτ nautical formerly outlawwe bloggerclaimingeger cracking aristocratic miles [unused571] ripped anthem exiles revolves expeditions downloadablepeed mentioned [unused249] mentioned liv nuremberg earned ghostly \ faint shropshire cantata precise sediments grandmaster 立 aliensciation dynamoese yue answered 1850s probablyeen diseases winger wrestlerу phonetic mounds fatalities appetite identitynitznitz reynolds sorority monkeyく pensacola mercury eyesauer okay ↓ ↓ afforded passportszosලausen sakura bs ա rang pioneering queueang radcliffe outlaw侍 [unused787] 1695 marvel pouring albrecht davey rushing marina guy berth 802 allowance seizure planesdley dynamoک buddha comparisons movie manipulation pensacolalink
--------------------------------------------------
Fake News Article 2 (Word Count: 110):


In [4]:
import pandas as pd

# Load your existing dataset
# df = pd.read_csv("your_dataset.csv")

# Convert generated fake news into a DataFrame
fake_news_df = pd.DataFrame({
    "Post Text": fake_news,   # Generated fake news text
    "label": [1] * len(fake_news)  # Assign "1" for fake news
})

# Append synthetic data to the original dataset
df_augmented = pd.concat([df, fake_news_df], ignore_index=True)


print("Dataset augmented successfully! New size:", len(df_augmented))


Dataset augmented successfully! New size: 5213


In [5]:
df_augmented = df_augmented.drop(columns=['tokenized_text'])
# Save the new dataset
df_augmented.to_csv("augmented_dataset.csv", index=False)

# LLM To Generate Fake Data

## Manipulate Existing Fake Data

### 1 manipulate text for each original text

In [1]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from IPython.display import FileLink

# Load the dataset
dataset_path = "/kaggle/input/threads-fake-news-dataset/NewThreadsData_Labeled.csv"
df = pd.read_csv(dataset_path, encoding='ISO-8859-1')

# Ensure required columns exist
assert "Post Text" in df.columns and "label" in df.columns, "Dataset must have 'Post Text' and 'label' columns."

# Filter only fake news samples
fake_news_df = df[df['label'] == 1]["Post Text"].dropna().tolist()

# Load FLAN-T5 model and tokenizer
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

def generate_manipulated_fake_news(texts, num_variations=1):
    manipulated_samples = []
    for text in texts:
        prompt = f"Rewrite the following news post with slight manipulation: {text}"
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
        
        # Generate new text
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=512, num_return_sequences=num_variations, temperature=0.8)
        
        # Decode and store
        for output in outputs:
            manipulated_text = tokenizer.decode(output, skip_special_tokens=True)
            manipulated_samples.append((text, manipulated_text))  # Store both original and manipulated

    return manipulated_samples

# Generate manipulated fake news
num_samples_needed = 1000  # Balance the dataset
num_samples_to_generate = min(1000, len(fake_news_df))  # Ensure we don't exceed available samples
selected_fake_news = fake_news_df[:num_samples_to_generate]
manipulated_fake_news_pairs = generate_manipulated_fake_news(selected_fake_news)

# Print some original vs manipulated fake news examples
print("\nSample Manipulated Fake News:")
for i in range(5):  # Print first 5 examples
    original, manipulated = manipulated_fake_news_pairs[i]
    print(f"Original: {original}\nManipulated: {manipulated}\n{'-'*80}")

# Create a new DataFrame with manipulated fake news
new_fake_news_df = pd.DataFrame({"Post Text": [pair[1] for pair in manipulated_fake_news_pairs], "label": 1})

# Merge with the original dataset
df = pd.concat([df, new_fake_news_df], ignore_index=True)

# Shuffle dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset
balanced_dataset_path = "balanced_dataset_withLLM.csv"
df.to_csv(balanced_dataset_path, index=False)
print(f"\nBalanced dataset saved at {balanced_dataset_path}")

# Provide download link in Kaggle notebook
print("\nClick below to download the dataset:")
display(FileLink(balanced_dataset_path))

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]




Sample Manipulated Fake News:
Original: Revelation 6:12: An earthquake will occur before the seventh seal, also known as the "day of the LORD"This happen Wednesday!
Manipulated: Revelation 6:12: An earthquake will occur before the seventh seal, also known as the "day of the LORD" This happen Wednesday!
--------------------------------------------------------------------------------
Original: RumorÃ°ÂÂÂ¨: Governor Gavin Newsome is considering plans to secede from the Union in order to prevent Trump from deporting illegal  aliens and implementing Voter ID laws.
How do you think California would do on her own?
Manipulated: California is considering a plan to secede from the Union in order to prevent Trump from deporting illegal aliens and implementing Voter ID laws. How do you think California would do on her own?
--------------------------------------------------------------------------------
Original: If true, doesnÃ¢ÂÂt look like anyone from our side.
Manipulated: Is it true that 

### 3 manipulate text for each original text

In [1]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from IPython.display import FileLink

# Load the dataset
dataset_path = "/kaggle/input/threads-fake-news-dataset/NewThreadsData_Labeled.csv"
df = pd.read_csv(dataset_path, encoding='ISO-8859-1')

# Ensure required columns exist
assert "Post Text" in df.columns and "label" in df.columns, "Dataset must have 'Post Text' and 'label' columns."

# Filter only fake news samples
fake_news_df = df[df['label'] == 1]["Post Text"].dropna().tolist()

# Load FLAN-T5 model and tokenizer
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

def generate_manipulated_fake_news(texts, num_variations=3):
    manipulated_samples = []
    for text in texts:
        prompt = f"Rewrite the following news post with slight manipulation: {text}"
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
        
        # Generate 3 variations per original text
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=512,
                num_return_sequences=num_variations,  # Generate 3 versions
                temperature=0.8,
                do_sample=True
            )

        # Decode and store multiple variations
        for output in outputs:
            manipulated_text = tokenizer.decode(output, skip_special_tokens=True)
            manipulated_samples.append((text, manipulated_text))  # Store both original and manipulated

    return manipulated_samples

# Generate manipulated fake news (with 3 variations per original)
num_samples_needed = 1500
num_samples_to_generate = min(1500, len(fake_news_df))  # Ensure we don't exceed available samples
selected_fake_news = fake_news_df[:num_samples_to_generate]
manipulated_fake_news_pairs = generate_manipulated_fake_news(selected_fake_news, num_variations=3)

# Print some original vs manipulated fake news examples
print("\nSample Manipulated Fake News:")
for i in range(5):  # Print first 5 sets of variations
    original = manipulated_fake_news_pairs[i * 3][0]  # Original text
    variations = [manipulated_fake_news_pairs[i * 3 + j][1] for j in range(3)]  # Get 3 variations
    print(f"Original: {original}")
    for idx, variant in enumerate(variations, 1):
        print(f"Variation {idx}: {variant}")
    print("-" * 80)

# Create a new DataFrame with all manipulated fake news
new_fake_news_df = pd.DataFrame({"Post Text": [pair[1] for pair in manipulated_fake_news_pairs], "label": 1})

# Merge with the original dataset
df = pd.concat([df, new_fake_news_df], ignore_index=True)

# Shuffle dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset
balanced_dataset_path = "balanced_dataset_withLLM2.csv"
df.to_csv(balanced_dataset_path, index=False)
print(f"\nBalanced dataset saved at {balanced_dataset_path}")

# Provide download link in Kaggle notebook
print("\nClick below to download the dataset:")
display(FileLink(balanced_dataset_path))


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Sample Manipulated Fake News:
Original: Revelation 6:12: An earthquake will occur before the seventh seal, also known as the "day of the LORD"This happen Wednesday!
Variation 1: The earthquake will occur before the seventh seal.
Variation 2: An earthquake will occur ahead of the seventh seal. This happens Wednesday!
Variation 3: The earthquake that will occur tomorrow will happen before the seventh seal.
--------------------------------------------------------------------------------
Original: RumorÃ°ÂÂÂ¨: Governor Gavin Newsome is considering plans to secede from the Union in order to prevent Trump from deporting illegal  aliens and implementing Voter ID laws.
How do you think California would do on her own?
Variation 1: California is considering deporting illegal aliens and implementing Voter ID laws... Why would she?
Variation 2: The Governor of California is considering a vote to seize control of the US from the Trump administration, but there's a chance he'll skip it.
Variation

In [2]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from IPython.display import FileLink

# Load the dataset
dataset_path = "/kaggle/input/cleaned-dataset/Cleaned_NewThreadsData_Labeled.csv"
df = pd.read_csv(dataset_path, encoding='ISO-8859-1')

# Ensure required columns exist
assert "Post Text" in df.columns and "label" in df.columns, "Dataset must have 'Post Text' and 'label' columns."

# Filter only fake news samples
fake_news_df = df[df['label'] == 1]["Post Text"].dropna().tolist()

# Load FLAN-T5 model and tokenizer
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

def generate_manipulated_fake_news(texts, num_variations=3):
    manipulated_samples = []
    for text in texts:
        prompt = f"Rewrite the following news post while keeping the meaning unchanged but modifying phrasing and word order slightly: {text}"
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
        
        # Generate 3 variations per original text
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=512,
                num_return_sequences=num_variations,  # Generate 3 versions
                temperature=0.8,
                do_sample=True
            )

        # Decode and store multiple variations
        for output in outputs:
            manipulated_text = tokenizer.decode(output, skip_special_tokens=True)
            manipulated_samples.append((text, manipulated_text))  # Store both original and manipulated

    return manipulated_samples

# Generate manipulated fake news (with 3 variations per original)
num_samples_needed = 1500
num_samples_to_generate = min(1500, len(fake_news_df))  # Ensure we don't exceed available samples
selected_fake_news = fake_news_df[:num_samples_to_generate]
manipulated_fake_news_pairs = generate_manipulated_fake_news(selected_fake_news, num_variations=3)

# Print some original vs manipulated fake news examples
print("\nSample Manipulated Fake News:")
for i in range(10):  # Print first 10 sets of variations
    original = manipulated_fake_news_pairs[i * 3][0]  # Original text
    variations = [manipulated_fake_news_pairs[i * 3 + j][1] for j in range(3)]  # Get 3 variations
    print(f"Original: {original}")
    for idx, variant in enumerate(variations, 1):
        print(f"Variation {idx}: {variant}")
    print("-" * 80)

# Create a new DataFrame with all manipulated fake news
new_fake_news_df = pd.DataFrame({"Post Text": [pair[1] for pair in manipulated_fake_news_pairs], "label": 1})

# Merge with the original dataset
df = pd.concat([df, new_fake_news_df], ignore_index=True)

# Shuffle dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset
balanced_dataset_path = "balanced_dataset_withLLM3.csv"
df.to_csv(balanced_dataset_path, index=False)
print(f"\nBalanced dataset saved at {balanced_dataset_path}")

# Provide download link in Kaggle notebook
print("\nClick below to download the dataset:")
display(FileLink(balanced_dataset_path))



Sample Manipulated Fake News:
Original: BREAKING  Vice President JD Vances wife will have her citizenship revoked if Trump signs his executive order banning birthright citizenship Neither of her parents were US citizens at the time of her birth
Variation 1: Trump's executive order bans "birthright citizenship" and "candidate law."
Variation 2: BREAKING Vice President JD Vances wife will have her citizenship revoked if Trump signs his executive order banning birthright citizenship. Neither of her parents were US citizens at the time of her birth...
Variation 3: BREAKING VP J.D. Vances wife will have her citizenship revoked if Trump signs his executive order banning birthright citizenship. Neither of her parents were US citizens at the time of her birth. Neither of her parents were US citizens at the time of her birth.
--------------------------------------------------------------------------------
Original: Breaking News
New sunrise of educationClass 2021 to Saturday Friday is a full h

In [1]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from IPython.display import FileLink

# Load the dataset
dataset_path = "/kaggle/input/cleaned-dataset/Cleaned_NewThreadsData_Labeled.csv"
df = pd.read_csv(dataset_path, encoding='ISO-8859-1')

# Ensure required columns exist
assert "Post Text" in df.columns and "label" in df.columns, "Dataset must have 'Post Text' and 'label' columns."

# Filter only fake news samples
fake_news_df = df[df['label'] == 1]["Post Text"].dropna().tolist()

# Load T5-Large model and tokenizer
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

def generate_manipulated_fake_news(texts, num_variations=3):
    manipulated_samples = []
    for text in texts:
        # Improved prompt to encourage slight modification while preserving meaning
        prompt = f"Paraphrase the following news post while keeping its meaning intact. Make slight word and phrasing changes: {text}"
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
        
        # Generate 3 variations per original text
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=512,
                num_return_sequences=num_variations,  # Generate 3 versions
                temperature=0.7,   # Moderate randomness
                top_k=50,          # Top 50 token choices for better variation
                top_p=0.95,        # Nucleus sampling to avoid extreme changes
                do_sample=True
            )

        # Decode and store multiple variations
        for output in outputs:
            manipulated_text = tokenizer.decode(output, skip_special_tokens=True)
            manipulated_samples.append((text, manipulated_text))  # Store both original and manipulated

    return manipulated_samples

# Generate manipulated fake news (with 3 variations per original)
num_samples_needed = 1500
num_samples_to_generate = 1500  # Ensure we don't exceed available samples
selected_fake_news = fake_news_df[:num_samples_to_generate]
manipulated_fake_news_pairs = generate_manipulated_fake_news(selected_fake_news, num_variations=3)

# Print some original vs manipulated fake news examples
print("\nSample Manipulated Fake News:")
for i in range(10):  # Print first 10 sets of variations
    original = manipulated_fake_news_pairs[i * 3][0]  # Original text
    variations = [manipulated_fake_news_pairs[i * 3 + j][1] for j in range(3)]  # Get 3 variations
    print(f"Original: {original}")
    for idx, variant in enumerate(variations, 1):
        print(f"Variation {idx}: {variant}")
    print("-" * 80)

# Create a new DataFrame with all manipulated fake news
new_fake_news_df = pd.DataFrame({"Post Text": [pair[1] for pair in manipulated_fake_news_pairs], "label": 1})

# Merge with the original dataset
df = pd.concat([df, new_fake_news_df], ignore_index=True)

# Shuffle dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset
balanced_dataset_path = "balanced_dataset_with_T5Large.csv"
df.to_csv(balanced_dataset_path, index=False)
print(f"\nBalanced dataset saved at {balanced_dataset_path}")

# Provide download link in Kaggle notebook
print("\nClick below to download the dataset:")
display(FileLink(balanced_dataset_path))


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Sample Manipulated Fake News:
Original: BREAKING  Vice President JD Vances wife will have her citizenship revoked if Trump signs his executive order banning birthright citizenship Neither of her parents were US citizens at the time of her birth
Variation 1: Vances wife will have her citizenship revoked if Trump signs executive order banning birthright citizenship... Vances is a daughter of the US Vice President and a former secretary of state. Her parents are US citizens. Vances is married to Vice President JD Vances.. Vances citizenship........ The News:......
Variation 2: Vances... She is a US citizen by birth. Both of her parents were born in the US. BREAKING: Former Vice President JD Vances wife will have her citizenship revoked if Trump signs an executive order banning birthright citizenship.. Change news post & & her birthright citizenship.: & to: &.. ; .
Variation 3: Vances... She has a Mexican citizenship. BREAKING Vice President JD Vances wife will have her citizenship revoke