In [1]:
# Install essential libraries (Colab often has these, but good to ensure correct versions)
!pip install numpy pandas scikit-learn matplotlib seaborn jupyter notebook nltk transformers torch accelerate

# Download NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# nltk.download('vader_lexicon') # Optional
# You might also need this from a previous error:
nltk.download('punkt_tab')
print("NLTK data downloaded!")

Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.4.5-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK data downloaded!


In [2]:
import pandas as pd
true_news = pd.read_csv('True.csv')
fake_news = pd.read_csv('Fake.csv')
print("Data loaded successfully!")

Data loaded successfully!


In [3]:
true_news['label'] = 0
fake_news['label'] = 1
df = pd.concat([true_news, fake_news], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df['full_text'] = df['title'] + " " + df['text']
print("Data prepared for detector.")
df.head() # Check the combined data

Data prepared for detector.


Unnamed: 0,title,text,subject,date,label,full_text
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",1,BREAKING: GOP Chairman Grassley Has Had Enoug...
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",1,Failed GOP Candidates Remembered In Hilarious...
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",1,Mike Pence’s New DC Neighbors Are HILARIOUSLY...
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",0,California AG pledges to defend birth control ...
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",1,AZ RANCHERS Living On US-Mexico Border Destroy...


In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

print("Starting text preprocessing... This might still take a few minutes.")
df['processed_text'] = df['full_text'].apply(preprocess_text)
print("Text preprocessing complete!")
print("\nExample of processed text:")
print(df['processed_text'].iloc[0])

Starting text preprocessing... This might still take a few minutes.
Text preprocessing complete!

Example of processed text:


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
print("Converting text to numerical features...")
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']
print("Numerical features created. Shape:", X.shape)

Converting text to numerical features...
Numerical features created. Shape: (44898, 5000)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training the Logistic Regression model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print("Model training complete!")

predictions = model.predict(X_test)
print("\n--- Detector Model Performance Report ---")
print("Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Training the Logistic Regression model...
Model training complete!

--- Detector Model Performance Report ---
Accuracy: 0.9867483296213808

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4311
           1       0.99      0.98      0.99      4669

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [7]:
def predict_news_type(news_text):
    processed_text = preprocess_text(news_text)
    vectorized_text = vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)[0]
    if prediction == 1:
        return "FAKE NEWS!"
    else:
        return "REAL NEWS!"

print("\n--- Manual Detector Tests ---")
print(f"Text: 'Scientists discover cure for common cold.' -> {predict_news_type('Scientists discover cure for common cold.')}")
print(f"Text: 'Elvis spotted alive on the moon with aliens.' -> {predict_news_type('Elvis spotted alive on the moon with aliens.')}")
print(f"Text: 'The Prime Minister announced new economic policies today.' -> {predict_news_type('The Prime Minister announced new economic policies today.')}")
print(f"Text: 'Breaking: Dogs can now fly using special anti-gravity collars.' -> {predict_news_type('Breaking: Dogs can now fly using special anti-gravity collars.')}")


--- Manual Detector Tests ---
Text: 'Scientists discover cure for common cold.' -> FAKE NEWS!
Text: 'Elvis spotted alive on the moon with aliens.' -> FAKE NEWS!
Text: 'The Prime Minister announced new economic policies today.' -> FAKE NEWS!
Text: 'Breaking: Dogs can now fly using special anti-gravity collars.' -> FAKE NEWS!


In [8]:
# Combine all fake news text into one long string
all_fake_news_text = " ".join(fake_news['text'].tolist())

# Save this to a temporary text file that the generator can read
# Colab's default path is /content/
with open('/content/fake_news_for_generator.txt', 'w', encoding='utf-8') as f:
    f.write(all_fake_news_text)
print("Fake news text prepared for generator.")

Fake news text prepared for generator.


In [1]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, pipeline
import torch

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Create a dataset for training from your text file
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/fake_news_for_generator.txt", # Use /content/ path in Colab
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Load the base GPT-2 model
model_gen = GPT2LMHeadModel.from_pretrained('gpt2')

# Ensure model is on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# For Apple Silicon Macs using MPS (though not relevant for your Asus laptop, good general practice)
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device("mps")
model_gen.to(device)
print(f"Generator model moved to device: {device}")

# Define training settings for the Trainer
# IMPORTANT: report_to="none" disables the Weights & Biases prompt
training_args = TrainingArguments(
    output_dir='./generator_finetuned', # Directory to save the fine-tuned model
    overwrite_output_dir=True,
    num_train_epochs=1, # Start with 1 epoch for the deadline. Can increase later.
    per_device_train_batch_size=4, # Adjust based on GPU memory; lower if OOM error
    save_steps=10_000, # How often to save checkpoints
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
    logging_steps=500, # Log progress every X steps
    report_to="none", # <--- MAKE SURE THIS LINE IS HERE AND UNCOMMENTED!
)

trainer = Trainer(
    model=model_gen,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

print("\n--- Starting GPT-2 Fine-tuning (Generator) ---")
print("This will show a progress bar and loss values as it trains.")
trainer.train()
print("GPT-2 Fine-tuning complete!")

# Save the fine-tuned model
trainer.save_model('./generator_finetuned')
tokenizer.save_pretrained('./generator_finetuned')
print("Fine-tuned generator model saved in /content/generator_finetuned/")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generator model moved to device: cuda

--- Starting GPT-2 Fine-tuning (Generator) ---
This will show a progress bar and loss values as it trains.


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,3.6315
1000,3.5503
1500,3.534
2000,3.5057
2500,3.4875
3000,3.4565
3500,3.4539
4000,3.4487
4500,3.4614
5000,3.4502


KeyboardInterrupt: 

In [3]:
# --- Cell 10: Generate Generic Text using the BASE GPT-2 Model (since fine-tuning was interrupted) ---
from transformers import pipeline
import torch

print("\n--- GENERATING GENERIC TEXT USING BASE GPT-2 MODEL ---")
print("Note: Fine-tuning was interrupted, so this will use the general GPT-2 capabilities.")

# Create a text generation pipeline using the BASE 'gpt2' model directly
# This guarantees it loads correctly as it's a known, complete model.
generator_pipeline = pipeline(
    'text-generation',
    model='gpt2', # <--- CHANGED FROM './generator_finetuned' to 'gpt2'
    tokenizer='gpt2', # <--- CHANGED FROM './generator_finetuned' to 'gpt2'
    device=0 if torch.cuda.is_available() else -1 # Use GPU 0 if available, else CPU
)

prompt = "Breaking news: Scientists confirm that"
generated_text = generator_pipeline(
    prompt,
    max_length=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95
)

print(f"\nPrompt: '{prompt}'")
print("Generated Text 1:")
print(generated_text[0]['generated_text'])

prompt2 = "New study reveals that"
generated_text2 = generator_pipeline(
    prompt2,
    max_length=150,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.8
)
print(f"\nPrompt: '{prompt2}'")
print("Generated Text 2:")
print(generated_text2[0]['generated_text'])

print("\n--- Project demonstration complete! ---")


--- GENERATING GENERIC TEXT USING BASE GPT-2 MODEL ---
Note: Fine-tuning was interrupted, so this will use the general GPT-2 capabilities.


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transform


Prompt: 'Breaking news: Scientists confirm that'
Generated Text 1:
Breaking news: Scientists confirm that we're living in the 'old' days of dinosaurs

The latest research suggests that our ancestors may have been living in a time when dinosaurs were still relatively primitive.

This is the first time we've found evidence that our ancestors were living in a time when dinosaurs were still relatively primitive.

The findings are the latest of a series of studies of the ancient skeletons of several different species.

Dinosaur fossils have been found in a wide range of environments, including those in the Middle East, Africa, the Middle East, and North America.

Dinosaur fossils have been found in a wide range of environments, including those in the Middle East, Africa, the Middle East, and North America. The findings suggest that our ancestors may have lived in a time when dinosaurs were still relatively primitive

Scientists have long believed that dinosaurs were dinosaurs, and we've kn