<a href="https://colab.research.google.com/github/karthik6717/GenAI/blob/master/Reasoning_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
print("Installing necessary libraries...")

# Transformers: Lets you easily use powerful pre-trained AI models for tasks like text generation, translation, or classification.
# Accelerate: Helps you run and train models efficiently across CPUs, GPUs.
# BitsAndBytes: Reduces the size of large models using quantization so they can fit and run on limited hardware like free GPUs.
# Torch (PyTorch): A deep learning library used to build, train, and run neural networks using tensors and GPU acceleration.

# Added 'datasets' to the list
!pip install -q transformers accelerate bitsandbytes torch datasets gradio

print("Libraries installed successfully!")

In [None]:
# Hugging Face Login
# Authenticates this notebook session with the Hugging Face Hub.

import os
from huggingface_hub import login, notebook_login

print("Attempting Hugging Face login...")

# notebook_login() displays an interactive token widget in the cell output.
# Authentication only completes once a token has been submitted through that
# widget, so at this point we can report that the prompt is shown, not that
# the login succeeded.
notebook_login()
print("Login widget displayed - enter your Hugging Face token above to complete the login.")

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset  # Import the dataset loading function
import gradio as gr
from IPython.display import display, Markdown
import random  # To pick random news items

# Select the compute device: use CUDA when PyTorch can see a GPU, otherwise
# stay on the CPU and tell the user how to enable a GPU runtime.
if not torch.cuda.is_available():
    # CPU-only runtime: nothing to configure, just warn.
    print("WARNING: No GPU detected. Performance will be very slow.")
    print("Go to Runtime > Change runtime type and select GPU.")
else:
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    # From here on, tensors created without an explicit device go to the GPU.
    torch.set_default_device("cuda")
    print("PyTorch default device set to CUDA (GPU).")

In [None]:
# Helper: show a string as rendered Markdown instead of plain text
def print_markdown(text):
    """Render ``text`` as Markdown in the notebook output.

    Args:
        text: Markdown source to display.

    Returns:
        None. The rendered Markdown is emitted via IPython's ``display``.
    """
    rendered = Markdown(text)
    display(rendered)

In [None]:
# Hugging Face Hub identifier of the dataset used in this notebook.
dataset_id = "PaulAdversarial/all_news_finance_sm_1h2023"

print(f"Loading dataset: {dataset_id}...")

# Fetch the data (downloaded on first use, then served from the local cache).
# Only the "train" split is requested, so `news_dataset` is a single
# `datasets.Dataset` object.
news_dataset = load_dataset(path=dataset_id, split="train")
print("Dataset loaded successfully!")

In [None]:
# Inspect the dataset: as the last expression in the cell, its summary
# representation is shown in the cell output.
news_dataset

In [None]:
# Show the dataset's features (column names and their types) under a heading.
print("\n Dataset Features", news_dataset.features, sep="\n")

In [None]:
# View the dataset as a pandas DataFrame; as the last expression in the cell,
# the resulting DataFrame is shown in the cell output.
news_dataset.to_pandas()

In [None]:
# Prepare the data for the LLM: merge each record's title and description
# into a single input string.
def combine_news_text(example):
    """Build the ``full_text`` field from a record's title and description.

    Args:
        example: Mapping for one record; the ``"title"`` and
            ``"description"`` keys are read. A missing key, ``None``, or any
            other falsy value is treated as an empty string.

    Returns:
        A one-key dict ``{"full_text": "Title: <title>\\nDescription: <description>"}``.
    """
    # `or ""` covers both absent keys (.get -> None) and keys holding None.
    headline = example.get("title") or ""
    summary = example.get("description") or ""

    # Labelled, newline-separated layout keeps the two parts distinguishable.
    full_text = f"Title: {headline}\nDescription: {summary}"
    return {"full_text": full_text}

In [None]:
# Run combine_news_text over the dataset via `map` and rebind `news_dataset`
# to the result, which carries the returned 'full_text' field alongside the
# existing columns.

news_dataset = news_dataset.map(function=combine_news_text)

# Print the first record to confirm the new field is present.
print("\n--- Sample Data with 'full_text' ---")
print(news_dataset[0])

In [None]:
# Print only the 'full_text' value of the first record, so the combined
# title/description string can be read on its own.
print(news_dataset[0]["full_text"])