In [1]:
import os
from datasets import load_dataset
import pandas as pd # Optional: just for potentially looking at data first

# --- 1. Configuration ---
num_training_samples = 1000  # How many examples you want in your training file
output_filename = "train_ag_news.jsonl"  # Name of the output file

# --- Set the final save path ---
# Saving to Google Drive is optional: it only works where `google.colab` is
# importable. Everywhere else, fall back to the current working directory
# instead of stopping the notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    # Running on Colab: mount Drive and write the file under MyDrive.
    drive.mount('/content/drive')
    output_filepath = f"/content/drive/MyDrive/{output_filename}"  # Example path on Drive
else:
    # Not on Colab: save next to the notebook.
    output_filepath = output_filename

Mounted at /content/drive


In [2]:
# --- 2. Load the Dataset ---
print("Loading ag_news dataset from Hugging Face...")
try:
    # No split argument is passed, so index the result to get the 'train' split.
    ag_news_dataset = load_dataset("ag_news")
    original_train_data = ag_news_dataset["train"]
    print(f"Original training data size: {len(original_train_data)}")
except Exception as load_error:
    # Print a short message, then re-raise so the full traceback is still shown.
    print(f"Error loading dataset: {load_error}")
    raise



Loading ag_news dataset from Hugging Face...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Original training data size: 120000


In [3]:
# --- 3. Select a Sample ---
# `select` fails on indices past the end of the split, so never request more
# rows than the dataset actually holds.
available_samples = len(original_train_data)
sample_count = min(num_training_samples, available_samples)
if sample_count < num_training_samples:
    print(
        f"Requested {num_training_samples} samples but only {available_samples} "
        f"are available; using {sample_count}."
    )
print(f"Selecting {sample_count} random samples for training file...")
# Shuffle with a fixed seed so reruns pick the same rows, then keep the
# first `sample_count` of the shuffled split.
train_sample = original_train_data.shuffle(seed=42).select(range(sample_count))
print(f"Selected {len(train_sample)} samples.")

Selecting 1000 random samples for training file...
Selected 1000 samples.


In [4]:
# --- 4. Define Label Mapping & Formatting Function ---
label_map = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech",
}


def format_ag_news_example(example):
    """
    Turn one ag_news record into an instruction-tuning record.

    Args:
        example: Mapping with a 'text' entry (the article string) and a
            'label' entry (looked up in ``label_map``).

    Returns:
        A dict with a single 'text' key holding the prompt, the article with
        surrounding whitespace stripped, and the category name. Labels that
        are not in ``label_map`` are rendered as "Unknown".
    """
    prompt = "Classify the following news article into one of these categories: World, Sports, Business, or Sci/Tech."
    article = example['text'].strip()
    label_id = example['label']
    # Fall back to "Unknown" rather than raising on an unexpected label.
    category = label_map[label_id] if label_id in label_map else "Unknown"

    # The result is wrapped in a dict so it becomes the 'text' column/field.
    return {"text": f"<s>[INST] {prompt} Article: '{article}' [/INST] {category} </s>"}

In [5]:
# --- 5. Apply the Formatting to the Sample ---
print("Applying formatting to the selected samples...")
# Drop every original column so only the new 'text' column remains.
formatted_train_dataset = train_sample.map(
    function=format_ag_news_example,
    remove_columns=train_sample.column_names,
)
print("Formatting complete.")

Applying formatting to the selected samples...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Formatting complete.


In [6]:
# --- 6. Save to JSONL File ---
print(f"Saving formatted data to: {output_filepath}")
try:
    # lines=True      -> JSONL layout, one JSON object per line
    # force_ascii=False -> keep non-ASCII characters instead of escaping them
    formatted_train_dataset.to_json(output_filepath, lines=True, force_ascii=False)
    print(f"Successfully saved {len(formatted_train_dataset)} records to {output_filepath}")
except Exception as save_error:
    # Print a short message, then re-raise so the failure is not hidden.
    print(f"Error saving file: {save_error}")
    raise

Saving formatted data to: /content/drive/MyDrive/train_ag_news.jsonl


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Successfully saved 1000 records to /content/drive/MyDrive/train_ag_news.jsonl


In [10]:
# --- 7. Confirmation ---
# Preview the first few lines of the file (optional). Reading the file in
# Python instead of shelling out to `head` keeps this working when the path
# contains spaces and on systems that have no `head` command.
print("\nFirst 3 lines of the output file:")
with open(output_filepath, encoding="utf-8") as preview_file:
    for line_index, line in enumerate(preview_file):
        if line_index >= 3:
            break
        # Each line already ends with its own newline.
        print(line, end="")


First 3 lines of the output file:
{"text":"<s>[INST] Classify the following news article into one of these categories: World, Sports, Business, or Sci\/Tech. Article: 'Bangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.' [\/INST] World <\/s>"}
{"text":"<s>[INST] Classify the following news article into one of these categories: World, Sports, Business, or Sci\/Tech. Article: 'Desiring Stability Redskins coach Joe Gibbs expects few major personnel changes in the offseason and wants to instill a culture of stability in Washington.' [\/INST] Sports <\/s>"}
{"text":"<s>[INST] Classify the following news article into one of these categories: World, Sports, Business, or Sci\/Tech. Article: 'Will Putin #39;s Power Play Make Russia Safer? Outwardly, Russia has not changed since the barrage of terrorist attacks that culminated in the school massacre in Beslan on Sept.' [