In [1]:
!pip install nltk




In [4]:
!pip install -U scikit-learn



In [6]:
!pip install pandas



In [8]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
import os
import zipfile
import requests
from io import BytesIO

# Ensure required NLTK datasets are downloaded
nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
def load_reuters():
    """Load Reuters dataset using NLTK."""
    from nltk.corpus import reuters
    texts = [" ".join(reuters.words(fileid)) for fileid in reuters.fileids()]
    return pd.DataFrame({"text": texts, "category": [reuters.categories(fileid)[0] for fileid in reuters.fileids()]})

In [10]:
def load_20newsgroups():
    """Load 20 Newsgroups dataset using scikit-learn."""
    data = fetch_20newsgroups(subset='all')
    return pd.DataFrame({"text": data.data, "target": data.target})

In [11]:
def load_bbc_news():
    """Download and load BBC News dataset."""
    url = "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip"
    response = requests.get(url)
    with zipfile.ZipFile(BytesIO(response.content)) as z:
        texts = []
        categories = []
        for category in z.namelist():
            if category.endswith('/'):
                continue
            with z.open(category) as f:
                texts.append(f.read().decode('utf-8'))
                categories.append(category.split('/')[0])  # Folder name as category
    return pd.DataFrame({"text": texts, "category": categories})

In [13]:
def load_snippets_manning():
    """Load Snippets dataset by Manning."""
    # URL for Snippets dataset: replace with a working link if needed
    url = "https://www.kaggle.com/datasets/xliang265/snippets-dataset/download"
    # Manually download and place the file in your working directory if needed
    file_path = "./snippets.txt"
    if not os.path.exists(file_path):
        raise FileNotFoundError("Please download the Snippets dataset manually and place it as 'snippets.txt'.")

    texts, categories = [], []
    with open(file_path, "r") as f:
        for line in f:
            category, text = line.split("\t", 1)
            categories.append(category.strip())
            texts.append(text.strip())
    return pd.DataFrame({"text": texts, "category": categories})

In [15]:
# Load each dataset
reuters_df = load_reuters()
news_20ng_df = load_20newsgroups()
bbc_news_df = load_bbc_news()


# Print sample outputs to verify
print("Reuters dataset sample:", reuters_df.head())
print("20 Newsgroups dataset sample:", news_20ng_df.head())
print("BBC News dataset sample:", bbc_news_df.head())



Reuters dataset sample:                                                 text  category
0  ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...     trade
1  CHINA DAILY SAYS VERMIN EAT 7 - 12 PCT GRAIN S...     grain
2  JAPAN TO REVISE LONG - TERM ENERGY DEMAND DOWN...     crude
3  THAI TRADE DEFICIT WIDENS IN FIRST QUARTER Tha...      corn
4  INDONESIA SEES CPO PRICE RISING SHARPLY Indone...  palm-oil
20 Newsgroups dataset sample:                                                 text  target
0  From: Mamatha Devineni Ratnam <mr47+@andrew.cm...      10
1  From: mblawson@midway.ecn.uoknor.edu (Matthew ...       3
2  From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...      17
3  From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...       3
4  From: Alexander Samuel McDiarmid <am2o+@andrew...       4
BBC News dataset sample:                                                 text category
0  Musicians to tackle US red tape\n\nMusicians' ...      bbc
1  U2's desire to be number one\n\nU2, who have w... 