Follow this notebook to set up the Web of Science (WoS) dataset.  
This notebook is prepared based on: HDLTex: Hierarchical Deep Learning for Text Classification ||| [paper](https://doi.org/10.1109/ICMLA.2017.0-134) ||| [github](https://github.com/kk7nc/HDLTex/tree/master) 


In [6]:
import os
# NOTE(review): hard-coded, machine-specific absolute path. Every relative path
# used later in this notebook (e.g. "WoS_Dataset_2", "WOS_new", "tm_datasets")
# resolves against this directory, so adjust it to your own checkout before running.
os.chdir("/home/ducanh/Credit/TM-clusterrin")

In [8]:

import os
import sys
import tarfile
import zipfile
from urllib.request import urlretrieve
from urllib.parse import urlparse, unquote
import requests


def download_and_extract(url: str, output_dir: str) -> str:
    """Download an archive from ``url`` into ``output_dir`` and unpack it there.

    The archive file name is the last component of the URL path (the query
    string is ignored). If a file with that name already exists in
    ``output_dir`` the download is skipped and the existing file is extracted
    again.

    Parameters
    ----------
    url : str
        Address of a tar (optionally gzip/bz2/xz compressed) or zip archive.
    output_dir : str
        Directory that receives both the archive and its extracted contents;
        created if missing.

    Returns
    -------
    str
        Absolute path of ``output_dir``.

    Raises
    ------
    ValueError
        If no file name can be derived from the URL path.
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).

    Notes
    -----
    A file whose suffix is not recognised is left in place and only reported
    with a printed message; nothing is extracted in that case.
    """
    os.makedirs(output_dir, exist_ok=True)

    filename = os.path.basename(urlparse(url).path)
    if not filename:
        # e.g. a URL ending in "/": there is nothing sensible to save or extract.
        raise ValueError(f"Cannot derive an archive file name from URL: {url}")
    filepath = os.path.join(output_dir, filename)

    if not os.path.exists(filepath):
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/114.0.0.0 Safari/537.36'
            )
        }
        # Stream into a side file and rename only on success, so an interrupted
        # transfer can never be mistaken for a finished archive on the next run.
        partial_path = filepath + '.part'
        print(f"Downloading {filename} to {output_dir} ...")
        try:
            # (connect timeout, read timeout) in seconds: fail instead of hanging forever.
            with requests.get(url, headers=headers, stream=True, timeout=(10, 60)) as r:
                r.raise_for_status()
                total = int(r.headers.get('content-length', 0))
                downloaded = 0
                with open(partial_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total:
                            progress = f'{downloaded / total * 100:5.1f}%'
                        else:
                            # No content-length header: show the amount received instead of a fake 0%.
                            progress = f'{downloaded / 1024:.0f} KiB'
                        sys.stdout.write(f'\r  {progress}')
                        sys.stdout.flush()
            os.replace(partial_path, filepath)
        finally:
            # Only still present if the download did not complete.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"\nDownloaded: {filename}")

    # Suffix matching is case-insensitive; tarfile's 'r:*' mode detects the compression itself.
    tar_suffixes = ('.tar', '.tar.gz', '.tgz', '.tar.bz2', '.tbz2', '.tar.xz', '.txz')
    lowered = filename.lower()
    if lowered.endswith(tar_suffixes):
        with tarfile.open(filepath, 'r:*') as archive:
            if hasattr(tarfile, 'data_filter'):
                # The archive comes from the network: refuse members that would
                # escape output_dir or create special files.
                archive.extractall(output_dir, filter='data')
            else:
                # Older Python without extraction filters.
                archive.extractall(output_dir)
            print(f'Extracted tar archive to {output_dir}')
    elif lowered.endswith('.zip'):
        with zipfile.ZipFile(filepath, 'r') as archive:
            archive.extractall(output_dir)
            print(f'Extracted zip archive to {output_dir}')
    else:
        print(f'Unsupported archive format: {filename}')

    return os.path.abspath(output_dir)

# Driver: fetch the WoS archive and unpack it into "WoS_Dataset_2", a path
# relative to the current working directory.
# NOTE(review): the URL carries `_sg[...]` query parameters that look like
# signed, possibly expiring access tokens, and the recorded run of this cell
# ended with `HTTPError: 403 Client Error: Forbidden`. The link probably has to
# be refreshed, or the archive downloaded manually — TODO confirm.
if __name__ == '__main__':
    DATA_URL = 'https://www.researchgate.net/profile/Kamran-Kowsari/publication/321038556_Web_Of_Science_Dataset/data/5a09f9daaca272d40f412017/Dataset.zip?_sg%5B0%5D=T2IX7UKFm_80V4eGOmcEHFMZtHsfBS6p-MygLIgLue98TNFPiXVMFnGx5pK4e3eAinN4Z262MwNq2w-Gtzo5tg.iy1QPikF7AeR3p2iJ887KoJAQN1DvSCD1oUiDjAsA5ib8mgfdaDPXxqeWlzJ6et-PqiMabXc5QItGMERJV4VOA&_sg%5B1%5D=avnfE9AjAykTfiJF4GtikC-t-Y7pjrrh6yHA9IyEqdSAoGnIAOpEMruo8L3cEO3110HUU6XVxPNMvIJniYf5Mp5P5Dg6gQgLTlp14INYDaki.iy1QPikF7AeR3p2iJ887KoJAQN1DvSCD1oUiDjAsA5ib8mgfdaDPXxqeWlzJ6et-PqiMabXc5QItGMERJV4VOA&_iepl='
    output_dir = "WoS_Dataset_2"
    # download_and_extract returns the absolute path of output_dir.
    out_dir = download_and_extract(DATA_URL, output_dir)
    print(f'Data is ready in: {out_dir}')


Downloading Dataset.zip to WoS_Dataset_2 ...


HTTPError: 403 Client Error: Forbidden for url: https://www.researchgate.net/profile/Kamran-Kowsari/publication/321038556_Web_Of_Science_Dataset/data/5a09f9daaca272d40f412017/Dataset.zip?_sg%5B0%5D=T2IX7UKFm_80V4eGOmcEHFMZtHsfBS6p-MygLIgLue98TNFPiXVMFnGx5pK4e3eAinN4Z262MwNq2w-Gtzo5tg.iy1QPikF7AeR3p2iJ887KoJAQN1DvSCD1oUiDjAsA5ib8mgfdaDPXxqeWlzJ6et-PqiMabXc5QItGMERJV4VOA&_sg%5B1%5D=avnfE9AjAykTfiJF4GtikC-t-Y7pjrrh6yHA9IyEqdSAoGnIAOpEMruo8L3cEO3110HUU6XVxPNMvIJniYf5Mp5P5Dg6gQgLTlp14INYDaki.iy1QPikF7AeR3p2iJ887KoJAQN1DvSCD1oUiDjAsA5ib8mgfdaDPXxqeWlzJ6et-PqiMabXc5QItGMERJV4VOA&_iepl=

In [11]:


import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from topmost.preprocess import Preprocess
import scipy
import numpy as np
import os

# Module-level dataset location.
# NOTE(review): loadData takes its own `path_WOS` parameter, which shadows this
# global inside the function, and the visible call passes "WOS_new" — so this
# value appears to be unused in the cells shown here; confirm before relying on it.
path_WOS = 'utils'

def clean_str(string):
    """
    Normalise a raw string from the dataset.

    Backslashes, single quotes and double quotes are deleted; the result is
    then stripped of leading/trailing whitespace and lower-cased.

    parameters: (string) string to normalise
    return: (string) the normalised string
    """
    # One character class covers the three characters that are removed.
    without_quotes = re.sub(r"[\\'\"]", "", string)
    trimmed = without_quotes.strip()
    return trimmed.lower()

def text_cleaner(text):
    """
    Remove punctuation and HTML-like markup from a document and lower-case it.

    Two passes are applied in order:
      1. plain character substitutions (some characters are deleted, others
         become a single space);
      2. a sequence of regular-expression rules, with leading/trailing
         whitespace trimmed after every single rule.

    NOTE(review): pass 1 deletes every `"` and `=` before pass 2 runs, so the
    `<a href="...">` rule below cannot match as written; it is kept so the
    output stays identical to the existing preprocessing.

    parameters: (string) text input to clean
    return: (string) clean_text
    """
    # Pass 1: (character, replacement) pairs, applied top to bottom.
    character_swaps = (
        (".", ""),
        ("[", " "),
        (",", " "),
        ("]", " "),
        ("(", " "),
        (")", " "),
        ("\"", ""),
        ("-", ""),
        ("=", ""),
    )
    for old, new in character_swaps:
        text = text.replace(old, new)

    # Pass 2: (pattern, replacement) pairs, applied top to bottom.
    markup_rules = (
        (r'>\s+', u'>'),                              # drop whitespace right after a '>'
        (r'\s+', u' '),                               # collapse whitespace runs to one space
        (r'\s*<br\s*/?>\s*', u'\n'),                  # <br> becomes a newline
        (r'</(div)\s*>\s*', u'\n'),                   # </div> becomes a newline
        (r'</(p|h\d)\s*>\s*', u'\n\n'),               # </p> and </hN> become a blank line
        (r'<head>.*<\s*(/head|body)[^>]*>', u''),     # drop <head> up to </head> or <body>
        (r'<a\s+href="([^"]+)"[^>]*>.*</a>', r'\1'),  # keep the link target instead of the link text
        (r'[ \t]*<[^<]*?/?>', u''),                   # drop any remaining tags
        (r'^\s+', u''),                               # drop leading whitespace
    )
    for pattern, replacement in markup_rules:
        # Trimming happens after each rule, so a newline a rule adds at either end is removed again.
        text = re.sub(pattern, replacement, text).strip()

    return text.lower()


def loadData(path_WOS: str,
             out_root: str = "tm_datasets",
             version: str = "small"):
    """
    Convert one Web of Science (WoS) release into the files saved under
    ``<out_root>/WOS_<version>``.

    Steps: read ``WOS<n>/X.txt`` (one document per line) and ``WOS<n>/Y.txt``
    (one integer label per line) from ``path_WOS``, clean each document with
    ``text_cleaner``, split 80/20 into train/test (``random_state=0``), run
    topmost's ``Preprocess(vocab_size=5000)`` with ``pretrained_WE=True`` and
    save the results.

    Files written to the output directory:
      - ``train_raw.txt`` / ``test_raw.txt``: cleaned documents as they were
        handed to ``Preprocess``, one per line
      - ``train_texts.txt`` / ``test_texts.txt``: texts returned by ``Preprocess``
      - ``vocab.txt``: one vocabulary token per line
      - ``train_labels.txt`` / ``test_labels.txt``: one integer label per line
      - ``train_bow.npz`` / ``test_bow.npz`` / ``word_embeddings.npz``:
        sparse matrices saved with ``scipy.sparse.save_npz``

    path_WOS : str
        path to the WoS dataset downloaded before, e.g. "download/WoS_Dataset"
    out_root : str
        path to the output directory where the processed data will be saved, this will
        create a subdirectory WOS_{version} in out_root
    version : str
        version of the dataset, "small" for 5736 documents, "medium" for 11967 documents,
        "large" for 46985 documents

    return: tuple ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` are
        the bag-of-words matrices returned by ``Preprocess`` and the ``y_*``
        are the (n, 1) label matrices produced by the split.
    raises: AssertionError if ``version`` is not one of the supported names.
    """
    # Make sure the submodule is loaded; a bare `import scipy` may not do so on older SciPy.
    import scipy.sparse

    version_sizes = {"small": "5736", "medium": "11967", "large": "46985"}
    assert version in list(version_sizes), f"Version {version} is not supported. Choose from ['small', 'medium', 'large']"
    version_num = version_sizes[version]

    out_dir = os.path.join(out_root, f"WOS_{version}")
    os.makedirs(out_dir, exist_ok=True)

    def _write_text(name, text):
        # Every text artifact is written as UTF-8 into out_dir.
        with open(os.path.join(out_dir, name), "w", encoding="utf8") as f:
            f.write(text)

    fname = os.path.join(path_WOS, f"WOS{version_num}/X.txt")
    fnamek = os.path.join(path_WOS, f"WOS{version_num}/Y.txt")
    # Read with an explicit encoding so the result does not depend on the platform default.
    with open(fname, encoding="utf-8") as f:
        content = [text_cleaner(x) for x in f]
    with open(fnamek, encoding="utf-8") as fk:
        contentk = [x.strip() for x in fk]

    # Kept as an (n, 1) np.matrix so the returned labels have the same type and shape as before.
    Label = np.matrix(contentk, dtype=int)
    Label = np.transpose(Label)
    number_of_classes_L1 = np.max(Label)+1  # number of classes in Level 1
    print('Number of classes in Level 1:', number_of_classes_L1)

    # Seeds NumPy's global RNG; the split below is pinned separately via random_state.
    np.random.seed(7)
    print(Label.shape)

    raw_train_texts, raw_test_texts, y_train, y_test = train_test_split(
        content, Label, test_size=0.2, random_state=0)

    # Save the cleaned documents BEFORE Preprocess replaces them; previously these
    # files were written after the overwrite and merely duplicated *_texts.txt.
    _write_text("train_raw.txt", "\n".join(raw_train_texts))
    _write_text("test_raw.txt", "\n".join(raw_test_texts))

    # preprocess phase 2 via topmost
    preprocessor = Preprocess(vocab_size=5000)
    rst_phase2 = preprocessor.preprocess(raw_train_texts=raw_train_texts,
                                         raw_test_texts=raw_test_texts,
                                         pretrained_WE=True)

    train_texts = rst_phase2['train_texts']
    test_texts = rst_phase2['test_texts']
    X_train = rst_phase2['train_bow']
    X_test = rst_phase2['test_bow']
    vocab = rst_phase2['vocab']
    word_embeddings = rst_phase2['word_embeddings']
    print('Vocabulary size:', len(vocab))

    # save vocab (one token per line, trailing newline after each)
    _write_text("vocab.txt", "".join(token + "\n" for token in vocab))
    # save texts
    _write_text("train_texts.txt", "\n".join(train_texts))
    _write_text("test_texts.txt", "\n".join(test_texts))
    # save labels; flatten first so int() receives scalars rather than 1x1 matrix
    # rows, a conversion that recent NumPy versions deprecate
    _write_text("train_labels.txt",
                "".join(f"{int(label)}\n" for label in np.asarray(y_train).ravel()))
    _write_text("test_labels.txt",
                "".join(f"{int(label)}\n" for label in np.asarray(y_test).ravel()))

    # save bag of words
    scipy.sparse.save_npz(os.path.join(out_dir, "train_bow.npz"), scipy.sparse.csr_matrix(X_train))
    scipy.sparse.save_npz(os.path.join(out_dir, "test_bow.npz"), scipy.sparse.csr_matrix(X_test))
    # word embeddings (coerced to CSR like the BoW matrices; a no-op if already CSR)
    scipy.sparse.save_npz(os.path.join(out_dir, "word_embeddings.npz"),
                          scipy.sparse.csr_matrix(word_embeddings))

    return (X_train, y_train, X_test, y_test)

In [12]:
# Build the "medium" (11967-document) dataset and write it to tm_datasets/WOS_medium.
# `dummy1` receives the returned (X_train, y_train, X_test, y_test) tuple; it is
# not used in the cells shown here.
# NOTE(review): this reads from "WOS_new", not from the "WoS_Dataset_2" directory
# the download cell targets — presumably the archive was obtained and unpacked
# manually; confirm the expected location.
dummy1 = loadData(path_WOS = "WOS_new",
                 out_root = "tm_datasets",
                 version = "medium")

Number of classes in Level 1: 33
(11967, 1)


loading train texts: 100%|██████████| 9573/9573 [00:01<00:00, 5433.04it/s]
loading test texts: 100%|██████████| 2394/2394 [00:00<00:00, 5194.83it/s]
parsing texts: 100%|██████████| 9573/9573 [00:01<00:00, 8108.70it/s]
2025-08-19 00:40:46,827 - TopMost - Real vocab size: 5000
2025-08-19 00:40:46,828 - TopMost - Real training size: 9573 	 avg length: 88.166
parsing texts: 100%|██████████| 2394/2394 [00:00<00:00, 8239.74it/s]
2025-08-19 00:40:47,214 - TopMost - Real testing size: 2394 	 avg length: 87.471
loading word embeddings: 100%|██████████| 5000/5000 [00:00<00:00, 6409.60it/s]
2025-08-19 00:41:21,422 - TopMost - number of found embeddings: 4927/5000
  f.write(str(int(label)) + "\n")
  f.write(str(int(label)) + "\n")


Vocabulary size: 5000
