In [1]:
import glob
import os

import pandas as pd

In [2]:
# Root folder of the raw TREC label files. Keep the trailing '/': later cells
# build sub-folder paths by plain string concatenation (file_path + 'train/').
file_path = './TREC_question_classification_dataset/'

In [3]:
def load_trec_dataset(file_path, encoding=None):
    """Load every ``*.txt`` file matching ``file_path + '*.txt'`` into one DataFrame.

    Each non-blank line must look like ``"<label> <question text>"``. The line
    is split on the first space only, so the question keeps its inner spaces.

    Parameters
    ----------
    file_path : str
        Prefix to which the glob pattern ``'*.txt'`` is appended (normally a
        directory path that ends with a path separator).
    encoding : str, optional
        Text encoding handed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Returns
    -------
    pandas.DataFrame
        Columns ``question`` and ``class_label``, one row per non-blank line.
        Files are read in sorted path order so the row order is reproducible.

    Raises
    ------
    ValueError
        If a non-blank line has no space separating the label from the question.
    """
    labels = []
    questions = []
    # glob returns paths in arbitrary (filesystem) order; sort for determinism.
    for path in sorted(glob.glob(file_path + '*.txt')):
        # Distinct names for path and handle: the loop variable is not shadowed.
        with open(path, 'r', encoding=encoding) as handle:
            for line_no, raw_line in enumerate(handle, start=1):
                if not raw_line.strip():
                    continue  # skip blank / whitespace-only lines
                parts = raw_line.rstrip().split(' ', maxsplit=1)
                if len(parts) != 2:
                    raise ValueError(
                        'expected "<label> <question>" in %s, line %d: %r'
                        % (path, line_no, raw_line)
                    )
                label, question = parts
                labels.append(label)
                questions.append(question)
    return pd.DataFrame({'question': questions, 'class_label': labels})

In [4]:
# Load every .txt file found under the train/ sub-folder into one DataFrame.
# NOTE(review): the loader concatenates ALL matching files. If this folder
# holds overlapping TREC subsets (train_1000 ... train_5500), rows are
# duplicated -- confirm, and de-duplicate if that is not intended.
df_train = load_trec_dataset(file_path + 'train/')
# The coarse label (e.g. 'DESC' from 'DESC:manner') can be derived with:
# df_train['class_label'].apply(lambda x: x.split(':')[0])
df_train.head()

Unnamed: 0,question,class_label
0,How did serfdom develop in and then leave Russ...,DESC:manner
1,What films featured the character Popeye Doyle ?,ENTY:cremat
2,How can I find a list of celebrities ' real na...,DESC:manner
3,What fowl grabs the spotlight after the Chines...,ENTY:animal
4,What is the full form of .com ?,ABBR:exp


In [5]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels and swap the string column
# 'class_label' for its integer code in a new 'label' column.
encoder = LabelEncoder()
train_codes = encoder.fit_transform(df_train['class_label'])
df_train = df_train.assign(label=train_codes).drop(columns=['class_label'])
df_train.head()

Unnamed: 0,question,label
0,How did serfdom develop in and then leave Russ...,4
1,What films featured the character Popeye Doyle ?,9
2,How can I find a list of celebrities ' real na...,4
3,What fowl grabs the spotlight after the Chines...,6
4,What is the full form of .com ?,1


In [6]:
# Load the test split and encode its labels with the encoder that was fitted
# on the training labels, so both splits share one label -> id mapping.
df_test = load_trec_dataset(file_path + 'test/')
test_codes = encoder.transform(df_test['class_label'])
df_test = df_test.assign(label=test_codes).drop(columns=['class_label'])
df_test.head()

Unnamed: 0,question,label
0,How far is it from Denver to Aspen ?,40
1,"What county is Modesto , California in ?",32
2,Who was Galileo ?,28
3,What is an atom ?,2
4,When did Hawaii become a state ?,39


In [16]:
# Show the fitted class names; a name's position in this list is its integer id.
print(list(encoder.classes_))

['ABBR:abb', 'ABBR:exp', 'DESC:def', 'DESC:desc', 'DESC:manner', 'DESC:reason', 'ENTY:animal', 'ENTY:body', 'ENTY:color', 'ENTY:cremat', 'ENTY:currency', 'ENTY:dismed', 'ENTY:event', 'ENTY:food', 'ENTY:instru', 'ENTY:lang', 'ENTY:letter', 'ENTY:other', 'ENTY:plant', 'ENTY:product', 'ENTY:religion', 'ENTY:sport', 'ENTY:substance', 'ENTY:symbol', 'ENTY:techmeth', 'ENTY:termeq', 'ENTY:veh', 'ENTY:word', 'HUM:desc', 'HUM:gr', 'HUM:ind', 'HUM:title', 'LOC:city', 'LOC:country', 'LOC:mount', 'LOC:other', 'LOC:state', 'NUM:code', 'NUM:count', 'NUM:date', 'NUM:dist', 'NUM:money', 'NUM:ord', 'NUM:other', 'NUM:perc', 'NUM:period', 'NUM:speed', 'NUM:temp', 'NUM:volsize', 'NUM:weight']


In [8]:
# Dump both splits as CSV into the folders the HuggingFace CSV loader reads from.
# DataFrame.to_csv does not create missing parent directories, so make them
# first; otherwise this cell fails on a fresh checkout.
for split_name, split_df in (('train', df_train), ('test', df_test)):
    out_dir = os.path.join('./TREC_question_classification', split_name)
    os.makedirs(out_dir, exist_ok=True)
    # index=False already suppresses the index column (and its label).
    split_df.to_csv(os.path.join(out_dir, split_name + '.csv'), index=False)

In [9]:
# Build a HuggingFace DatasetDict from the CSV dumps, then turn the integer
# 'label' column into a ClassLabel feature that carries the class names.

from datasets import load_dataset, ClassLabel

data_files = dict(
    train='TREC_question_classification/train/*.csv',
    test='TREC_question_classification/test/*.csv',
)
class_labels = ClassLabel(names=list(encoder.classes_))
TREC = load_dataset('csv', data_files=data_files).cast_column('label', class_labels)
print(TREC)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Casting the dataset:   0%|          | 0/15452 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'label'],
        num_rows: 15452
    })
    test: Dataset({
        features: ['question', 'label'],
        num_rows: 500
    })
})


In [10]:
# Inspect the train split's feature schema ('label' was cast to ClassLabel above).
TREC['train'].features

{'question': Value(dtype='string', id=None),
 'label': ClassLabel(names=['ABBR:abb', 'ABBR:exp', 'DESC:def', 'DESC:desc', 'DESC:manner', 'DESC:reason', 'ENTY:animal', 'ENTY:body', 'ENTY:color', 'ENTY:cremat', 'ENTY:currency', 'ENTY:dismed', 'ENTY:event', 'ENTY:food', 'ENTY:instru', 'ENTY:lang', 'ENTY:letter', 'ENTY:other', 'ENTY:plant', 'ENTY:product', 'ENTY:religion', 'ENTY:sport', 'ENTY:substance', 'ENTY:symbol', 'ENTY:techmeth', 'ENTY:termeq', 'ENTY:veh', 'ENTY:word', 'HUM:desc', 'HUM:gr', 'HUM:ind', 'HUM:title', 'LOC:city', 'LOC:country', 'LOC:mount', 'LOC:other', 'LOC:state', 'NUM:code', 'NUM:count', 'NUM:date', 'NUM:dist', 'NUM:money', 'NUM:ord', 'NUM:other', 'NUM:perc', 'NUM:period', 'NUM:speed', 'NUM:temp', 'NUM:volsize', 'NUM:weight'], id=None)}

In [11]:
# Peek at the first two stored label values of the train split.
TREC['train']['label'][:2]

[4, 9]

In [12]:
# Load the 'emotion' dataset by name; presumably a reference to compare the
# structure of the hand-built TREC DatasetDict against.
# NOTE(review): a bare dataset id without an org namespace may not resolve on
# every `datasets` version -- confirm.
emotion = load_dataset('emotion')
emotion

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [13]:
# Inspect the feature schema of the emotion train split.
emotion['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [14]:
# Peek at the first two stored label values of the emotion train split.
emotion['train']['label'][:2]

[0, 0]