## Dataset

### Preprocessing

In [26]:
from IPython.core.debugger import set_trace
from torch.utils.data import DataLoader, Dataset
import csv
import pandas as pd
def load_data(data_path):
    """Read a headerless ``label,text...`` CSV file into two parallel lists.

    The first column of every row is parsed as an integer label. All
    remaining columns are joined with single spaces into one string, every
    backslash in them is replaced by a space, and the result is lower-cased.
    Blank lines in the file are skipped.

    Args:
        data_path: Path of the comma-delimited, double-quote-quoted CSV file.

    Returns:
        A ``(data_input, data_output)`` tuple: the list of cleaned text
        strings and the list of integer labels, both in file order.

    Raises:
        ValueError: If the first column of a non-blank row is not an integer.
    """
    data_input = []
    data_output = []
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for correct parsing of quoted fields.
    with open(data_path, 'r', newline='') as f:
        rdr = csv.reader(f, delimiter=',', quotechar='"')
        for row in rdr:
            if not row:
                # csv.reader yields an empty list for a blank line; there is
                # no label to read, so skip it instead of failing on row[0].
                continue
            data_output.append(int(row[0]))
            txt = " ".join(s.replace("\\", " ") for s in row[1:])
            data_input.append(txt.lower())

    return data_input, data_output

In [27]:
# Parse the raw train and test CSV splits into (texts, labels) lists.
train_X, train_y = load_data("./data/ag_news_csv/train.csv")
test_X, test_y = load_data("./data/ag_news_csv/test.csv")

# Wrap each split in a two-column frame: cleaned text and integer label.
train_df = pd.DataFrame({'News': train_X, 'Labels': train_y})
test_df = pd.DataFrame({'News': test_X, 'Labels': test_y})

train_df.head()

Unnamed: 0,News,Labels
0,wall st. bears claw back into the black (reute...,3
1,carlyle looks toward commercial aerospace (reu...,3
2,oil and economy cloud stocks' outlook (reuters...,3
3,iraq halts oil exports from main southern pipe...,3
4,"oil prices soar to all-time record, posing new...",3


In [33]:
# Save both cleaned splits next to the raw data; index=False keeps the
# row index out of the files so they hold only the News and Labels columns.
train_df.to_csv(path_or_buf='./data/ag_news_csv/preprocessed_train.csv', index=False)
test_df.to_csv(path_or_buf='./data/ag_news_csv/preprocessed_test.csv', index=False)

In [35]:
# Round-trip check: read the saved training file back and preview its first rows.
pd.read_csv('./data/ag_news_csv/preprocessed_train.csv').head()

Unnamed: 0,News,Labels
0,wall st. bears claw back into the black (reute...,3
1,carlyle looks toward commercial aerospace (reu...,3
2,oil and economy cloud stocks' outlook (reuters...,3
3,iraq halts oil exports from main southern pipe...,3
4,"oil prices soar to all-time record, posing new...",3


In [5]:
import os
import sys
import torch
from torch.nn import functional as F
import numpy as np
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe

def load_dataset(test_sen=None):
    """Build the vocabulary and batch iterators for the preprocessed news CSVs.

    Reads ``preprocessed_train.csv`` and ``preprocessed_test.csv`` from
    ``./data/ag_news_csv``, builds the text vocabulary with GloVe (6B, 300-d)
    vectors, splits the training set again into train/validation, and wraps
    the three splits in ``BucketIterator`` objects with a batch size of 32.

    Notes on the torchtext pieces used here:
        tokenizer : Breaks a sentence into a list of words (whitespace split).
        Field : Stores how a column is preprocessed and turned into tensors.
        fix_length : TorchText can pad each sequence dynamically to the
            longest sequence in its batch; here ``fix_length`` is set so every
            sequence is padded to a fixed length of 200 instead.
        build_vocab : Maps every unique word in ``train_data`` to an index and
            attaches the GloVe embedding for each index.
        vocab.vectors : Tensor of shape (vocab_size x embedding_dim) holding
            the pre-trained word embeddings.
        BucketIterator : Batches examples of similar length together to
            minimise the amount of padding needed.

    Args:
        test_sen: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(TEXT, vocab_size, word_embeddings, train_iter, valid_iter,
        test_iter)``.
    """

    def tokenize(x):
        return x.split()

    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    # NOTE(review): labels are produced as float tensors; confirm this matches
    # the loss used downstream for this multi-class dataset.
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    # The header row is skipped below, so these (name, field) pairs are applied
    # to the CSV columns in order; the names become the Example attributes.
    datafields = [("News", TEXT),
                  ("Label", LABEL)]
    # TabularDataset lives in torchtext.data; only `data` is imported, so it
    # has to be reached through that module.
    train_data, test_data = data.TabularDataset.splits(
                               path="./data/ag_news_csv", # the root directory where the data lies
                               train='preprocessed_train.csv', test="preprocessed_test.csv",
                               format='csv',
                               skip_header=True, # the CSVs have a header row; skip it so it is not processed as data
                               fields=datafields)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))

    train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data
    # The text field is registered as "News", so that is the attribute each
    # Example carries; sort on its token count.
    train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.News), repeat=False, shuffle=True)

    # Alternatively we can also use the default configurations:
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter

In [6]:
# Same field configuration as load_dataset(), tried out on torchtext's
# built-in IMDB dataset (the archive is downloaded when it is not present).
def tokenize(x):
    return x.split()

TEXT = data.Field(
    sequential=True,
    tokenize=tokenize,
    lower=True,
    include_lengths=True,
    batch_first=True,
    fix_length=200,
)
LABEL = data.LabelField(tensor_type=torch.FloatTensor)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


In [10]:
# Print the first training example. Iterating the dataset yields the examples
# themselves, not indices, so pair each one with its position via enumerate;
# comparing the example to 0 never matched and nothing was printed.
for i, example in enumerate(train_data):
    if i == 0:
        # vars() exposes the example's fields instead of its opaque repr.
        print(vars(example))
        break

In [17]:
# Look at the first example object. Displayed bare it shows only the default
# object repr. NOTE(review): vars(train_data.examples[0]) would presumably
# show the stored fields - confirm.
train_data.examples[0]

<torchtext.data.example.Example at 0x1a217d72e8>