In [4]:
import torch
from torchtext import data

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [3]:
from torchtext import datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

.data\imdb\aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]downloading aclImdb_v1.tar.gz
.data\imdb\aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:06<00:00, 12.6MB/s]


KeyboardInterrupt: 

In [4]:
train_data

<torchtext.datasets.imdb.IMDB at 0x11d131bc688>

In [5]:
print(train_data)

<torchtext.datasets.imdb.IMDB object at 0x0000011D131BC688>


In [5]:
fields = {'text': ('t', TEXT), 'label': ('l', LABEL)}
train_data, test_data = data.TabularDataset.splits(
                                        path = '../data',
                                        train = 'news.csv',
                                        test = 'news.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

ValueError: When using a dict to specify fields with a csv file,skip_header must be False andthe file must have a header.

In [6]:
import pandas as pd
df = pd.read_csv("../data/news.csv")

In [7]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [8]:
df.to_json('news.json')

In [9]:
train_data, test_data = data.TabularDataset.splits(
                            path = '',
                            train = 'news.json',
                            test = 'news.json',
                            format = 'json',
                            fields = fields


In [10]:
# New Stuff
from torchtext.data import Field, TabularDataset, BucketIterator
import spacy
import torch

torch.backends.cudnn.deterministic = True

# spacy_en = spacy.load('en')
# def tokenize(text)z:
#     return [token.text for token in spacy_en.tokenizer(text)]

title = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True)
text = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True)
label = Field(sequential=False, use_vocab=False, dtype = torch.float)

fields = {'title': ('title', title), 'text': ('text', text), 'label': ('label', label)}

In [11]:
train_data, test_data = TabularDataset.splits(
    path='',
    train='news.csv',
    test='news.csv',
    format='csv',
    fields=fields)
print("Num of training: ", len(train_data))
print("Num of testing: ", len(test_data))

Num of training:  6335
Num of testing:  6335


In [17]:
train_data, validation_data = train_data.split(split_ratio=0.75)
print("Num of training: ", len(train_data))
print("Num of validation: ", len(validation_data))
print("Num of testing: ", len(test_data))

Num of training:  3563
Num of validation:  1188
Num of testing:  6335


In [13]:
print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

dict_keys(['title', 'text', 'label'])
dict_values([['state', 'that', 'fired', 'pastor', 'demands', 'his', 'sermons', ',', 'notes'], ['state', 'that', 'fired', 'pastor', 'demands', 'his', 'sermons', ',', 'notes', "'", 'this', 'is', 'an', 'excessive', 'display', 'of', 'the', 'government', 'overreaching', 'its', 'authority', "'", 'published', ':', '3', 'mins', 'ago', 'about', '|', '|', 'archive', 'bob', 'unruh', 'joined', 'wnd', 'in', '2006', 'after', 'nearly', 'three', 'decades', 'with', 'the', 'associated', 'press', ',', 'as', 'well', 'as', 'several', 'upper', 'midwest', 'newspapers', ',', 'where', 'he', 'covered', 'everything', 'from', 'legislative', 'battles', 'and', 'sports', 'to', 'tornadoes', 'and', 'homicidal', 'survivalists', '.', 'he', 'is', 'also', 'a', 'photographer', 'whose', 'scenic', 'work', 'has', 'been', 'used', 'commercially', '.', 'print', 'dr.', 'eric', 'walsh', '(', 'photo', ':', 'first', 'liberty', ')', '\n', 'the', 'state', 'of', 'georgia', 'is', 'demanding', 'copie

In [14]:
MAX_VOCAB_SIZE = 25000
text.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
title.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
label.build_vocab(train_data)

In [15]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)