In [1]:
# imports
import wget
import os
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### functions ###

# load file
def download_file(url, filename):
    """Download ``url`` to ``filename`` unless that path already exists.

    Args:
        url: Address of the remote file to fetch.
        filename: Local path the download is written to.

    Returns:
        None. When ``filename`` is already present, nothing is fetched and a
        notice is printed instead.
    """
    # guard clause: an existing path means there is nothing to do
    if os.path.exists(filename):
        print(f"{filename} already exists. Skipping download.")
        return
    wget.download(url, filename)

# read text file lines
def read_lines(file_path):
    """Return the lines of a UTF-8 text file as a list of strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line; line endings are kept as read.
    """
    with open(file_path, encoding='utf-8') as handle:
        # iterating a text file yields its lines, exactly like readlines()
        return list(handle)

In [3]:
# load data

urlp = "http://dl.turkunlp.org/TKO_7095_2023/imdb-positives.txt"
urln = "http://dl.turkunlp.org/TKO_7095_2023/imdb-negatives.txt"

# fetch each review file under its local name (skipped when already on disk)
for source_url, local_name in ((urlp, 'imdb-positive.txt'), (urln, 'imdb-negative.txt')):
    download_file(source_url, local_name)

imdb-positive.txt already exists. Skipping download.
imdb-negative.txt already exists. Skipping download.


In [4]:
# create dictionary with both text files

# read text files
pos_lines = read_lines('imdb-positive.txt')
neg_lines = read_lines('imdb-negative.txt')

# two parallel columns: every review text is paired with the label of the
# file it came from (all positives first, then all negatives)
imdb_dict = {
    "text": [*pos_lines, *neg_lines],
    "label": ["positive" for _ in pos_lines] + ["negative" for _ in neg_lines],
}

In [5]:
# create Dataset of the dictionary
# the dict keys ("text", "label") become the Dataset's columns
imdb_ds = datasets.Dataset.from_dict(mapping=imdb_dict)

At this point we have a Hugging Face `Dataset` with a text column and a label column. The first 25 000 instances are positive reviews and the last 25 000 are negative reviews. Next, we shuffle the data and split it into train, validation and test sets, each containing roughly equal proportions of positive and negative reviews.

In [6]:
# shuffle dataset
SHUFFLE_SEED = 523834  # fixed seed so the shuffled order is the same on every run
imdb_ds = imdb_ds.shuffle(seed=SHUFFLE_SEED)

In [7]:
# split first into 80/20
# Hold out 20% of the reviews as "test"; the remaining 80% becomes "train".
# train_test_split shuffles again by default, so without an explicit seed the
# split membership would change on every kernel restart.
imdb_ds = imdb_ds.train_test_split(test_size=0.2, seed=523834)

In [8]:
# then further split the test set into two
# Halve the held-out 20% into two equal parts. The result is a nested
# DatasetDict (keys "train" and "test") stored temporarily under "test".
# The fixed seed keeps validation/test membership reproducible across runs.
imdb_ds["test"] = imdb_ds["test"].train_test_split(test_size=0.5, seed=523834)

In [9]:
# reconstruct DatasetDict
# "test" currently holds a nested DatasetDict; flatten it into two top-level splits
held_out = imdb_ds['test']
imdb_ds['validation'] = held_out['train']
imdb_ds['test'] = held_out['test']

In [10]:
# display the final DatasetDict to check the split names and row counts
imdb_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})