# Load Dataset 

* author: steeve laquitaine

* Workload (1 hour to debug)


In [28]:
import os
import wget
import time
import tarfile
import numpy as np
from pathlib import Path
import pandas as pd

In [55]:
# Resolve every location used by this notebook from the project root.
# The notebook is expected to start in my_project/notebooks/, so stepping one
# level up lands on the project root. Note: each run of this cell moves the
# working directory up once more, so run it only once per kernel session.
os.chdir('..')
data_path = os.getcwd()

# Folder layout under the project root
raw_data_path = f"{data_path}/data/01_raw/"
preprocessed_data_path = f"{data_path}/data/02_preprocessed/"

# Remote archive and the local file it is stored as
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_output = f"{raw_data_path}aclImdb_v1.tar.gz"

# CSV files produced by the ETL step and read back by the loader
train_path = f"{preprocessed_data_path}train.csv"
test_path = f"{preprocessed_data_path}test.csv"

In [32]:
## download (to debug)
## !! FileNotFoundError: No such file or directory: './data/01_raw/dataset_compressed.tar.gzoul9mi4u.tmp'
# print(url)
# print(download_output)
# wget.download(url, download_output)

In [57]:
def decompress_dataset(data_file, output_folder):
    """Extract a tar archive into ``output_folder`` and print the elapsed time.

    Args:
        data_file: Path to the tar archive. Compression is auto-detected by
            ``tarfile.open`` (e.g. ``.tar.gz``).
        output_folder: Directory the archive members are extracted into.

    Raises:
        FileNotFoundError: If ``data_file`` does not exist.
        tarfile.TarError: If the archive cannot be read, or if the extraction
            filter rejects a member (for example one that would be written
            outside ``output_folder``).
    """
    tic = time.time()
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(data_file) as tar:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: use the standard "data"
            # filter so members cannot escape output_folder.
            tar.extractall(path=output_folder, filter="data")
        else:
            # Interpreter without extraction filters: plain extraction.
            tar.extractall(path=output_folder)
    toc = time.time()
    # Elapsed extraction time in seconds
    print(toc - tic)

In [1]:
# Decompress the archive at download_output into the raw-data folder.
# The download cell above is commented out, so the archive must already be
# present at download_output for this call to succeed.
decompress_dataset(download_output, raw_data_path)


In [4]:
# Delete the compressed archive once its content has been extracted.
# The existence check keeps this cell re-runnable: a second run (archive
# already gone) is a no-op instead of raising FileNotFoundError.
if os.path.exists(download_output):
    os.remove(download_output)

In [51]:
def get_texts(path, CLASSES):
    """Read every review file found under one sub-folder per class.

    Args:
        path: ``pathlib.Path`` of the folder that contains one sub-folder per
            class name.
        CLASSES: Sequence of class (sub-folder) names. The position of a name
            in this sequence is the integer label given to its files.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays of equal length: the file
        contents decoded as UTF-8, and the matching integer labels.
    """
    texts, labels = [], []
    for idx, label in enumerate(CLASSES):
        # sorted(): glob order depends on the filesystem; sorting makes the
        # row order reproducible from one run/machine to the next.
        for fname in sorted((path / label).glob('*.*')):
            # read_text opens, reads and closes the file (no leaked handle).
            texts.append(fname.read_text(encoding='utf-8'))
            labels.append(idx)
    return np.array(texts), np.array(labels)

def extract_transform_load_dataset(raw_data_path, output_path, timeit=True):
    """Convert the extracted ``aclImdb`` folder into CSV files.

    Reads the review files under ``<raw_data_path>/aclImdb/train`` and
    ``<raw_data_path>/aclImdb/test`` and writes, into ``output_path``:

    * ``train.csv`` and ``test.csv`` with columns ``sentiment`` and ``review``
      (rows of the unlabelled ``unsup`` class are left out),
    * ``classes.txt`` with one class name per line.

    Args:
        raw_data_path: Folder that contains the extracted ``aclImdb`` folder
            (string or path-like; a trailing slash is optional).
        output_path: Folder the CSV/text files are written to. It is created
            when missing.
        timeit: When true (default), print the elapsed time in seconds.

    Returns:
        None. The results are the files written to ``output_path``.
    """
    tic = time.time()

    # 1 - Init variables
    CLASSES = ['neg', 'pos', 'unsup']
    col_names = ['sentiment', 'review']
    # Integer label that get_texts assigns to the unlabelled reviews
    unlabelled = CLASSES.index('unsup')
    # Seeds NumPy's global random generator (side effect kept from the
    # original implementation; nothing in this function draws random numbers).
    np.random.seed(42)

    # 2 - Build output folders
    PATH = Path(raw_data_path) / 'aclImdb'
    CLAS_PATH = Path(output_path)
    CLAS_PATH.mkdir(parents=True, exist_ok=True)

    # 3 - Process and store train dataset
    print(PATH)
    trn_texts, trn_labels = get_texts(PATH / 'train', CLASSES)
    print(len(trn_texts))
    print(len(trn_labels))
    df_trn = pd.DataFrame({'review': trn_texts, 'sentiment': trn_labels}, columns=col_names)
    df_trn[df_trn['sentiment'] != unlabelled].to_csv(CLAS_PATH / 'train.csv', index=False)

    # 4 - Process and store evaluation dataset
    # Same unlabelled-row filter as for train, so both CSVs only hold
    # labelled reviews.
    val_texts, val_labels = get_texts(PATH / 'test', CLASSES)
    df_val = pd.DataFrame({'review': val_texts, 'sentiment': val_labels}, columns=col_names)
    df_val[df_val['sentiment'] != unlabelled].to_csv(CLAS_PATH / 'test.csv', index=False)

    # 5 - Store classes (the with-block flushes and closes the file)
    with (CLAS_PATH / 'classes.txt').open('w', encoding='utf-8') as classes_file:
        classes_file.writelines(f'{o}\n' for o in CLASSES)

    if timeit:
        toc = time.time()
        print(np.round(toc - tic, 2), ' sec')

In [52]:
# ETL (about 1 min): read the extracted reviews under raw_data_path and write
# train.csv, test.csv and classes.txt into preprocessed_data_path.
extract_transform_load_dataset(raw_data_path, preprocessed_data_path, timeit = True)

/Users/steeve_laquitaine/Desktop/CodeHub/nlp_txt_similarity/data/01_raw/aclImdb
75000
75000
54.57  sec


In [56]:
def _read_sample(csv_path, sample, random_state=None):
    """Read a reviews CSV and return a random subset of at most ``sample`` rows."""
    dataset = pd.read_csv(csv_path)
    # Never ask for more rows than exist: DataFrame.sample raises ValueError
    # when n exceeds the number of rows (sampling without replacement).
    n_rows = len(dataset) if sample is None else min(sample, len(dataset))
    return dataset.sample(n=n_rows, random_state=random_state)


def load_dataset(train_path, test_path, sample=5000, random_state=None):
    """Load random subsets of the train and test CSV files as NumPy arrays.

    Args:
        train_path: Path of the train CSV (columns ``review``, ``sentiment``).
        test_path: Path of the test CSV (same columns).
        sample: Maximum number of rows drawn from each file. A file with
            fewer rows is returned entirely (shuffled); ``None`` also returns
            every row.
        random_state: Optional seed passed to ``DataFrame.sample`` to make
            the draw reproducible. ``None`` (default) keeps the previous
            behaviour of using NumPy's global random state.

    Returns:
        ``(train_sentiments, test_sentiments, train_reviews, test_reviews)``,
        four NumPy arrays.
    """
    tic = time.time()

    # TRAIN
    train_dataset = _read_sample(train_path, sample, random_state)

    # preview data
    print(train_dataset.head())

    # build train and test datasets
    train_reviews = np.array(train_dataset['review'])
    train_sentiments = np.array(train_dataset['sentiment'])

    # TEST
    test_dataset = _read_sample(test_path, sample, random_state)
    test_reviews = np.array(test_dataset['review'])
    test_sentiments = np.array(test_dataset['sentiment'])
    print('(load_dataset)', time.time() - tic)

    toc = time.time()
    print(('Completed'), toc - tic)

    return train_sentiments, test_sentiments, train_reviews, test_reviews

In [67]:
# Load a random sample of 5000 rows from each CSV as NumPy arrays (about 1 sec):
# Y_* hold the sentiment labels, X_* hold the review texts.
Y_train, Y_test, X_train, X_test = load_dataset(train_path, test_path, sample=5000)


sentiment                                             review
16560          1  I saw this film in the worst possible circumst...
10876          0  SPOILERS THROUGHOUT!!!!<br /><br />I had read ...
15889          1  Time and time again, I've stated that if peopl...
12920          1  One of my favourite films. It has everything -...
13565          1  Maybe I'm a sap but this is the sweetest movie...
(load_dataset) 0.5145790576934814
Completed 0.5147788524627686
