In [5]:
import re
import os
from os import system, listdir
from os.path import isfile, join
from pathlib import Path
from random import shuffle
from string import punctuation

import pandas as pd
from sklearn.model_selection import train_test_split

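# One-time dataset download: uncomment both lines to fetch the IMDB review
# archive and unpack it into the current working directory.
#system('wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"')
#system('tar -xzf "aclImdb_v1.tar.gz"')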

In [4]:
# Locate the dataset at <repo_root>/datasets/aclImdb, where repo_root is taken
# to be two levels above the current working directory.
cwd = Path(os.getcwd())
repo_root = cwd.parent.parent
aclimdb_folder = os.path.join(repo_root, 'datasets', 'aclImdb')

# One directory per (split, sentiment) pair. The generator yields them in the
# order train/neg, train/pos, test/neg, test/pos, matching the names below.
(
    TRAIN_NEGATIVE_REVIEWS_DIR,
    TRAIN_POSITIVE_REVIEWS_DIR,
    TEST_NEGATIVE_REVIEWS_DIR,
    TEST_POSITIVE_REVIEWS_DIR,
) = (
    os.path.join(aclimdb_folder, split, sentiment)
    for split in ('train', 'test')
    for sentiment in ('neg', 'pos')
)


def get_train_valid_data(smoke_test_size=0):
    '''
    Build the training and validation sets from the labelled training reviews.

    Negative reviews are labelled 0 and positive reviews 1. When
    smoke_test_size is non-zero, half of it is passed to read_files as the
    per-directory limit so a quick experiment draws the same amount from the
    negative and the positive directory; with the default of 0 no limit is
    applied and every file is read.

    Returns X_train, y_train, X_valid, y_valid, produced by a shuffled,
    label-stratified 80/20 split with random_state fixed at 42.
    '''
    per_class_limit = smoke_test_size/2 if smoke_test_size else 0

    neg_reviews, neg_labels = read_files(TRAIN_NEGATIVE_REVIEWS_DIR, 0, per_class_limit)
    pos_reviews, pos_labels = read_files(TRAIN_POSITIVE_REVIEWS_DIR, 1, per_class_limit)
    reviews = neg_reviews + pos_reviews
    labels = neg_labels + pos_labels

    X_train, X_valid, y_train, y_valid = train_test_split(
        reviews,
        labels,
        train_size=0.8,
        shuffle=True,
        stratify=labels,
        random_state=42,
    )

    # Note the return order groups train first, then validation.
    return X_train, y_train, X_valid, y_valid


def get_test_data():
    '''
    Read every review of the test split and return (X_test, y_test).

    The negative reviews (label 0) come first, followed by the positive
    reviews (label 1); the result is not shuffled.
    '''
    neg_reviews, neg_labels = read_files(TEST_NEGATIVE_REVIEWS_DIR, 0)
    pos_reviews, pos_labels = read_files(TEST_POSITIVE_REVIEWS_DIR, 1)

    return neg_reviews + pos_reviews, neg_labels + pos_labels


def read_files(directory, label, max_files=0) -> tuple[list,list]:
    '''
    Retrieve the Imdb data from the specified folder.

    directory - folder whose files each hold one review
    label     - value appended to y for every review read from this folder
    max_files - stop once this many reviews have been read; 0 (the default)
                means no limit
    Returns: (X, y) where X is the list of cleaned review texts (see
             clean_entry) and y is a list of `label` of the same length.
    '''
    count = 0
    X = []
    y = []
    # os.listdir returns entries in arbitrary order; sorting makes a limited
    # (smoke test) read pick the same files on every run and every machine.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Skip sub-directories and other non-file entries instead of crashing on open().
        if not os.path.isfile(file_path):
            continue
        # Explicit encoding: without it open() falls back to the platform
        # locale, which can fail or mis-decode non-ASCII review text.
        with open(file_path, 'r', encoding='utf-8') as f:
            review = f.read()
        X.append(clean_entry(review))
        y.append(label)
        count += 1
        if max_files and count >= max_files:
            break

    return X, y


def clean_entry(review):
    '''
    Normalise one raw review string.

    Each literal '<br />' tag becomes a single space, the text is lower-cased,
    and every character listed in string.punctuation is deleted (not replaced
    by a space, so "don't" becomes "dont").
    '''
    # Translation table that maps every punctuation character to "delete".
    drop_punctuation = str.maketrans('', '', punctuation)
    return review.replace('<br />', ' ').lower().translate(drop_punctuation)

In [7]:
# Load the full training data (no smoke-test limit) and report how many
# reviews ended up in the training portion of the split.
X_train, y_train, X_valid, y_valid = get_train_valid_data()

training_set_size = len(X_train)
print('Length of training set:', training_set_size)


Length of training set: 20000


In [None]:
def create_data_frame(folder: str) -> pd.DataFrame:
    '''
    Load every review under folder/pos and folder/neg into one shuffled DataFrame.

    folder - the root folder of train or test dataset; it must contain 'pos'
             and 'neg' sub-folders whose files are read as UTF-8 text
    Returns: a DataFrame with the combined data from the input folder, with
             columns 'text' (the review, each run of <br> tags collapsed to a
             single space) and 'label' (1 for positive, 0 for negative).
             Row order is randomised with random.shuffle. When both
             sub-folders contain no files, an empty DataFrame with the same
             two columns is returned.
    '''
    pos_folder = f'{folder}/pos' # positive reviews
    neg_folder = f'{folder}/neg' # negative reviews

    # Raw string so that \s reaches the regex engine instead of being treated
    # as an (invalid) Python string escape.
    line_breaks = re.compile(r'(<br\s*/?>)+')

    def get_files(fld: str) -> list:
        '''
        fld - positive or negative reviews folder
        Returns: a list with all files in input folder
        '''
        return [join(fld, f) for f in listdir(fld) if isfile(join(fld, f))]

    def append_files_data(data_list: list, files: list, label: int) -> None:
        '''
        Appends to 'data_list' tuples of form (file content, label)
        for each file in 'files' input list
        '''
        for file_path in files:
            # Explicit encoding: the platform default may not be UTF-8.
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list.append((f.read(), label))

    data_list = []
    append_files_data(data_list, get_files(pos_folder), 1)
    append_files_data(data_list, get_files(neg_folder), 0)
    shuffle(data_list)

    # Building the columns with comprehensions (rather than unpacking
    # zip(*data_list)) also works when no files were found.
    # replacing line breaks with spaces
    text = [line_breaks.sub(' ', txt) for txt, _ in data_list]
    label = [lbl for _, lbl in data_list]

    return pd.DataFrame({'text': text, 'label': label})



In [8]:

cwd = Path(os.getcwd())
repo_root = cwd.parent.parent
# NOTE(review): train_folder / test_folder are computed here but the loads
# below use the cwd-relative 'aclImdb/...' paths instead; confirm which
# location is intended before switching one to the other.
train_folder = os.path.join(repo_root, 'aclImdb', 'train')
test_folder = os.path.join(repo_root, 'aclImdb', 'test')

imdb_train = create_data_frame('aclImdb/train')
imdb_test = create_data_frame('aclImdb/test')

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists (a no-op when it is already there).
os.makedirs('csv', exist_ok=True)
imdb_train.to_csv('csv/imdb_train.csv', index=False)
imdb_test.to_csv('csv/imdb_test.csv', index=False)

# To reload the saved frames instead of rebuilding them from the raw files:
# imdb_train = pd.read_csv('csv/imdb_train.csv')
# imdb_test = pd.read_csv('csv/imdb_test.csv')

'/Users/keithpij/code/python-sandbox/aclImdb'