In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras

In [2]:
import os

In [5]:
# split the test set into validation set (15,000) and test set (10,000)

os.listdir(os.getcwd())

['question_9_ch13.ipynb',
 'my_logs',
 'keras_mnist.ipynb',
 '.DS_Store',
 'ensemble_learning.ipynb',
 'question10_chapter12.ipynb',
 'test.csv',
 'datasets',
 'submission.csv',
 'tackle_titanic.ipynb',
 'dim_reduct.ipynb',
 '.gitignore',
 'fashion_mnist_keras_model.h5',
 'tackle_titanic_final_clean_maybe.ipynb',
 'titanic.zip',
 'train.csv',
 '.ipynb_checkpoints',
 'decision_trees.ipynb',
 'venv',
 '.git',
 'practice',
 'tackle_titanic_2.ipynb',
 'gender_submission.csv',
 'clustering.ipynb']

In [6]:
os.listdir(os.path.join(os.getcwd(), 'datasets'))

['.DS_Store',
 'fashion_mnist',
 'aclImdb',
 'aclImdb_v1.tar.gz',
 '.ipynb_checkpoints']

In [7]:
# Join the path one component at a time instead of embedding a '/' separator
# inside a single component, so the result uses the platform's separator.
large_movieset_path = os.path.join(os.getcwd(), 'datasets', 'aclImdb')

os.listdir(large_movieset_path)

['imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']

In [8]:
testset_folder = os.path.join(large_movieset_path, 'test')
trainset_folder = os.path.join(large_movieset_path, 'train')

In [9]:
os.listdir(testset_folder)

['neg', 'urls_pos.txt', 'urls_neg.txt', 'pos', 'labeledBow.feat']

In [10]:
os.listdir(trainset_folder)

['urls_unsup.txt',
 'neg',
 'urls_pos.txt',
 'unsup',
 'urls_neg.txt',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat']

In [11]:
test_pos_reviews_folder = os.path.join(testset_folder, 'pos')
test_neg_reviews_folder = os.path.join(testset_folder, 'neg')

train_pos_reviews_folder = os.path.join(trainset_folder, 'pos')
train_neg_reviews_folder = os.path.join(trainset_folder, 'neg')


In [12]:
len(os.listdir(test_pos_reviews_folder)), len(os.listdir(test_neg_reviews_folder))

(12500, 12500)

In [13]:
len(os.listdir(train_pos_reviews_folder)), len(os.listdir(train_neg_reviews_folder))

(12500, 12500)

In [14]:

""" I need to select 15000 random txt files from test_pos_reviews_folder and 
    test_neg_reviews_folder -> move these to validation folders. ok let's
    do this. """

""" create validset_folder"""
# The validation set gets its own 'valid' folder with one sub-folder per label.
validset_folder = os.path.join(large_movieset_path, 'valid')
valid_pos_reviews_folder = os.path.join(validset_folder, 'pos')
valid_neg_reviews_folder = os.path.join(validset_folder, 'neg')

# os.makedirs also creates the missing parent (validset_folder), and
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(valid_pos_reviews_folder, exist_ok=True)
os.makedirs(valid_neg_reviews_folder, exist_ok=True)


In [17]:
import numpy as np

# Select 15000 (7500 positive + 7500 negative) .txt files from the test set to
# become the validation set.
#
# os.listdir returns entries in arbitrary order, so the listing is sorted and
# the sample is drawn from a seeded generator; otherwise each run would pick a
# different validation split.
#
# NOTE: this cell expects the un-split test folders (12500 files each). Once
# the files have been moved out, fewer than 7500 remain and sampling without
# replacement raises ValueError.
split_rng = np.random.default_rng(42)
valid_pos_filenames = split_rng.choice(sorted(os.listdir(test_pos_reviews_folder)),
                                       size=7500, replace=False)
valid_neg_filenames = split_rng.choice(sorted(os.listdir(test_neg_reviews_folder)),
                                       size=7500, replace=False)


In [18]:
valid_pos_filenames[0:5]

array(['1623_8.txt', '10984_10.txt', '7256_10.txt', '5166_10.txt',
       '5754_10.txt'], dtype='<U12')

In [22]:
import shutil 

def move_from_test_to_valid(from_folder, to_folder, list_of_filenames):
    """Move the named files from ``from_folder`` into ``to_folder``.

    A file that is no longer in ``from_folder`` but is already present in
    ``to_folder`` is skipped, so calling this again after a completed (or
    interrupted) move does not fail.

    Args:
        from_folder: directory that currently holds the files.
        to_folder: destination directory for the files.
        list_of_filenames: iterable of file names (not full paths) to move.

    Raises:
        OSError: propagated from ``shutil.move``, e.g. when a file is missing
            from both folders.
    """
    for filename in list_of_filenames:
        source_path = os.path.join(from_folder, filename)
        target_path = os.path.join(to_folder, filename)
        # Already moved by an earlier run: nothing left to do for this file.
        if not os.path.exists(source_path) and os.path.exists(target_path):
            continue
        shutil.move(source_path, target_path)
        
move_from_test_to_valid(test_pos_reviews_folder, valid_pos_reviews_folder, valid_pos_filenames)
move_from_test_to_valid(test_neg_reviews_folder, valid_neg_reviews_folder, valid_neg_filenames)

In [23]:
len(os.listdir(train_pos_reviews_folder)), len(os.listdir(train_neg_reviews_folder))

(12500, 12500)

In [24]:
len(os.listdir(test_pos_reviews_folder)), len(os.listdir(test_neg_reviews_folder))

(5000, 5000)

In [25]:
len(os.listdir(valid_pos_reviews_folder)), len(os.listdir(valid_neg_reviews_folder))

(7500, 7500)

In [47]:
# use tf.data to create an efficient dataset for each set
# tf.data.Dataset.from_tensor_slices(tf.constant(reviews), tf.constant(labels))

def create_dataset(positive_reviews_dir, negative_reviews_dir):
    """Build a tf.data.Dataset of (review text, label) pairs from two folders.

    Each regular, non-hidden file in ``positive_reviews_dir`` is read as one
    review with label 1, and each one in ``negative_reviews_dir`` as one review
    with label 0. All file contents are loaded into memory before the dataset
    is created.

    Args:
        positive_reviews_dir: folder holding one positive review per file.
        negative_reviews_dir: folder holding one negative review per file.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(review, label)`` with a
        string scalar and an int32 scalar. Positive reviews come first, then
        negative ones, each group in sorted file-name order.
    """
    reviews, labels = [], []

    def iterate_through_files(my_dir, label):
        # os.listdir order is arbitrary; sorting makes the dataset order stable.
        for filename in sorted(os.listdir(my_dir)):
            filepath = os.path.join(my_dir, filename)
            # Skip hidden entries and sub-directories (e.g. '.DS_Store',
            # '.ipynb_checkpoints'), which are not reviews.
            if filename.startswith('.') or not os.path.isfile(filepath):
                continue
            # Pin the encoding instead of relying on the platform default.
            # NOTE(review): assumes the review files are UTF-8 -- confirm.
            with open(filepath, encoding='utf-8') as f:
                reviews.append(f.read())
            labels.append(label)

    iterate_through_files(positive_reviews_dir, 1)
    iterate_through_files(negative_reviews_dir, 0)

    # Explicit dtypes keep the element types stable even when no file was read
    # (an empty Python list would otherwise become a float32 tensor).
    return tf.data.Dataset.from_tensor_slices(
        (tf.constant(reviews, dtype=tf.string),
         tf.constant(labels, dtype=tf.int32)))
    

In [48]:
trainset = create_dataset(train_pos_reviews_folder, train_neg_reviews_folder)
testset = create_dataset(test_pos_reviews_folder, test_neg_reviews_folder)
validset = create_dataset(valid_pos_reviews_folder, valid_neg_reviews_folder)

In [49]:
for review, label in trainset.take(1):
    print(review)
    print(label)

tf.Tensor(b'For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.', shape=(), dtype=string)
tf.Tensor(1, shape=(), dtype=int32)


In [51]:
# Use the full dataset length as the shuffle buffer size.
shuffle_len = len(trainset)

trainset = trainset.shuffle(buffer_size=shuffle_len)

In [52]:
for review, label in trainset.take(5):
    print(review, label)
    print('-'*20)

tf.Tensor(b"Ya I rented it, so shoot me!<br /><br />A decent premise sets up an otherwise awkward story with no real payoff, but at least it's shot well. Director Jon Keeyes takes the simple idea of a fake haunted house with real danger inside. In most cases this should be a slam dunk, but this little stinker derails quite quickly. The cinematography is above average and the acting is mediocre at best, but the story and writing is just plain awful. Slower scenes drag on forever and the scares are too few and far in between with no real climax to the film. An eerie mood is set at the beginning but loses it's luster before any type of horror transpires, and I found myself bored to death and making another sandwich... The cover art is appealing and I suppose it's worth a rental if you're looking for mindless low budget dreck, but if you enjoy a good story and eventful ending, reach for something else.", shape=(), dtype=string) tf.Tensor(0, shape=(), dtype=int32)
--------------------
tf.Te

In [54]:
""" If the datasets didn't fit in memory, we would have to preprocess the input files
    (e.g., converting them to TFRecords). For very large datasets, it would make
    sense to use a tool like Apache Beam or Spark for that."""

# Group every split into mini-batches and prefetch one batch ahead.
batch_size = 32
trainset, validset, testset = [
    split.batch(batch_size).prefetch(1)
    for split in (trainset, validset, testset)
]