In [1]:
import os
import random
import numpy as np

from collections import namedtuple

from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


# Movie reviews with one sentence per review

In [2]:
# Movie reviews with one sentence per review.
# https://www.cs.cornell.edu/people/pabo/movie-review-data/

# fold 1: files tagged cv000 through cv099, in numerical order
# fold 2: files tagged cv100 through cv199, in numerical order
# fold 10: files tagged cv900 through cv999, in numerical order

x_folds = [list() for _ in range(10)]
y_folds = [list() for _ in range(10)]

for directory in ['MR/txt_sentoken/pos/','MR/txt_sentoken/neg/'] :
    for filename in os.listdir(directory) + os.listdir(directory):
        label = directory.split("/")[-2]        
        fold = int(filename.split("_")[0][2])
        with open(directory+filename,'r') as f_input:
            for line in f_input:
                x_folds[fold].append(line.strip())
                y_folds[fold].append(label)

In [3]:
# convert list of tokens/words to indexes
x_all = [sentence for fold in x_folds for sentence in fold]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_all)
sequences_train = tokenizer.texts_to_sequences(x_all)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 43296 unique tokens.


In [4]:
# get the max sentence lenght, needed for padding
max_input_lenght = max([len(x) for x in x_all])
print("Max. sequence lenght: ", max_input_lenght)

Max. sequence lenght:  887


In [5]:
# pad all the sequences of indexes to the 'max_input_lenght'
x_folds_padded = []
for fold in x_folds:
    tokenized_fold = tokenizer.texts_to_sequences(fold)
    x_folds_padded.append(pad_sequences(tokenized_fold, maxlen=max_input_lenght, padding='post', truncating='post'))

In [102]:
# Encode the labels, each must be a vector with dim = num. of possible labels
le = LabelEncoder()
le.fit(y_folds[0])

y_folds_categ = []

for y_fold in y_folds:
    y_categ = le.transform(y_fold)
    y_folds_categ.append(to_categorical(y_categ, num_classes=None))