In [1]:
import os
import re
from collections import Counter
from string import punctuation
import numpy as np
import pandas as pd
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tree import Tree
from sklearn.linear_model import LogisticRegression
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences

  from ._conv import register_converters as _register_converters


In [2]:
pos_dir = 'train/pos'
neg_dir = 'train/neg'
test_dir = 'test'
pos_fs = os.listdir(pos_dir)
pos_fs = list(filter(lambda x: '.txt' in x, pos_fs))
neg_fs = os.listdir(neg_dir)
neg_fs = list(filter(lambda x: '.txt' in x, neg_fs))
test_fs = os.listdir(test_dir)
test_fs = list(filter(lambda x: '.txt' in x, test_fs))

pos_n = len(pos_fs)
neg_n = len(neg_fs)
test_n = len(test_fs)
print(pos_n, neg_n, test_n)

12500 12500 11000


In [3]:
trainX, trainY = None, None

# Fetch rates
rates_pos = np.array(list(map(lambda x: int(re.search('_([0-9]+).txt', x).group(1)), pos_fs)))
rates_neg = np.array(list(map(lambda x: int(re.search('_([0-9]+).txt', x).group(1)), neg_fs)))

word_set = set()
word_freq = Counter()
word_int = dict()

train_token_list = []
test_token_list = []


In [5]:
unknown_threshold = 3

In [6]:
# Handle Special words case
special = {
    'unknown': '<unk>',
    'number': '<nmb>',
    'punctuation': '<pun>',
    'or': '<or>',
    'rate': '<rat>',
}

puncs = {
    '.': '<dot>',
    ',': '<com>',
    '!': '<exc>',
    '?': '<que>',
    '"': '<quo>',
    '``': '<quo>',
    '\'\'': '<quo>'
}

# Use ne_chunk to handle NER
ner_symbols = {
    'FACILITY': '<fac>',
    'GPE': '<gpe>',
    'GSP': '<gsp>',
    'LOCATION': '<loc>',
    'ORGANIZATION': '<org>',
    'PERSON': '<per>'
}

In [7]:
# Word preprocessing
word_lemmatizer = WordNetLemmatizer()


def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']


def penn_to_wn(tag):
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return None


def lemmatize(word_lemmatizer, pos_tagged):
    words = [x[0] for x in pos_tagged]
    poses = [x[1] for x in pos_tagged]
    wns = list(map(lambda x: penn_to_wn(x), poses))
    lemmatized = list(map(lambda args: (args[0] if args[2] is None 
                    else word_lemmatizer.lemmatize(args[0], args[2]), args[1]), zip(words, poses, wns)))
    return lemmatized

In [8]:
def get_ner(pos_tagged):
    chunks = ne_chunk(pos_tagged)
    idx = 0
    ner_info = []
    ner_name = None
    for i in chunks:
        step = 1
        if type(i) == Tree:
            info = str(i)[1:-1].split()
            cur_ner = info[0]
            step = len(info[1:])
            # Consolidate consecutive same ner    
            if cur_ner == ner_name:
                ner_info[-1][2] += step
            else:
                ner_info.append([cur_ner, idx, idx+step])
            ner_name = cur_ner
        else:
            ner_name = None
        idx += step

    ner_dict = {l:(r, ner) for ner, l, r in ner_info}
    return ner_dict


def replace_ner(pos_tagged, ner_dict, ner_symbols):
    modified = []
    idx = 0
    while idx < len(pos_tagged):
        t, p = pos_tagged[idx]
        if idx in ner_dict:
            r, ner = ner_dict[idx]
            modified.append((ner_symbols[ner], p))
            idx = r
        else:
            modified.append((t, p))
            idx += 1
    return modified

In [9]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

    
def is_punc(s):
    is_punc = True
    for c in s:
        if c not in punctuation:
            return False
    return True 


def sent_to_tokens(sent, tokens=None):
    if not tokens:
        tokens = word_tokenize(sent.strip().lower())
    res = []
    for t in tokens:
        if is_number(t):
            res.append(special['number'])
        elif t in puncs:
            res.append(puncs[t])
        elif is_punc(t):
            res.append(special['punctuation'])
        elif re.match(r'[0-9]/[0-9]', t) is not None:
            res.append(special['rate'])
        elif re.match(r'[0-9]+/[0-9]+', t) is not None:
            res.append('<nmb>/<nmb>')
        elif t.find('/') != len(t)-1 and t.find('/') != 0:
            for st in t.split('/'):
                res.append(st)
                res.append(special['or'])
            del res[-1]
        else:
            res.append(t)
    res2 = []
    for t in res:
        if t.find('.') !=len(t)-1 and t.find('.') != 0:      
            for st in t.split('.'):
                res2.append(st)
                res2.append(special['or'])
            del res2[-1]
        else:
            res2.append(t)
    
    res3 = []
    for t in res2:
        if t.find('-') !=len(t)-1 and t.find('-') != 0:
            s = ''
            for st in t.split('-'):
                if is_number(st):
                    s += special['number']
                else:
                    s += st
                s+='-'
            s = s[:-1]
            res3.append(s)
        else:
            res3.append(t)
    
    res4 = []
    for t in res3:
        if t.find('~') !=len(t)-1 and t.find('~') != 0:
            s = ''
            for st in t.split('~'):
                if is_number(st):
                    s += special['number']
                else:
                    s += st
                s+='~'
            s = s[:-1]
            res4.append(s)
        else:
            res4.append(t) 
    return res4


def process_sent(sent):
    # Preprocess sentence
    
    # Remove html tags 
    text = re.sub(r'<[a-zA-Z\s]*/>', ' ', sent)
    # Replace single quotes with double quotes
    text = re.sub(r'\'([a-zA-Z0-9][a-zA-Z0-9\s]*[a-zA-Z0-9])\'', r'"\1"', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # POS Tagging
    pos_tagged = pos_tag(tokens)
    
    # Extract NER 
    ner_dict = get_ner(pos_tagged)
    
    # Replace NER with unified symbols
    modified_pos_tagged = replace_ner(pos_tagged, ner_dict, ner_symbols)
    
    # Lemmatize (verb present tense, plural->singular etc.)
    lemmatized = lemmatize(word_lemmatizer, modified_pos_tagged)
    lemmed_tokens = [x[0].lower() for x in lemmatized] 
    # Number, punctuation, etc. special case handling
    
    res = sent_to_tokens(sent, tokens=lemmed_tokens)
    return res

In [10]:
sent = """Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!"""
# Remove html tags 
# sent = re.sub(r'<[a-zA-Z\s]*/>', ' ', sent)
#     # Replace single quotes with double quotes
# sent = re.sub(r'\'([a-zA-Z0-9][a-zA-Z0-9\s]*[a-zA-Z0-9])\'', r'"\1"', sent)
    
# tokened = word_tokenize(sent)
# pos_tagged = pos_tag(tokened)
# pos_tagged
process_sent(sent)

['<per>',
 '<org>',
 'be',
 'a',
 'cartoon',
 'comedy',
 '<dot>',
 'it',
 'run',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'program',
 'about',
 'school',
 'life',
 '<com>',
 'such',
 'as',
 '<quo>',
 'teachers',
 '<quo>',
 '<dot>',
 'my',
 '<nmb>',
 'year',
 'in',
 'the',
 'teaching',
 'profession',
 'lead',
 'me',
 'to',
 'believe',
 'that',
 '<per>',
 "'s",
 'satire',
 'be',
 'much',
 'closer',
 'to',
 'reality',
 'than',
 'be',
 '<quo>',
 'teachers',
 '<quo>',
 '<dot>',
 'the',
 'scramble',
 'to',
 'survive',
 'financially',
 '<com>',
 'the',
 'insightful',
 'student',
 'who',
 'can',
 'see',
 'right',
 'through',
 'their',
 'pathetic',
 'teacher',
 '<pun>',
 'pomp',
 '<com>',
 'the',
 'pettiness',
 'of',
 'the',
 'whole',
 'situation',
 '<com>',
 'all',
 'remind',
 'me',
 'of',
 'the',
 'school',
 'i',
 'know',
 'and',
 'their',
 'student',
 '<dot>',
 'when',
 'i',
 'saw',
 'the',
 'episode',
 'in',
 'which',
 'a',
 'student',
 'repeatedly',
 'try',
 'to',
 'burn

In [11]:
def process_files(dir, files, token_list, is_train=False):
    paths = list(map(lambda x: os.path.join(dir, x), files))
    for p in paths:
        with open(p, 'r') as f:
            review = f.read()
            tokens = process_sent(review)
            token_list.append(tokens)
            if is_train:
                word_freq.update(Counter(tokens))
            
            
process_files(pos_dir, pos_fs, train_token_list, True)
process_files(neg_dir, neg_fs, train_token_list, True)
process_files(test_dir, test_fs, test_token_list, False)

In [14]:
len(test_token_list)

11000

In [12]:
def word_to_int(word_freq):
    word_int = dict()
    idx = 1
    for w in word_freq:
        if word_freq[w] <= unknown_threshold:
            word_int[w] = 0
        else:
            word_int[w] = idx
            idx += 1
    for w in word_int:
        if word_int[w] == 0:
            word_int[w] = idx
    return word_int

word_int = word_to_int(word_freq)
len(word_int)

68832

In [42]:
print(train_token_list[12500])

['story', 'of', 'a', 'man', 'who', 'have', 'unnatural', 'feeling', 'for', 'a', 'pig', '<dot>', 'starts', 'out', 'with', 'a', 'opening', 'scene', 'that', 'be', 'a', 'terrific', 'example', 'of', 'absurd', 'comedy', '<dot>', 'a', 'formal', 'orchestra', 'audience', 'be', 'turn', 'into', 'an', 'insane', '<com>', 'violent', 'mob', 'by', 'the', 'crazy', 'chanting', 'of', 'it', "'s", 'singer', '<dot>', 'unfortunately', 'it', 'stay', 'absurd', 'the', '<org>', 'time', 'with', 'no', 'general', 'narrative', 'eventually', 'make', 'it', 'just', 'too', 'off', 'put', '<dot>', 'even', 'those', 'from', 'the', 'era', 'should', 'be', 'turn', 'off', '<dot>', 'the', 'cryptic', 'dialogue', 'would', 'make', '<per>', 'seem', 'easy', 'to', 'a', 'third', 'grader', '<dot>', 'on', 'a', 'technical', 'level', 'it', "'s", 'good', 'than', 'you', 'might', 'think', 'with', 'some', 'good', 'cinematography', 'by', 'future', 'great', '<per>', '<dot>', '<per>', 'star', '<per>', 'and', '<per>', 'can', 'be', 'see', 'briefly',

In [51]:
max_idx = 0
for k, v in word_int.items():
    if v > max_idx:
        max_idx = v
print('Max index (Vocabulary size) is {0}'.format(max_idx))

Max index (Vocabulary size) is 21582


In [18]:
def token_to_sequence(tokens, word_int, max_idx):
    res = []
    for t in tokens:
        if t not in word_int:
            res.append(max_idx)
        else:
            res.append(word_int[t])
    return res


def batch_sequence(token_list):
    x = []
    for tokens in token_list:
        x.append(token_to_sequence(tokens, word_int, max_freq))
    return x

In [19]:
trainX = batch_sequence(train_token_list)

testX = batch_sequence(test_token_list)

trainY = np.append(np.ones(pos_n, dtype=np.int), np.zeros(neg_n, dtype=np.int))

assert len(trainX) == pos_n+neg_n
assert len(testX) == test_n
assert len(trainY) == pos_n+neg_n

In [20]:
# Padding post zeros for features
# One-hot encoding for Label
trainX = pad_sequences(trainX)
n, d = trainX.shape
testX = pad_sequences(testX, maxlen=d)

nb_classes = len(np.unique(trainY))
trainY = to_categorical(trainY, nb_classes=nb_classes)

In [23]:
# Shuffle the train data
indices = np.arange(n)
np.random.shuffle(indices)
trainX = trainX[indices]
trainY = trainY[indices]

In [44]:
# Cross validation
split_frac = 0.8
split_index = int(n * split_frac)

train_X = trainX[:split_index]
val_X = trainX[split_index:]
test_X = testX

train_Y = trainY[:split_index]
val_Y = trainY[split_index:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_X.shape), 
      "\nValidation set: \t{}".format(val_X.shape),
      "\nTest set: \t\t{}".format(test_X.shape))
print("label set: \t\t{}".format(train_Y.shape), 
      "\nValidation label set: \t{}".format(val_Y.shape))

			Feature Shapes:
Train set: 		(20000, 2649) 
Validation set: 	(5000, 2649) 
Test set: 		(11000, 2649)
label set: 		(20000, 2) 
Validation label set: 	(5000, 2)


In [53]:
# Network parameter dimensions setup

lstm_size = [256, 512]
lstm_layers = 2
batch_size = 500
learning_rate = 0.001
embedding_dim = 300
n, d = train_X.shape

In [55]:
# Network building

net = tflearn.input_data([None, d])
net = tflearn.embedding(net, input_dim=max_idx, output_dim=embedding_dim)
net = tflearn.lstm(net, n_units=lstm_size[0], dropout=0.8, return_seq=True, weights_init='xavier')
net = tflearn.lstm(net, n_units=lstm_size[1], dropout=0.8, weights_init='xavier')
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001, loss='categorical_crossentropy')

# Training
with tf.device('/gpu:0'):
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(train_X, train_Y, validation_set=(val_X, val_Y), show_metric=True,
          batch_size=256, snapshot_epoch=True) 


KeyboardInterrupt: 