In [1]:
import nltk
import re
import numpy
import mxnet as mx
import numpy as np
import sys, os
import random
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/singh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import logging
import sys
root_logger = logging.getLogger()
stdout_handler = logging.StreamHandler(sys.stdout)
root_logger.addHandler(stdout_handler)
root_logger.setLevel(logging.DEBUG)

In [3]:
def preprocess_tweet(input_text):
    '''
    Input: The input string read directly from the file
    
    Output: Pre-processed tweet text
    '''   
    
    link_re = r"http\S+"
    mention_re = r"@\S+"
    emoji_uni_re = r'[\U00010000-\U0010ffff]'
    braces_re = r'[()]'
    del_re = r'|'.join((link_re, mention_re, emoji_uni_re, braces_re))
    ct = re.sub(del_re, '', input_text)
    ct = re.sub(r'[^\x00-\x7F]+','', ct).strip().split()
    cleaned_text = []
    for word in ct:
        if word[-1] == ')':
            word = word[ : -1]
        if len(word) == 0:
            continue
        if word[0] == '(':
            word = word[1 : ]
        if len(word) == 0:
            continue
        if word[0] == '#':
            if len(word) == 1:
                continue
            word = word[1 : ]
            sp = re.findall(r'[0-9A-Z]?[0-9a-z]+|[A-Z]+(?=[0-9A-Z]|$)', word)
            sp = [w.lower() for w in sp]
            cleaned_text += sp
        else:
            cleaned_text.append(word.lower())
    return cleaned_text

In [4]:
file=open('cancer_data.tsv')
pos_data=[]
neg_data=[]
words = []
for line in file:
    line=line.strip().split('\t')
    text2 = preprocess_tweet(line[0])
    if line[1] == 'yes':
        pos_data.append(text2)
    if line[1] == 'no':
        neg_data.append(text2)

In [5]:
print(len(pos_data), len(neg_data))     

sentences = list(pos_data)
sentences.extend(neg_data)
pos_labels = [1 for _ in pos_data]
neg_labels = [0 for _ in neg_data]
y = list(pos_labels)
y.extend(neg_labels)
y = np.array(y)

208 1298


In [6]:
def create_word_vectors(sentences):
    '''
    Input: List of sentences
    Output: List of word vectors corresponding to each sentence, vocabulary
    '''
    max_length = max([len(a) for a in sentences])
    word_vectors = []
    vocabulary = {}
    count = 1
    for sentence in sentences:
        word_vectors.append(sentence + ["</s>"] * (max_length - len(sentence)))
        for word in sentence:
            if word not in vocabulary:
                vocabulary[word] = count
                count = count + 1
    vocabulary["</s>"] = count
    wv = []
    for word_vector in word_vectors:
        wv.append([vocabulary[word] for word in word_vector])
    return np.array(wv), vocabulary

In [9]:
x, vocabulary = create_word_vectors(sentences)
print(x.shape)
vocab_size = len(vocabulary)
sent_size = x.shape[1]
print(x_train.shape)

(1506, 118)
(1204, 118)


In [8]:
def create_shuffle(x,y):
    '''
    Create an equal distribution of the positive and negative examples. 
    Please do not change this particular shuffling method.
    '''
    pos_len= len(pos_data)
    neg_len= len(neg_data)
    pos_len_train= int(0.8*pos_len)
    neg_len_train= int(0.8*neg_len)
    train_data= [(x[i],y[i]) for i in range(0, pos_len_train)]
    train_data.extend([(x[i],y[i]) for i in range(pos_len, pos_len+ neg_len_train )])
    test_data=[(x[i],y[i]) for i in range(pos_len_train, pos_len)]
    test_data.extend([(x[i],y[i]) for i in range(pos_len+ neg_len_train, len(x) )])
    
    random.shuffle(train_data)
    x_train=[i[0] for i in train_data]
    y_train=[i[1] for i in train_data]
    random.shuffle(test_data)
    x_test=[i[0] for i in test_data]
    y_test=[i[1] for i in test_data]
    
    x_train=np.array(x_train)
    y_train=np.array(y_train)
    x_test= np.array(x_test)
    y_test= np.array(y_test)
    return x_train, y_train, x_test, y_test

x_train, y_train, x_test, y_test= create_shuffle(x,y)

In [163]:
def create_sym(sent_size, embed_size=200, filter_list=[2, 3, 4, 5], num_filter=100, dropout=0.0, batch_size=20) :
    input_x = mx.sym.Variable('data')
    input_y = mx.sym.Variable('softmax_label')
    embed_layer = mx.sym.Embedding(data=input_x,
                                  input_dim=vocab_size,
                                  output_dim=embed_size,
                                  name='vocab_embed')
    conv_input = mx.sym.Reshape(data=embed_layer,
                               target_shape=(batch_size, 1, sent_size, embed_size))
    pool_outs = []
    for i, filter_size in enumerate(filter_list):
        convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, embed_size), num_filter=num_filter)
        relui = mx.sym.Activation(data=convi, act_type='relu')
        pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sent_size - filter_size + 1, 1), stride=(1, 1))
        pool_outs.append(pooli)
    tot_filters = num_filter * len(filter_list)
    concat = mx.sym.Concat(*pool_outs, dim=1)
    h_pool = mx.sym.Reshape(data = concat, target_shape=(batch_size, tot_filters))
    if dropout > 0.0:
        h_drop = mx.sym.Dropout(data=h_pool, p=dropout)
    else:
        h_drop = h_pool
    
    fc_weight = mx.sym.Variable('fc_weight')
    fc_bias = mx.sym.Variable('fc_bias')
    
    fc = mx.sym.FullyConnected(data=h_drop, weight=fc_weight, bias=fc_bias, num_hidden=2)
    
    sm = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')
    
    return sm, ('data', ), ('softmax_label', )

In [164]:
def train(sym_data, train_iterator, test_iterator, names_data, names_label):
    module = mx.mod.Module(sym_data, data_names=names_data, label_names=names_label)
    module.fit(train_data=train_iterator,
              eval_data=test_iterator,
              eval_metric="acc",
              optimizer='RMSProp',
              optimizer_params={'learning_rate': 0.005},
              initializer=mx.initializer.Xavier(),
              num_epoch=10,
              batch_end_callback=mx.callback.Speedometer(20, 60)
              )
    return module

In [165]:
train_set = mx.io.NDArrayIter(x_train, y_train, batch_size=20)
test_set = mx.io.NDArrayIter(x_test, y_test, batch_size=20)
sym_data, names_data, names_label = create_sym(sent_size)



In [None]:
model = train(sym_data, train_set, test_set, names_data, names_label)

Epoch[0] Batch [60]	Speed: 168.85 samples/sec	accuracy=0.868852
Epoch[0] Batch [60]	Speed: 168.85 samples/sec	accuracy=0.868852
Epoch[0] Train-accuracy=0.868852
Epoch[0] Train-accuracy=0.868852
Epoch[0] Time cost=7.214
Epoch[0] Time cost=7.214
Epoch[0] Validation-accuracy=0.890625
Epoch[0] Validation-accuracy=0.890625
Epoch[1] Batch [60]	Speed: 162.77 samples/sec	accuracy=0.952459
Epoch[1] Batch [60]	Speed: 162.77 samples/sec	accuracy=0.952459
Epoch[1] Train-accuracy=0.952459
Epoch[1] Train-accuracy=0.952459
Epoch[1] Time cost=7.436
Epoch[1] Time cost=7.436


In [None]:
train_outs = model.predict(train_set)
test_outs = model.predict(test_set)

In [None]:
train_acc = 100 * sum(y_train == np.argmax(train_outs, axis=1).asnumpy()) / len(y_train) 
test_acc = 100 * sum(y_test == np.argmax(test_outs, axis=1).asnumpy()) / len(y_test) 
print(train_acc, test_acc)

In [None]:
def create_sym_ft(sent_size, embed_size=200, filter_list=[2, 3, 4, 5], num_filter=100, dropout=0.0, batch_size=20) :
    input_x = mx.sym.Variable('data')
    input_y = mx.sym.Variable('softmax_label')
    embed_layer = mx.contrib.text.embedding.create('fasttext', pretrained_file_name='wiki.simple.vec', vocabulary=vocabulary)
    conv_input = mx.sym.Reshape(data=embed_layer,
                               target_shape=(batch_size, 1, sent_size, embed_size))
    pool_outs = []
    for i, filter_size in enumerate(filter_list):
        convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, embed_size), num_filter=num_filter)
        relui = mx.sym.Activation(data=convi, act_type='relu')
        pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sent_size - filter_size + 1, 1), stride=(1, 1))
        pool_outs.append(pooli)
    tot_filters = num_filter * len(filter_list)
    concat = mx.sym.Concat(*pool_outs, dim=1)
    h_pool = mx.sym.Reshape(data = concat, target_shape=(batch_size, tot_filters))
    if dropout > 0.0:
        h_drop = mx.sym.Dropout(data=h_pool, p=dropout)
    else:
        h_drop = h_pool
    
    fc_weight = mx.sym.Variable('fc_weight')
    fc_bias = mx.sym.Variable('fc_bias')
    
    fc = mx.sym.FullyConnected(data=h_drop, weight=fc_weight, bias=fc_bias, num_hidden=2)
    
    sm = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')
    
    return sm, ('data', ), ('softmax_label', )

In [None]:
from mxnet.contrib import text
import itertools
from collections import Counter


In [None]:
counts = Counter(itertools.chain(*sentences))
my_vocab = text.vocab.Vocabulary(counts)
my_embedding = text.embedding.create('fasttext', pretrained_file_name='wiki.simple.vec', vocabulary=my_vocab)

In [None]:
embeds = [my_embedding.get_vecs_by_tokens(sentence) for sentence in sentences]

In [None]:
embeds = np.array(embeds)

In [None]:
embeds

In [None]:
re.sub(r'[()]', '','I wanted to (say) something(s) (but I cannot)')