In [26]:
import nltk
import re
import numpy as np
import mxnet as mx
import sys, os
import random

from collections import Counter
import itertools
from collections import namedtuple
import math
import time

'''
First job is to clean and preprocess the social media text. (5)

1) Replace URLs and mentions (i.e strings which are preceeded with @)
2) Segment #hastags 
3) Remove emoticons and other unicode characters
'''

def preprocess_tweet(input_text):
    '''
    Input: The input string read directly from the file
    
    Output: Pre-processed tweet text
    '''
    text = input_text
    text = re.sub(r'https?://[A-Za-z0-9./]+','',line[0])
    text = re.sub(r'@[A-Za-z0-9]+','',text)
    text = re.sub(r'RT','',text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    text = re.sub("[^a-zA-Z]", " ", text)
    cleaned_text = re.sub('(?!^)([A-Z][a-z]+)', r' \1', text)
    return cleaned_text.lower()


# read the input file and create the set of positive examples and negative examples. 

file=open('cancer_data.tsv')
file_out = open("cancer_data_processed.tsv", "a")
pos_data=[]
neg_data=[]

for line in file:
    line=line.strip().split('\t')
    text2= preprocess_tweet(line[0]).strip().split()
    if line[1]=='yes':
        pos_data.append(text2)
    if line[1]=='no':
        neg_data.append(text2)

for line in file:
    line=line.strip().split('\t')
    line[0]= preprocess_tweet(line[0])
    file_out.write(line[0]+'\t'+line[1])

corpus_file = datapath('/home/jaswanth/Coding/DL-2019/assignment_3/cancer_data_processed.tsv')
model_gensim = FT_gensim(size=word_embed_size)
model_gensim.build_vocab(corpus_file=corpus_file)

model_gensim.train(
    corpus_file=corpus_file, epochs=model_gensim.epochs,
    total_examples=model_gensim.corpus_count, total_words=model_gensim.corpus_total_words
)

print(len(pos_data), len(neg_data))     

sentences= list(pos_data)
sentences.extend(neg_data)
pos_labels= [1 for _ in pos_data]
neg_labels= [0 for _ in neg_data]
y=list(pos_labels)
y.extend(neg_labels)
y=np.array(y)

'''
After this you will obtain the following :

1) sentences =  List of sentences having the positive and negative examples with all the positive examples first
2) y = List of labels with the positive labels first.
'''

'''
Before running the CNN there are a few things one needs to take care of: (5)

1) Pad the sentences so that all of them are of the same length
2) Build a vocabulary comprising all unique words that occur in the corpus
3) Convert each sentence into a corresponding vector by replacing each word in the sentence with the index in the vocabulary. 

Example :
S1 = a b a c
S2 = d c a 

Step 1:  S1= a b a c, 
         S2 =d c a </s> 
         (Both sentences are of equal length). 

Step 2:  voc={a:1, b:2, c:3, d:4, </s>: 5}

Step 3:  S1= [1,2,1,3]
         S2= [4,3,1,5]

'''
sequence_length = max(len(x) for x in sentences)
word_vectors = np.zeros((len(sentences),sequence_length,word_embed_size))
def create_word_vectors(sentences):
    '''
    Input: List of sentences
    Output: List of word vectors corresponding to each sentence, vocabulary
    '''
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [""] * num_padding
        padded_sentences.append(new_sentence)
    
    for i in range(len(sentences)):
        for j in range(sequence_length):
            word_vectors[i][j] = model_gensim.wv[sentences[i][j]]
    return word_vectors

x = create_word_vectors(sentences)

def create_shuffle(x,y):
    '''
    Create an equal distribution of the positive and negative examples. 
    Please do not change this particular shuffling method.
    '''
    pos_len= len(pos_data)
    neg_len= len(neg_data)
    pos_len_train= int(0.8*pos_len)
    neg_len_train= int(0.8*neg_len)
    train_data= [(x[i],y[i]) for i in range(0, pos_len_train)]
    train_data.extend([(x[i],y[i]) for i in range(pos_len, pos_len+ neg_len_train )])
    test_data=[(x[i],y[i]) for i in range(pos_len_train, pos_len)]
    test_data.extend([(x[i],y[i]) for i in range(pos_len+ neg_len_train, len(x) )])
    
    random.shuffle(train_data)
    x_train=[i[0] for i in train_data]
    y_train=[i[1] for i in train_data]
    random.shuffle(test_data)
    x_test=[i[0] for i in test_data]
    y_test=[i[1] for i in test_data]
    
    x_train=np.array(x_train)
    y_train=np.array(y_train)
    x_test= np.array(x_test)
    y_test= np.array(y_test)
    return x_train, y_train, x_test, y_test

x_train, y_train, x_test, y_test= create_shuffle(x,y)
sentence_size = x_train.shape[1]

epoch = 5
batch_size = 20
word_embed_size = 200
filters = [2,3,4,5]
lr = 0.005
num_filter = 100

input_x = mx.sym.Variable('data')
input_y = mx.sym.Variable('softmax_label')

conv_input = mx.sym.Reshape(data=input_x, shape=(batch_size, 1, sentence_size, word_embed_size))

pooled_outputs = []
for filter_size in filters:
    convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size,word_embed_size), num_filter=num_filter)
    relui = mx.sym.Activation(data=convi, act_type='relu')
    pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
    pooled_outputs.append(pooli)
    
total_filters = num_filter * len(filters)
concat = mx.sym.Concat(*pooled_outputs, dim=1)
h_pool = mx.sym.Reshape(data=concat, shape=(batch_size, total_filters))

num_labels = 2
class_weight = mx.sym.Variable('class_weight')
class_bias = mx.sym.Variable('class_bias')

fully_connected = mx.sym.FullyConnected(data=h_pool, weight=class_weight, bias=class_bias, num_hidden=num_labels)
softmax_output = mx.sym.SoftmaxOutput(data=fully_connected, label=input_y, name='softmax')
CNN = softmax_output

CNNModel = namedtuple("CNNModel", ['cnn_exec', 'symbol', 'data', 'label', 'param_blocks'])
ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
arg_names = CNN.list_arguments()

input_shapes = {}
input_shapes['data'] = (batch_size, sentence_size)

arg_shape, out_shape, aux_shape = CNN.infer_shape(**input_shapes)
arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
args_grad = {}
for shape, name in zip(arg_shape, arg_names):
    if name in ['softmax_label', 'data']:
        continue
    args_grad[name] = mx.nd.zeros(shape, ctx)

cnn_exec = CNN.bind(ctx=ctx, args=arg_arrays, args_grad=args_grad, grad_req='add')

param_blocks = []
arg_dict = dict(zip(arg_names, cnn_exec.arg_arrays))
initializer = mx.initializer.Uniform(0.1)
for i, name in enumerate(arg_names):
    if name in ['softmax_label', 'data']: 
        continue
    initializer(mx.init.InitDesc(name), arg_dict[name])

    param_blocks.append( (i, arg_dict[name], args_grad[name], name) )

data = cnn_exec.arg_dict['data']
label = cnn_exec.arg_dict['softmax_label']

cnn_model= CNNModel(cnn_exec=cnn_exec, symbol=CNN, data=data, label=label, param_blocks=param_blocks)

opt = mx.optimizer.create('rmsprop')
opt.lr = lr

max_grad_norm = 5.0

updater = mx.optimizer.get_updater(opt)

for iteration in range(epoch):
    tic = time.time()
    num_correct = 0
    num_total = 0

    for i in range(0, x_train.shape[0], batch_size):
        batch_X = x_train[i:i+batch_size]
        batch_Y = y_train[i:i+batch_size]
        
        if batch_X.shape[0] != batch_size:
            continue

        cnn_model.data[:] = batch_X
        cnn_model.label[:] = batch_Y

        cnn_model.cnn_exec.forward(is_train=True)

        cnn_model.cnn_exec.backward()

        num_correct += sum(batch_Y == np.argmax(cnn_model.cnn_exec.outputs[0].asnumpy(), axis=1))
        num_total += len(batch_Y)

        norm = 0
        for idx, weight, grad, name in cnn_model.param_blocks:
            grad /= batch_size
            l2_norm = mx.nd.norm(grad).asscalar()
            norm += l2_norm * l2_norm

        norm = math.sqrt(norm)
        for idx, weight, grad, name in cnn_model.param_blocks:
            if norm > max_grad_norm:
                grad *= (max_grad_norm / norm)
            updater(idx, grad, weight)
            grad[:] = 0.0

    toc = time.time()
    train_time = toc - tic
    train_acc = (num_correct * 100)/ float(num_total)

    if (iteration + 1) % 10 == 0:
        prefix = 'cnn'
        cnn_model.symbol.save('./%s-symbol.json' % prefix)
        save_dict = {('arg:%s' % k) : v  for k, v in cnn_model.cnn_exec.arg_dict.items()}
        save_dict.update({('aux:%s' % k) : v for k, v in cnn_model.cnn_exec.aux_dict.items()})
        param_name = './%s-%04d.params' % (prefix, iteration)
        mx.nd.save(param_name, save_dict)

        
    num_correct = 0
    num_total = 0

    for i in range(0, x_test.shape[0], batch_size):
        batch_X = x_test[i:i+batch_size]
        batch_Y = y_test[i:i+batch_size]

        if batch_X.shape[0] != batch_size:
            continue

        cnn_model.data[:] = batch_X
        cnn_model.cnn_exec.forward(is_train=False)

        num_correct += sum(batch_Y == np.argmax(cnn_model.cnn_exec.outputs[0].asnumpy(), axis=1))
        num_total += len(batch_Y)

    test_acc = (num_correct * 100)/float(num_total)
    print('Iter [%d] Train: Time: %.2fs, Training Accuracy: %.2f Test Accuracy: %.2f' %(iteration, train_time, train_acc, test_acc))



208 1298


IndexError: list index out of range