In [0]:
import re
def token(sentence, remove_vowels=False, remove_repeat=False, minchars=2):
    tokens = []
#   for t in re.findall("[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\w]+",sentence.lower()):
    for t in re.findall("[a-zA-Z]+",sentence.lower()):

        if len(t)>=minchars:
            if remove_vowels:
                t=removeVovels(t)
            if remove_repeat:
                t=removeRepeat(t)
            tokens.append(t)
    return tokens

VOWELS = ['a', 'e', 'i', 'o', 'u']

def removeRepeat(string):
    return re.sub(r'(.)\1+', r'\1\1', string)     

def removeVovels(string):
    return ''.join([l for l in string.lower() if l not in VOWELS])

if __name__ == '__main__':
    pass

def normalize_matrix(matrix):
    pass

In [0]:
import numpy as np
import h5py
import pickle
from copy import deepcopy
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.preprocessing import sequence
from keras import backend as K
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.utils import np_utils

In [0]:
################# GLOBAL VARIABLES #####################
#Filenames
#TODO: Add to coding conventions that directories are to always end with '/'
Masterdir = '/content/drive/My Drive/CMSA/'
Datadir = 'Data/'
Modeldir = 'Models/'
Featuredir = 'Features/'
inputdatasetfilename = 'IIITH_Codemixed.txt'
exp_details = 'new_experiment'

#Data I/O formatting
SEPERATOR = '\t'
DATA_COLUMN = 1
LABEL_COLUMN = 3
LABELS = ['0','1','2'] # 0 -> Negative, 1-> Neutral, 2-> Positive
mapping_char2num = {}
mapping_num2char = {}
MAXLEN = 200

#LSTM Model Parameters
#Embedding
MAX_FEATURES = 0
embedding_size = 128
# Convolution
filter_length = 3
nb_filter = 128
pool_length = 3
# LSTM
lstm_output_size = 128
# Training
batch_size = 128
number_of_epochs = 50
numclasses = 3
test_size = 0.2
########################################################

In [0]:
def parse(Masterdir,filename,seperator,datacol,labelcol,labels):
	"""
	Purpose -> Data I/O
	Input   -> Data file containing sentences and labels along with the global variables
	Output  -> Sentences cleaned up in list of lists format along with the labels as a numpy array
	"""
	#Reads the files and splits data into individual lines
	f=open(Masterdir+Datadir+filename,'r')
	lines = f.read().lower()
	lines = lines.lower().split('\n')[:-1]

	X_train = []
	Y_train = []
	
	#Processes individual lines
	for line in lines:
		# Seperator for the current dataset. Currently '\t'. 
		line = line.split(seperator)
		#Token is the function which implements basic preprocessing as mentioned in our paper
		tokenized_lines = token(line[datacol])
		
		#Creates character lists
		char_list = []
		for words in tokenized_lines:
			for char in words:
				char_list.append(char)
			char_list.append(' ')
		#print(char_list) - Debugs the character list created
		X_train.append(char_list)
		
		#Appends labels
		if line[labelcol] == labels[0]:
			Y_train.append(0)
		if line[labelcol] == labels[1]:
			Y_train.append(1)
		if line[labelcol] == labels[2]:
			Y_train.append(2)
	
	#Converts Y_train to a numpy array	
	Y_train = np.asarray(Y_train)
	assert(len(X_train) == Y_train.shape[0])

	return [X_train,Y_train]

In [0]:
def convert_char2num(mapping_n2c,mapping_c2n,trainwords,maxlen):
	"""
	Purpose -> Convert characters to integers, a unique value for every character
	Input   -> Training data (In list of lists format) along with global variables
	Output  -> Converted training data along with global variables
	"""
	allchars = []
	errors = 0

	#Creates a list of all characters present in the dataset
	for line in trainwords:
		try:
			allchars = set(allchars+line)
			allchars = list(allchars)
		except:
			errors += 1

	#print(errors) #Debugging
	#print(allchars) #Debugging 

	#Creates character dictionaries for the characters
	charno = 0
	for char in allchars:
		mapping_char2num[char] = charno
		mapping_num2char[charno] = char
		charno += 1

	assert(len(allchars)==charno) #Checks

	#Converts the data from characters to numbers using dictionaries 
	X_train = []
	for line in trainwords:
		char_list=[]
		for letter in line:
			char_list.append(mapping_char2num[letter])
		#print(no) -- Debugs the number mappings
		X_train.append(char_list)
	print(mapping_char2num)
	print(mapping_num2char)
	#Pads the X_train to get a uniform vector
	#TODO: Automate the selection instead of manual input
	X_train = sequence.pad_sequences(X_train[:], maxlen=maxlen)
	return [X_train,mapping_num2char,mapping_char2num,charno]

In [0]:
def RNN(X_train,y_train,args):
	"""
	Purpose -> Define and train the proposed LSTM network
	Input   -> Data, Labels and model hyperparameters
	Output  -> Trained LSTM network
	"""
	#Sets the model hyperparameters
	#Embedding hyperparameters
	max_features = args[0]
	maxlen = args[1]
	embedding_size = args[2]
	# Convolution hyperparameters
	filter_length = args[3]
	nb_filter = args[4]
	pool_length = args[5]
	# LSTM hyperparameters
	lstm_output_size = args[6]
	# Training hyperparameters
	batch_size = args[7]
	nb_epoch = args[8]
	numclasses = args[9]
	test_size = args[10] 

	#Format conversion for y_train for compatibility with Keras
	y_train = np_utils.to_categorical(y_train, numclasses) 
	#Train & Validation data splitting
	X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=test_size, random_state=42)
	
	#Build the sequential model
	# Model Architecture is:
	# Input -> Embedding -> Conv1D+Maxpool1D -> LSTM -> LSTM -> FC-1 -> Softmaxloss
	print('Build model...')
	model = Sequential()
	model.add(Embedding(max_features, embedding_size, input_length=maxlen))
	model.add(Convolution1D(nb_filter=nb_filter,
							filter_length=filter_length,
							border_mode='valid',
							activation='relu',
							subsample_length=1))
	model.add(MaxPooling1D(pool_length=pool_length))
	model.add(LSTM(lstm_output_size, dropout_W=0.2, dropout_U=0.2, return_sequences=True))
	model.add(LSTM(lstm_output_size, dropout_W=0.2, dropout_U=0.2, return_sequences=False))
	model.add(Dense(numclasses))
	model.add(Activation('softmax'))

	# Optimizer is Adamax along with categorical crossentropy loss
	model.compile(loss='categorical_crossentropy',
			  	optimizer='adamax',
			  	metrics=['accuracy'])
	

	print('Train...')
	#Trains model for 50 epochs with shuffling after every epoch for training data and validates on validation data
	model.fit(X_train, y_train, 
			  batch_size=batch_size, 
			  shuffle=True, 
			  nb_epoch=nb_epoch,
			  validation_data=(X_valid, y_valid))
	return model

In [0]:
def save_model(Masterdir,filename,model):
	"""
	Purpose -> Saves Keras model files to the given directory
	Input   -> Directory and experiment details to be saved and trained model file
	Output  -> Nil
	"""
	#Referred from:- http://keras.io/getting-started/faq/#how-can-i-save-a-keras-model
	model.save_weights(Masterdir+'Models/LSTM_'+filename+'_weights.h5')
	json_string = model.to_json()
	f = open(Masterdir+'Models/'+'LSTM_'+filename+'_architecture.json','w')
	f.write(json_string)
	f.close()

In [0]:
def get_activations(model, layer, X_batch):
	"""
	Purpose -> Obtains outputs from any layer in Keras
	Input   -> Trained model, layer from which output needs to be extracted & files to be given as input
	Output  -> Features from that layer 
	"""
	#Referred from:- TODO: Enter the forum link from where I got this
	get_activations = K.function([model.layers[0].input, K.learning_phase()], [model.layers[layer].output,])
	activations = get_activations([X_batch,0])
	return activations

In [0]:
def evaluate_model(X_test,y_test,model,batch_size,numclasses):
	"""
	Purpose -> Evaluate any model on the testing data
	Input   -> Testing data and labels, trained model and global variables
	Output  -> Nil
	"""
	#Convert y_test to one-hot encoding
	y_test = np_utils.to_categorical(y_test, numclasses)
	#Evaluate the accuracies
	score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
	print('Test score:', score)
	print('Test accuracy:', acc)

In [0]:
def save_data(Masterdir,filename,X_train,X_test,y_train,y_test,features_train,features_test):
	"""
	Purpose -> Saves train, test data along with labels and features in the respective directories in the folder
	Input   -> Train and test data, labels and features along with the directory and experiment details to be mentioned
	Output  -> Nil
	"""
	h5f = h5py.File(Masterdir+Datadir+'Xtrain_'+filename+'.h5', 'w')
	h5f.create_dataset('dataset', data=X_train)
	h5f.close()

	h5f = h5py.File(Masterdir+Datadir+'Xtest_'+filename+'.h5', 'w')
	h5f.create_dataset('dataset', data=X_test)
	h5f.close()

	output = open(Masterdir+Datadir+'Ytrain_'+filename+'.pkl', 'wb')
	pickle.dump([y_train], output)
	output.close()

	output = open(Masterdir+Datadir+'Ytest_'+filename+'.pkl', 'wb')
	pickle.dump([y_test], output)
	output.close()

	h5f = h5py.File(Masterdir+Featuredir+'features_train_'+filename+'.h5', 'w')
	h5f.create_dataset('dataset', data=features_train)
	h5f.close()

	h5f = h5py.File(Masterdir+Featuredir+'features_test_'+filename+'.h5', 'w')
	h5f.create_dataset('dataset', data=features_test)
	h5f.close()

In [15]:
if __name__ == '__main__':
	"""
	Master function
	"""
	print('Starting RNN Engine...\nModel: Char-level LSTM.\nParsing data files...')
	out = parse(Masterdir,inputdatasetfilename,SEPERATOR,DATA_COLUMN,LABEL_COLUMN,LABELS)
	X_train = out[0]
	y_train = out[1]
	print('Parsing complete!')

	print('Creating character dictionaries and format conversion in progess...')
	out = convert_char2num(mapping_num2char,mapping_char2num,X_train,MAXLEN)
	mapping_num2char = out[1]
	mapping_char2num = out[2]
	MAX_FEATURES = out[3]
	X_train = np.asarray(out[0])
	y_train = np.asarray(y_train).flatten()
	print('Complete!')
	
	print('Splitting data into train and test...')
	X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
	print('X_train shape:', X_train.shape)
	print('X_test shape:', X_test.shape)
	
	print('Creating LSTM Network...')
	model = RNN(deepcopy(X_train),deepcopy(y_train),[MAX_FEATURES, MAXLEN, embedding_size,\
			     filter_length, nb_filter, pool_length, lstm_output_size, batch_size, \
			     number_of_epochs, numclasses, test_size])

	print('Evaluating model...')
	evaluate_model(X_test,deepcopy(y_test),model,batch_size,numclasses)
	
	print('Feature extraction pipeline running...')
	activations = get_activations(model, 4, X_train)
	features_train = np.asarray(activations)
	activations = get_activations(model, 4, X_test)
	features_test = np.asarray(activations)
	print('Features extracted!')
	
	print('Saving experiment...')
	save_model(Masterdir,exp_details,model)
	save_data(Masterdir,exp_details,X_train,X_test,y_train,y_test,features_train,features_test)
	print('Saved! Experiment finished!')

Starting RNN Engine...
Model: Char-level LSTM.
Parsing data files...
Parsing complete!
Creating character dictionaries and format conversion in progess...
{'v': 0, 'm': 1, 'n': 2, 'i': 3, 'a': 4, 'j': 5, 'd': 6, 'z': 7, 'r': 8, 'y': 9, 'l': 10, 'b': 11, 'q': 12, 'w': 13, 'c': 14, 's': 15, 'o': 16, 'e': 17, 't': 18, 'g': 19, 'h': 20, 'x': 21, 'u': 22, 'k': 23, ' ': 24, 'f': 25, 'p': 26}
{0: 'v', 1: 'm', 2: 'n', 3: 'i', 4: 'a', 5: 'j', 6: 'd', 7: 'z', 8: 'r', 9: 'y', 10: 'l', 11: 'b', 12: 'q', 13: 'w', 14: 'c', 15: 's', 16: 'o', 17: 'e', 18: 't', 19: 'g', 20: 'h', 21: 'x', 22: 'u', 23: 'k', 24: ' ', 25: 'f', 26: 'p'}
Complete!
Splitting data into train and test...
X_train shape: (3103, 200)
X_test shape: (776, 200)
Creating LSTM Network...
Build model...





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.






Train...
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where







Train on 2482 samples, validate on 621 samples
Epoch 1/50





Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model...
Test score: 0.7921601423283213
Test accuracy: 0.6842783505154639
Feature extraction pipeline running...
Features extracted!
Saving experiment...


NameError: ignored

In [18]:
print('Saving experiment...')
save_model(Masterdir,exp_details,model)
save_data(Masterdir,exp_details,X_train,X_test,y_train,y_test,features_train,features_test)
print('Saved! Experiment finished!')

Saving experiment...
Saved! Experiment finished!
