In [1]:
# Loading libraries

import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.utils import resample
%matplotlib inline


Using TensorFlow backend.


In [2]:
df=pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
# Tokenize words in sentence

# word_tokenize already returns a list of tokens, so it is stored directly.
df['clean_text'] = df['question_text'].apply(lambda x: word_tokenize(x.lower()))

In [5]:
word_list=np.hstack(df.clean_text) # combine list in column clean_text as 1 big list for building embedding vector

In [6]:
len(word_list) # total words

18866937

In [7]:
len(np.unique(word_list)) # no of unique words

263186

## Embedding Vector Creation using Word2Vec Implementation
###  Build the dictionary (rare words replaced with UNK token), and give each word an index

In [9]:
vocabularyN = 200000   # Number of distinct word ids kept (including the UNK id 0)

def build_dataset(words, vocabulary_size=None, unk_token='UNK'):
    """Map a flat sequence of tokens to integer ids, keeping only the most common words.

    Id 0 is reserved for `unk_token`; the remaining ids are assigned to the
    most frequent words in descending order of frequency.

    Parameters
    ----------
    words : iterable of str
        Every token of the corpus, in order.
    vocabulary_size : int, optional
        Total number of ids including the unknown id. Defaults to the
        notebook-level `vocabularyN`.
    unk_token : str, optional
        Name under which the unknown id is stored in the dictionary.

    Returns
    -------
    data : list of int
        `words` translated to ids (0 for out-of-vocabulary words).
    count : list
        `[unk_token, n_unknown]` followed by `(word, frequency)` pairs.
    dictionary : dict
        word -> id.
    reverse_dictionary : dict
        id -> word.
    """
    if vocabulary_size is None:
        vocabulary_size = vocabularyN

    wordCount = Counter(words)
    print ('length of wordCount',len(wordCount))
    # Keep the (vocabulary_size - 1) most frequent words; one slot is left for UNK.
    commonCount = wordCount.most_common(vocabulary_size - 1)
    print ('Most common words', commonCount[:4])

    count = [[unk_token, -1]]
    # A literal `unk_token` in the corpus must not get a second entry: it would
    # overwrite id 0 and make the following word share an id with it.
    count.extend(pair for pair in commonCount if pair[0] != unk_token)

    # Position in `count` is the word id, so ids are unique and contiguous.
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    # Words outside the vocabulary fall back to id 0 (UNK).
    data = [dictionary.get(word, 0) for word in words]
    unk_count = data.count(0)

    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

In [10]:
data, count, dictionary, reverse_dictionary = build_dataset(word_list)

length of wordCount 263186
Most common words [('?', 1381192), ('the', 665368), ('what', 470692), ('is', 446267)]


### Define a function that generates the training data for each batch, based on the skip-gram model.

In [15]:
import random
import collections

batchN = 60   # number of (center word, context word) pairs per training batch
num_skips = 2 # how many context words are drawn for each center word
skip_window = 1 # how many words to consider on each side of the center word

# Position in the corpus of the next word to read. It persists across calls so
# that successive batches keep moving through the corpus.
data_index = 0
def generate_training(batchN, num_skips, skip_window, corpus=None):
    """Build one skip-gram training batch.

    A sliding window of `2 * skip_window + 1` word ids is moved over the
    corpus. For each window position the center word is emitted `num_skips`
    times as input, each time paired with a different, randomly chosen word
    from the rest of the window as label.

    Parameters
    ----------
    batchN : int
        Number of (input, label) pairs to produce; must be a multiple of
        `num_skips`.
    num_skips : int
        Context words drawn per center word; at most `2 * skip_window`.
    skip_window : int
        Number of words considered on each side of the center word.
    corpus : sequence of int, optional
        Word ids to read from. Defaults to the notebook-level `data`.

    Returns
    -------
    train_x : np.ndarray, shape (batchN,), dtype int32
        Center word ids.
    labels : np.ndarray, shape (batchN, 1), dtype int32
        Context word ids.

    Raises
    ------
    ValueError
        If the corpus is empty.
    """
    global data_index  # read position is shared between calls

    if corpus is None:
        corpus = data

    assert batchN % num_skips == 0       # batch must split evenly into center words
    assert num_skips <= 2 * skip_window  # cannot draw more context words than the window holds

    corpus_len = len(corpus)
    if corpus_len == 0:
        raise ValueError("cannot generate a training batch from an empty corpus")
    # Guards against a position left over from a longer corpus.
    data_index %= corpus_len

    train_x = np.ndarray(shape=(batchN), dtype=np.int32)
    labels = np.ndarray(shape=(batchN, 1), dtype=np.int32)

    # Window size: words before + center word + words after.
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)

    # Fill the window. The index wraps here as well, otherwise a call that
    # starts close to the end of the corpus runs past the last element.
    for _ in range(span):
        buffer.append(corpus[data_index])
        data_index = (data_index + 1) % corpus_len

    # Every window position except the center is a candidate context word.
    context_positions = [pos for pos in range(span) if pos != skip_window]

    for i in range(batchN // num_skips):
        # Draw `num_skips` distinct context positions for this center word.
        for j, target in enumerate(random.sample(context_positions, num_skips)):
            train_x[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]

        # Slide the window one word to the right (the deque drops the oldest word).
        buffer.append(corpus[data_index])
        data_index = (data_index + 1) % corpus_len
    return train_x, labels



###  Create a neural network with 100 nodes on the hidden layer, define the loss function



In [18]:
import math

embeddingN = 100 # Dimension of the embedding vector.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction (ids are assigned in descending frequency order) are also the
# most frequent.
valid_size = 16 # Number of words to evaluate similarity on.
valid_window = 100 # Only pick validation words among the first 100 ids.
# This creates an array of valid_size distinct ids drawn from [0, valid_window).
# No seed is set, so the selection differs between runs.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 50  # Passed to sampled_softmax_loss as num_sampled; chosen arbitrarily.

"""Defines a Neural network architecture"""
graph = tf.Graph()
with graph.as_default():
    # Input data: a batch of center word ids and the matching context word ids.
    train_x = tf.placeholder(tf.int32, shape=[batchN])
    train_y = tf.placeholder(tf.int32, shape=[batchN, 1])
  
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
    # The first layer: the embedding matrix, one row of embeddingN values per
    # word id (vocabularyN x embeddingN), initialised uniformly in [-1, 1).
    layer1_weights = tf.Variable(tf.random_uniform([vocabularyN, embeddingN], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(layer1_weights, train_x)  # hidden layer node called "embed", batchN x embeddingN
  
    # The second layer, which is also the output layer.  
    weights = tf.Variable( tf.truncated_normal([vocabularyN, embeddingN], stddev=1.0 / math.sqrt(embeddingN)))
    biases = tf.Variable(tf.zeros([vocabularyN]))

    # Softmax loss of the output layer, approximated with num_sampled sampled
    # classes instead of all vocabularyN classes to reduce computation.
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights, biases, train_y, embed, num_sampled, vocabularyN))

    # Optimizer.
    #optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
  
    # Compute the similarity between the validation words and all rows of layer1_weights.
    # We use the cosine similarity, which is A*B/(|A||B|), i.e. (A/|A|) * (B/|B|).
    norm = tf.sqrt(tf.reduce_sum(tf.square(layer1_weights), 1, keep_dims=True))
    normalized_weights = layer1_weights / norm    # vocabularyN x embeddingN, every row has unit length
  
    # Look up the normalized embeddings of the validation words:
    # valid_size rows of embeddingN values each.
    valid_weights = tf.nn.embedding_lookup(normalized_weights, valid_dataset)
  
    # valid_weights is valid_size x embeddingN and the transposed matrix is
    # embeddingN x vocabularyN, so similarity is valid_size x vocabularyN.
    similarity = tf.matmul(valid_weights, tf.transpose(normalized_weights))

print ('done')

done


###  Start training, generating a new batch at each step

In [20]:
# Train the word2vec graph for num_steps batches, print the running loss every
# 2000 steps and the nearest neighbours of the validation words every 10000
# steps, then keep the normalized embedding matrix in final_embeddings.
num_steps = 50000
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print ("Initialized")
    average_loss = 0

    for step in range(num_steps):
        # Each call continues through the corpus from where the previous batch stopped.
        batch_data, batch_labels = generate_training(batchN, num_skips, skip_window)
        feed_dict = {train_x : batch_data, train_y : batch_labels}

        # One optimisation step; l is the loss of this batch.
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)

        average_loss += l

        #The following is only for inspection 
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches
            # (at step 0 it is just the loss of the first batch).
            print ("Average loss at step", step, ":", average_loss)
            average_loss = 0
            print ('step is ',step)

        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            # Evaluate the similarity matrix with the current embedding weights.
            sim = similarity.eval()
            if step > 0:
                print ('In 10000, step ',step)

            for i in range(valid_size):
                #print valid_examples[i]
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                # Sort by descending similarity; position 0 is skipped because
                # a word is always most similar to itself.
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = "%s %s," % (log, close_word)
                print (log)

    print ('before getting final embeddings')
    # Unit-length embedding rows, evaluated once after the last training step.
    final_embeddings = normalized_weights.eval()
    print ('Training done')

Initialized
Average loss at step 0 : 9.18068408966
step is  0
Nearest to will: sucessfull, cesar, self-spreading, sene, holiness, pytorch, 132,000, cumbric,
Nearest to much: 9th, pasteurizer, viswanathan, intp-as, bhascaryacharya, ^4+, fullback, icelandic,
Nearest to think: irresistibly, reregister, platter, litterature, larynx, brfl, fame, mass+velocity,
Nearest to want: fashawn, candidature, mobilizer, deos, axworthy, gablé, whitespace, kalani,
Nearest to between: amore, mice, dfl, rudyard, abstraxi, aramoana, multi-fare, infundibulum,
Nearest to work: trireme, avert, iit-hyderabad, ti-nspire, non-baptized, solidifying, sucicide, harvards,
Nearest to n't: micra, prepflash, 0.347, physcially, zoning, india.do, dynasty, internatonal,
Nearest to 's: usi-teck, .fundamentals, verniers, neon-20, interx, kambakht, touraments, 32y/o,
Nearest to can: meninism, economics​, 3pls, jharia, tongan, icsce, speaker/chairman, liberalisation,
Nearest to a: iima, ident, glute/butt, 'lagna, cullen, 0.03

In [21]:
print (len(final_embeddings))
print(final_embeddings.shape)

200000
(200000, 100)


In [26]:
### Save final_embedding ,dictionary, reverse_dictionary
np.save('dictionary.npy', dictionary) 


In [27]:

# The embeddings are unit-normalized floats, so an integer format ('%d') would
# truncate nearly every value to 0. '%.8e' keeps 9 significant digits, which is
# enough to round-trip float32 values.
np.savetxt('final_embeddings.txt', final_embeddings, fmt='%.8e')

In [28]:
def wordindex(x):
    """Return the id of token `x` in `dictionary`, or the id of 'UNK' if `x` is not a key."""
    return dictionary[x] if x in dictionary else dictionary['UNK']

In [29]:
# Making sure words are all in dictionary

df['clean_text_idx']=df['clean_text'].apply(lambda x:[wordindex(t) for t in x] )

In [30]:
# Maximum sentence size

max(df.clean_text.apply(len))

412

In [31]:
df.shape[0]

1306122

In [32]:
# Upsampling the minority class (target == 1) by drawing rows with replacement.
# NOTE(review): the resampled frame is padded and only afterwards split into
# train and test sets, so copies of the same minority row can end up in both
# splits. Consider splitting first and upsampling the training part only.

df_majority = df[df.target==0]
df_minority = df[df.target==1]
 
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000000,    # fixed size; not equal to the majority class size (1225312 in the output below)
                                 random_state=101) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.target.value_counts()

0    1225312
1    1000000
Name: target, dtype: int64

In [33]:
# Padding data to a fix size

data_pad=pad_sequences(df_upsampled['clean_text_idx'],maxlen=412)

In [34]:
data_pad.shape

(2225312, 412)

In [35]:
labels=df_upsampled['target']

In [36]:
labels=to_categorical(labels,num_classes=2)

In [37]:
np.savetxt('labels_word2vec_method.txt', labels, fmt='%d')

In [38]:
np.savetxt('data_pad_word2vec_method.txt', data_pad, fmt='%d')

## Model Creation

In [39]:
lstm_hidden_units=128  # number of hidden units in the LSTM cell
num_classes=2  # number of target classes (labels are one-hot encoded with 2 columns)
embedding_size=100  # embedding vector size (same value as embeddingN used for word2vec)
sentence_len=412  # max sentence length in dataset; also the padded sequence length
vocab_size=200000  # vocabulary size of the word2vec embeddings (same value as vocabularyN), not GloVe

In [45]:
def model():
    """Build the LSTM sentence classifier in the default TensorFlow graph.

    The padded word ids are mapped to the frozen word2vec embeddings
    (`final_embeddings`), run through a single LSTM layer, and the LSTM output
    at the last time step is projected to `num_classes` logits.

    Returns
    -------
    tuple
        `(optimizer, loss, x, y, accuracy, predictions, correct_predictions)`
        where `x` is the int32 input placeholder of shape
        [batch, sentence_len], `y` the float32 one-hot label placeholder of
        shape [batch, num_classes], `predictions` the logits, `loss` the mean
        softmax cross-entropy and `optimizer` the Adam training op.
    """
    x=tf.placeholder(tf.int32,shape=[None,sentence_len])  # input place holder
    y=tf.placeholder(tf.float32,shape=[None,num_classes]) # output place holder (one-hot labels)

    # Output projection from the LSTM hidden state to the class logits.
    w=tf.Variable(tf.random_normal([lstm_hidden_units,num_classes]))
    b=tf.Variable(tf.constant(0.1,shape=[num_classes]))

    # Embedding matrix initialised from the trained word2vec vectors and kept
    # fixed during training (trainable=False).
    Embedding = tf.get_variable(name="word_embedding", shape=[final_embeddings.shape[0],embedding_size],
                                                       initializer=tf.constant_initializer(final_embeddings),
                                                       trainable=False)
    embed_lookup=tf.nn.embedding_lookup(Embedding,x) # batch_size x sentence_length x embedding_size

    lstm_cell=tf.contrib.rnn.BasicLSTMCell(lstm_hidden_units) # create LSTM layer
    current_batch_size=tf.shape(x)[0]
    initial_state=lstm_cell.zero_state(current_batch_size,dtype=tf.float32) # state initialisations

    outputs, _ =tf.nn.dynamic_rnn(lstm_cell,embed_lookup,initial_state=initial_state,dtype=tf.float32) # batch_size x sen_length x hidden_units
    # Output at the last time step. Slicing replaces the former
    # transpose + tf.gather, whose sparse (IndexedSlices) gradient had to be
    # converted to a dense tensor and triggered a warning at graph build time.
    last=outputs[:,-1,:]  # batch_size x hidden_units

    predictions=tf.matmul(last,w)+b # batch_size x num_classes (logits)
    # argmax over the raw logits; a monotonic squashing function would not change it.
    correct_predictions=tf.equal(tf.argmax(predictions,axis=1),tf.argmax(y,axis=1))
    accuracy=tf.reduce_mean(tf.cast(correct_predictions,tf.float32))
    loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predictions,labels=y))
    optimizer=tf.train.AdamOptimizer(learning_rate=0.005).minimize(loss)

    return optimizer,loss,x,y,accuracy,predictions, correct_predictions

In [41]:
X_train,X_test, y_train,y_test=train_test_split(data_pad,labels,test_size=0.3,random_state=101)

In [46]:
tf.reset_default_graph()
optimizer,loss,x,y,accuracy,predictions, correct_predictions=model()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [47]:
batch_size=32
num_batches=len(X_train)//batch_size


In [48]:
# Train the LSTM classifier for one pass over X_train, report the loss and
# accuracy on a random sample of the test set every 500 batches, then save a
# checkpoint.
with tf.Session() as sesh:
    init=tf.group(tf.global_variables_initializer(),tf.local_variables_initializer())
    sesh.run(init)
    saver = tf.train.Saver()
    writer= tf.summary.FileWriter("logdir/", graph = sesh.graph)
    test_accuracy=[]
    test_p=[]  # not used in this cell
    test_r=[]  # not used in this cell
    try:
        for i in range(num_batches):
            if i !=num_batches-1:
                x_batch=X_train[i*batch_size:i*batch_size+batch_size]
                y_batch=y_train[i*batch_size:i*batch_size+batch_size]
            else:
                # The last batch also takes the rows left over by the floor division.
                x_batch=X_train[i*batch_size:]
                y_batch=y_train[i*batch_size:]

            _, l, a=sesh.run([optimizer,loss,accuracy],feed_dict={x:x_batch,y:y_batch})

            if i>0 and i % 500==0:
                # Evaluate on 200 randomly chosen test rows to monitor progress
                # while training (no optimizer op is run here).
                rand_idx = np.random.choice(np.arange(len(X_test)),200, replace=False)
                test_x = X_test[rand_idx]
                test_y = y_test[rand_idx]
                t_l, t_a=sesh.run([loss,accuracy],feed_dict={x:test_x,y:test_y})
                test_accuracy.append(t_a)
                print("Step",i,"of", num_batches,"loss",l,"accuracy",a)
                print("Test loss", t_l,"accuracy",t_a)

        if test_accuracy:
            print("Average accuracy", np.mean(test_accuracy))
        else:
            # Fewer than 501 batches: no evaluation was run, so there is nothing to average.
            print("Average accuracy not available: no test evaluation was run")
        # Forward slash keeps the checkpoint inside logdir on every platform,
        # consistent with the FileWriter directory above.
        saver.save(sesh, "logdir/lstm_model.ckpt")
    finally:
        # Close the event file even if training is interrupted.
        writer.flush()
        writer.close()
            

Step 500 of 48678 loss 0.432214 accuracy 0.78125
Test loss 0.398796 accuracy 0.8
Step 1000 of 48678 loss 0.403539 accuracy 0.8125
Test loss 0.385915 accuracy 0.825
Step 1500 of 48678 loss 0.352263 accuracy 0.8125
Test loss 0.468525 accuracy 0.795
Step 2000 of 48678 loss 0.61345 accuracy 0.6875
Test loss 0.350355 accuracy 0.87
Step 2500 of 48678 loss 0.223349 accuracy 0.9375
Test loss 0.424124 accuracy 0.81
Step 3000 of 48678 loss 0.236677 accuracy 0.9375
Test loss 0.381306 accuracy 0.82
Step 3500 of 48678 loss 0.290952 accuracy 0.90625
Test loss 0.434748 accuracy 0.815
Step 4000 of 48678 loss 0.352875 accuracy 0.84375
Test loss 0.388232 accuracy 0.84
Step 4500 of 48678 loss 0.266288 accuracy 0.84375
Test loss 0.353286 accuracy 0.855
Step 5000 of 48678 loss 0.34254 accuracy 0.84375
Test loss 0.320817 accuracy 0.855
Step 5500 of 48678 loss 0.382165 accuracy 0.8125
Test loss 0.313459 accuracy 0.885
Step 6000 of 48678 loss 0.273566 accuracy 0.9375
Test loss 0.279896 accuracy 0.895
Step 650