In [1]:
import nltk
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.svm import LinearSVC
from gensim.models import KeyedVectors
from tqdm import tqdm

In [3]:
sentiment_data = pd.read_csv("./train.csv")

In [4]:
DATA_TO_USE = 250000

texts = sentiment_data['message'].tolist()[:DATA_TO_USE]
labels = np.array(sentiment_data['sentiment'])[:DATA_TO_USE]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, random_state=21)

## Exercise 3: word vectors meet bag of words

In this exercise you use your newly trained word vectors and a simple Bag of Words models to approach the sentiment analysis task

In [None]:
# We will use a convinient wrapper for our word2vec model provided by gensim

In [7]:
w2v_model = KeyedVectors.load_word2vec_format("/home/igel/Downloads/simple_cbow.w2v")

In [13]:
# you can get the vector for a word in a simple way
print(len(w2v_model['word']))
w2v_model['word']

128


array([ 0.02990426, -0.15916565, -0.11447234, -0.05986521, -0.12276778,
       -0.16413523, -0.11490295,  0.02804964, -0.00436324,  0.01494077,
       -0.01524218,  0.1097708 , -0.06026166, -0.04513515, -0.00505505,
        0.09611265,  0.07788374,  0.08366123, -0.01655415,  0.12933229,
       -0.03674443, -0.19012986,  0.05885392,  0.06013624,  0.0801625 ,
        0.03977867,  0.00211081,  0.06386852, -0.03472841,  0.15342017,
        0.05833793, -0.05921539, -0.05952489, -0.06266541, -0.08161964,
        0.12462132, -0.00591985, -0.15344228,  0.12336656,  0.03647648,
        0.05483676,  0.06484858, -0.07936434, -0.09052481,  0.03851042,
        0.02385422, -0.05429959,  0.07028124,  0.04387809, -0.11558681,
        0.05841981, -0.12859698, -0.0077426 ,  0.19667299,  0.02155624,
       -0.02183611,  0.10042851, -0.0544624 , -0.00891632,  0.2033658 ,
       -0.013372  ,  0.06375191, -0.11004724, -0.15569478, -0.08661045,
        0.00989923, -0.08449263, -0.1268454 , -0.09556618,  0.05

gensim.models.keyedvectors.Word2VecKeyedVectors

In [15]:
# you can easily query the model for word most similar to a give word 
w2v_model.most_similar('a')

  if np.issubdtype(vec.dtype, np.int):


[('another', 0.4291095733642578),
 ('the', 0.38493815064430237),
 ('any', 0.37689319252967834),
 ('coffey', 0.34323281049728394),
 ('jacqueline', 0.34044143557548523),
 ('kneeling', 0.3361562192440033),
 ('mackay', 0.32636597752571106),
 ('stresemann', 0.31539812684059143),
 ('histone', 0.31503355503082275),
 ('monaghan', 0.3137550354003906)]

### 3.1

In this exercise you learn how to encode sentences with word2vec using a bag of words approach

In [11]:
# implement a tokenizer that you will use throughout the exercise
# I would recommend a regexp tokenizer for speed, but it's completely up to you
def my_tokenizer(text):
    return nltk.regexp_tokenize(text, '\w+')

In [29]:
def bow_encoder(wmodel, tokenizer, text):
    """
    This function encodes text into a vector.
    
    First, it tokenizes input text using the provided tokenizer function.
    Then it uses the provided word2vec model to get the vectors corresponding to text's tokens.
    Finally, it computes an average of all token's vectors and returns it.
    
    If the function failed to find and encode any words, it should at least return a vector of zeros.
    """
    zero_vector = np.zeros(w2v_model.vector_size)
    # your code goes here
#     sent_vector = [wmodel.get(token, zero_vector) for token in tokenized_text]
    tokens = tokenizer(text)
    word_vectors = [wmodel[t] for t in tokens if t in wmodel]
    if len(word_vectors):
        sent_vector = np.mean(word_vectors, axis=0)
    else:
        sent_vector = np.zeros(w2v_model.vector_size)
    return sent_vector

Now use your new encoder to encode both train_texts and test_texts into matrices.

The number of rows in a matrix should be equal to the number of texts encoded.

The number of columns should be equal to the word2vec space dimansionality (currently = 128)

Just write a little loop.

In [21]:
def encode_sentence_with_bow(sentences, model, tokenizer):
    return np.array([bow_encoder(model, tokenizer, text) for text in tqdm(sentences)])

In [22]:
train_encoded =  encode_sentence_with_bow(train_texts, w2v_model, my_tokenizer)
test_encoded  =  encode_sentence_with_bow(test_texts, w2v_model, my_tokenizer)

100%|██████████| 187500/187500 [00:11<00:00, 16523.28it/s]
100%|██████████| 62500/62500 [00:03<00:00, 17374.45it/s]


In [23]:
assert isinstance(train_encoded, np.ndarray)
assert isinstance(test_encoded, np.ndarray)

assert train_encoded.shape[0] == len(train_texts)
assert train_encoded.shape[1] == w2v_model.vector_size

assert test_encoded.shape[0] == len(test_texts)
assert test_encoded.shape[1] == w2v_model.vector_size
print('done')

done


In [24]:
train_labels[:56]

array([1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0])

In [26]:
print(train_encoded.shape)
train_encoded[0]

(187500, 128)


array([-0.02896139,  0.00537474, -0.01987924,  0.03310338,  0.00644111,
        0.06600953,  0.01550328, -0.03566363, -0.02510163, -0.01117753,
        0.00322492,  0.04902239,  0.01001703,  0.05570917,  0.03689624,
       -0.01485302, -0.02157233, -0.00184187,  0.02044139, -0.0059447 ,
       -0.01276293, -0.07878712,  0.05575456, -0.0461246 , -0.00306651,
        0.00723166, -0.01750634,  0.04136015,  0.00466379,  0.0240885 ,
       -0.00868364,  0.00425283, -0.03041368,  0.02261088,  0.04232081,
       -0.04063541,  0.00036504, -0.02799336, -0.01585136, -0.01942656,
        0.00216601,  0.00431208, -0.01391252, -0.00474886,  0.05443996,
        0.01660595, -0.03581998,  0.01215116,  0.03361015, -0.03727384,
       -0.00317842,  0.00865125, -0.02583007,  0.03629126,  0.04480203,
       -0.01947688,  0.05272445, -0.00338176,  0.0287633 , -0.02610079,
        0.02351554,  0.04307071, -0.02624363, -0.02532157,  0.03658989,
        0.0527278 , -0.01743673, -0.04904271, -0.0402409 , -0.00

In [27]:
clf = LinearSVC()
clf.fit(train_encoded, train_labels)
preds = clf.predict(test_encoded)

print(classification_report(test_labels, preds))
print("AUC = {}".format(roc_auc_score(test_labels, preds)))

             precision    recall  f1-score   support

          0       0.63      0.64      0.63     31237
          1       0.63      0.62      0.63     31263

avg / total       0.63      0.63      0.63     62500

AUC = 0.6306270643977973


### Not too impressive!

### 3.2

In this exercise you attempt to improve your encoder by filtering out stop words.

In [31]:
def bow_encoder_with_stopwords(wmodel, tokenizer, stopwords, text):
    """
    This function encodes text into a vector.
    
    First, it tokenizes input text using the provided tokenizer function.
    Then it removes any stopwords from the list of tokens.
    Then it uses the provided word2vec model to get the vectors corresponding to text's tokens.
    Finally, it computes an average of all token's vectors and returns it.
    
    If the function failed to find and encode any words, it should at least return a vector of zeros.
    """
    zero_vector = np.zeros(w2v_model.vector_size)
    # your code goes here
    tokens_ = tokenizer(text)
    tokens  = [token for token in tokens_ if (token in wmodel and token.lower() not in stopwords)]
    word_vectors = [wmodel[t] for t in tokens if t in wmodel]
    if len(word_vectors):
        sent_vector = np.mean(word_vectors, axis=0)
    else:
        sent_vector = np.zeros(w2v_model.vector_size)
 
    return sent_vector

In [33]:
def encode_sentence_with_bow_stop(sentences, model, tokenizer):
    return np.array([bow_encoder_with_stopwords(model, tokenizer, stops, text) for text in tqdm(sentences)])

In [32]:
stops = set(nltk.corpus.stopwords.words('english'))

In [34]:
train_encoded_ =  encode_sentence_with_bow_stop(train_texts, w2v_model, my_tokenizer)
test_encoded_  =  encode_sentence_with_bow_stop(test_texts, w2v_model, my_tokenizer)

100%|██████████| 187500/187500 [00:09<00:00, 19147.96it/s]
100%|██████████| 62500/62500 [00:03<00:00, 19150.76it/s]


In [36]:
clf = LinearSVC()
clf.fit(train_encoded_, train_labels)
preds = clf.predict(test_encoded_)

print(classification_report(test_labels, preds))
print("AUC = {}".format(roc_auc_score(test_labels, preds)))

             precision    recall  f1-score   support

          0       0.61      0.62      0.61     31237
          1       0.61      0.61      0.61     31263

avg / total       0.61      0.61      0.61     62500

AUC = 0.6110897364727454


In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, preds)

0.611088

## Looks like the BoW model is not too good for the job!

![architecture](pics/we_need_to_go_deeper.jpg)

## Introducing: Keras

Keras is a cool library built on top of the computational backend provided by Tensorflow. It provides a layer of abstraction between you and complicated tensor algebra, allowing for rapid prototyping of deep neural networks.

### 3.3: Data preparation

Before we start crunching word vectors with convolutional neural networks, we need to prepare our data.

In [38]:
import keras

Using TensorFlow backend.


In [40]:
# load the vocabulary we created earlier
voc, rvoc = pickle.load(open("./dict_rdict.pkl","rb"))

In [46]:
rvoc[179]
voc['set']

179

In [47]:
# we are going to use the whole dataset this time around
texts = sentiment_data['message'].tolist()
labels = np.array(sentiment_data['sentiment'])

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, random_state=21)

Recall the function that turns tokens into their ids

In [50]:
# hint: you may want to use the function you've built during seminar 2
MAX_LEN = 32

def vectorize_tokens(sentence, tokenizer, token_to_id, max_len):
    """
    Preprocesses a sentence into list of tokens using the provided tokenizer
    Then converts it into a list of token ids using the supplied 'token_to_id' dictionary.
    Pads resulting list with NULL identifiers up to max_len length. 
    """
    # your code goes here
    # STEP 1: convert sentence to a list of tokens
    tokens = tokenizer(sentence)
    # STEP 2: replace tokens with their identifiers from the vocabulary
    # If the token is not present in the vocabulary, replace it with UNKN identifier
    ids = [token_to_id.get(token, token_to_id['UNKN']) for token in tokens]    

    # STEP 3: pad the sequence id's with NULL identifiers until so that it's length is equal to max_len
#     while len(ids) < max_len:
#         ids.append(token_to_id['NULL'])

    if len(ids) > max_len:
        ids = ids[:max_len]
    elif len(ids) < max_len:
        miss_count = max_len - len(ids)
        ids.extend([token_to_id['NULL']]*miss_count)
        
    return ids

In [53]:
vectorize_tokens('the main difference is that KeyedVectors do not support further training. On the other',
                 my_tokenizer, voc, MAX_LEN)

[2,
 228,
 1301,
 12,
 21,
 1,
 177,
 39,
 316,
 368,
 1315,
 1,
 2,
 44,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

Now apply the vectorization function to every sentence from train and test datasets. In the end you should end up with a matrix of shape [len(data), MAX_LEN].

Just write a little loop

In [57]:
def vectorize_sentences(sentences, tokenizer, token_to_id, max_len):
    sentence_ids = []
    
    # your code goes here
    sentence_ids = [vectorize_tokens(sentence, tokenizer, token_to_id, max_len) for sentence in tqdm(sentences)]
        
    return np.array(sentence_ids)

In [58]:
train_vectorized = vectorize_sentences(train_texts, my_tokenizer, voc, MAX_LEN)
test_vectorized = vectorize_sentences(test_texts, my_tokenizer, voc, MAX_LEN)

100%|██████████| 937500/937500 [00:13<00:00, 70878.23it/s]
100%|██████████| 312500/312500 [00:04<00:00, 70312.17it/s]


In [60]:
train_vectorized.shape

(937500, 32)

In [59]:
assert isinstance(train_vectorized, np.ndarray)
assert isinstance(test_vectorized, np.ndarray)

assert train_vectorized.shape == (len(train_vectorized), MAX_LEN)
assert test_vectorized.shape == (len(test_vectorized), MAX_LEN)

print('done')

done


### 3.4 Building a deep NN

In [61]:
embeddings_matrix = w2v_model.syn0

  """Entry point for launching an IPython kernel.


In [64]:
embeddings_matrix.shape

(50000, 128)

In [65]:
# keras Input layer is basically the same thing as tf.placeholder
# it defines a node where the network will be expecting to recieve input data
input_layer = keras.layers.Input(shape=(MAX_LEN,))

In [66]:
# keras Embedding layer is a container for dense vectors
# it recieves a list of token identifiers of shape [MAX_LEN] 
# and turns it into a matrix of shape [MAX_LEN, EMBEDDING_DIM]

embedding_layer = keras.layers.Embedding(embeddings_matrix.shape[0], embeddings_matrix.shape[1], 
                                         input_length=MAX_LEN, weights=[embeddings_matrix],
                                         trainable=False)(input_layer)
# notice how the input_layer is plugged into the embedding_layer

In [67]:
# keras Convolutional layer implements a set of learnable filters
# that extract local patterns from input data
convolution_layer = keras.layers.Convolution1D(128, 3)(embedding_layer)

In [68]:
# keras GlobalMaxPooling layer applies a max filter to the input feature representation
# only the strongest responses from the previous layer are kept, everything else is discarded
subsampling_layer = keras.layers.GlobalMaxPooling1D()(convolution_layer)

In [69]:
# keras Linear layers apply a simple linear transformation to input data, 
# which is optionally followed by a non-linear activation function
# very useful for building Multi-Layer Perceptrons
linear_layer_1 = keras.layers.Dense(64, activation='relu')(subsampling_layer)
linear_layer_2 = keras.layers.Dense(1, activation='sigmoid')(linear_layer_1)

In [70]:
# this compiles the computational graph we've just created, applies a loss function
# and pre-computes the gradients for back propagation

deep_model = keras.models.Model(inputs=[input_layer], outputs=[linear_layer_2])
deep_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [71]:
deep_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 32, 128)           6400000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 30, 128)           49280     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 6,457,601
Trainable params: 57,601
Non-trainable params: 6,400,000
_____________________________________________________________

In [72]:
deep_model.fit(x=train_vectorized, y=train_labels, batch_size=64, epochs=3, 
               validation_data=[test_vectorized, test_labels])

Train on 937500 samples, validate on 312500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f2e8a292908>

In [73]:
preds = deep_model.predict(test_vectorized)

In [74]:
print("AUC = {}".format(roc_auc_score(test_labels, preds)))

AUC = 0.8503456596197503


In [75]:
deep_model.save_weights("nn_weights.hdf5")

### Thats more like it! Keep in mind that we only trained a tiny model because of the limitations of CPU computing power. Using a deeper model with more trainable filters in the Convolution layer would likely result in even stronger predictive power. Stay tuned! 