# Import & Load

In [1]:
%matplotlib inline
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer

import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import params

In [2]:
# Load the encoded tweets; the recorded output below shows shape (6126219, 161, 1).
DATASET = np.load("data/dataset.npy")
# A single %-formatted string prints the same text under Python 2 and Python 3
# (the Python 2 print statement is a syntax error on Python 3).
print("Shape : %s" % (DATASET.shape,))

Shape : (6126219, 161, 1)


# Preprocessing function

In [3]:
def batch2onehot(batch, D): # see preprocessing.py
    '''One-hot encode a batch of index-encoded tweets (used during training).

    batch : integer array of shape (batch_size, tweet_length, 1).
    D     : size of the one-hot dimension.

    Returns a float array of shape (batch_size, tweet_length, D).'''
    n_tweets, n_steps = batch.shape[:2]
    n_rows = n_tweets * n_steps
    # Work on a flat (n_rows, D) table so a single fancy-indexing assignment sets every 1.
    encoded = np.zeros((n_rows, D))
    encoded[np.arange(n_rows), batch.flatten()] = 1
    return encoded.reshape((n_tweets, n_steps, D))

def batch2tweet(batch, accepted_caracters, special_char=""): # see preprocessing.py
    '''Decode a batch of index-encoded tweets into strings.

    Not optimized, but not used during the training: no need to be fast.

    batch              : iterable of tweets; each tweet is an iterable of items whose
                         first element is the character index.
    accepted_caracters : sequence mapping an index to its character.
    special_char       : text emitted when an index cannot be looked up.

    Returns a list with one string per tweet.'''
    tweets = []
    for encoded_tweet in batch:
        chars = []
        for char in encoded_tweet:
            try:
                chars.append(accepted_caracters[char[0]])
            except IndexError:
                # Index outside the table: special marker indicating the end of the tweet.
                # Only IndexError is caught so that genuine errors (wrong types,
                # KeyboardInterrupt, ...) are no longer silently swallowed.
                chars.append(special_char)
        tweets.append("".join(chars))
    return tweets

def onehot2tweet(batch, accepted_caracters, special_char=""): # see preprocessing.py
    '''Decode a batch of one-hot encoded tweets into strings.

    Not optimized, but not used during the training: no need to be fast.

    batch              : iterable of tweets; each tweet is an iterable of one-hot vectors.
    accepted_caracters : sequence mapping an index to its character.
    special_char       : text emitted when a vector contains no 1 or when its index
                         cannot be looked up.

    Returns a list with one string per tweet.'''
    tweets = []
    for encoded_tweet in batch:
        chars = []
        for char in encoded_tweet:
            try:
                # Position of the first 1 in the vector; an IndexError here means
                # that the vector holds no 1 at all.
                chars.append(accepted_caracters[np.where(char==1)[0][0]])
            except IndexError:
                # Special marker indicating the end of the tweet. Only IndexError is
                # caught so that genuine errors are no longer silently swallowed.
                chars.append(special_char)
        tweets.append("".join(chars))
    return tweets

# Model definition

In [4]:
from keras.models import Model
from keras.layers import Input, LSTM, Masking, Dropout, TimeDistributed, Dense, Activation
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

Using Theano backend.


In [5]:
def get_model_truncated2(T, D, lr, nhidden, drop_rate, nb_neuron_hidden): # see models.py
    '''Build a stack of LSTM layers whose output is the last LSTM's full sequence.

    T, D             : input shape given to Input, i.e. (time steps, features per step).
    lr               : learning rate given to the Adam optimizer.
    nhidden          : number of stacked LSTM layers (must be >= 1).
    drop_rate        : rate of the Dropout layer inserted between two consecutive LSTMs.
    nb_neuron_hidden : number of units of every LSTM layer.

    Returns the compiled Keras Model (Adam, categorical cross-entropy).
    Raises ValueError when nhidden < 1.
    '''
    if nhidden < 1:
        raise ValueError("nhidden must be >= 1, got %r" % (nhidden,))

    # Input layer
    inputs = Input((T, D))
    # Masking "only-0" input features
    layer_input = Masking(mask_value=0.0)(inputs)
    # Hidden layers: LSTM -> Dropout -> LSTM -> ... -> LSTM.
    # Dropout is only created *between* LSTMs, so a single-layer model works
    # (the first LSTM reads the masked input) and no unused layer is built
    # after the last LSTM.
    for i in range(nhidden):
        if i > 0:
            layer_input = Dropout(drop_rate)(outputs)
        outputs = LSTM(nb_neuron_hidden, return_sequences=True)(layer_input)

    model = Model(input=inputs, output=outputs)

    model.compile(optimizer=Adam(lr=lr), loss="categorical_crossentropy")

    return model

In [6]:
# Tweet length in the dataset (second dimension of the DATASET shape printed above).
T = 161
# NOTE(review): presumably the alphabet size including the padding symbol - confirm against params.D
D = 64
LR = params.LR # learning rate
# Positional arguments after LR: 2 LSTM layers, dropout rate 0.1, 512 units per layer.
model = get_model_truncated2(T-1, D-1, LR, 2, 0.1, 512) # D-1 because params.D accounts for the padding dimension

In [7]:
# Print the layer table (output shapes, parameter counts, connections) of the model built above.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 160, 63)       0                                            
____________________________________________________________________________________________________
masking_1 (Masking)              (None, 160, 63)       0           input_1[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 160, 512)      1179648     masking_1[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 160, 512)      0           lstm_1[0][0]                     
___________________________________________________________________________________________

In [8]:
# load the weights
# NOTE(review): presumably the list of weight arrays saved from the full trained model - confirm.
weights = np.load('results/exp_014/weights/best_model.npy')
# Loading everything at once is disabled; the weights are assigned layer by layer below.
#model.set_weights(weights)
# layers[2] is lstm_1 in the summary printed above; it receives the first 12 arrays.
# NOTE(review): assumes 12 weight arrays per LSTM layer - confirm for the Keras version in use.
model.layers[2].set_weights(weights[0:12])
# NOTE(review): layers[4] is expected to be the second LSTM (the recorded summary is cut
# off after dropout_1) - confirm.
model.layers[4].set_weights(weights[12:24])

# Word categorization

In [9]:
# Character table used to decode the index-encoded tweets: the 63 symbols below in
# sorted order, followed by the empty string as the last (64th) entry.
# NOTE(review): '|' is listed 8 times in total; presumably placeholders - confirm.
ACCEPTED_CHARACTERS = sorted(
    list('abcdefghijklmnopqrstuvwxyz')
    + list('0123456789')
    + ['/', '@', '#', '&'] + ['|'] * 7
    + ['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', ':', ';', '?', '_', '|']
) + ['']
# Lower-case alias of the same list object.
accepted_characters = ACCEPTED_CHARACTERS

In [10]:
def tweet2string(tweet, accepted_characters = ACCEPTED_CHARACTERS):
    '''Decode one index-encoded tweet into a string.

    tweet               : iterable of items whose first element is the character index.
    accepted_characters : sequence mapping an index to its character.

    Returns the concatenation of the looked-up characters.'''
    return ''.join(accepted_characters[t[0]] for t in tweet)

In [11]:
# adapted from http://stackoverflow.com/questions/14776317/finding-exact-position-of-tokenized-sentences
from nltk import tokenize
from nltk import TweetTokenizer

def token_position(tweet, tags):
    '''Locate every token of `tweet` inside the original string.

    The tweet is tokenized with nltk's TweetTokenizer and each token is searched for
    from left to right, starting right after the previous match.

    tweet : the tweet as a string.
    tags  : one entry per token, in tokenizer order; tags[i] is stored unchanged next
            to token i.

    Returns a list of [offset, length, tags[i]] items, one per token.
    '''
    tknzr = TweetTokenizer()

    output = []
    offset = 0
    for i, token in enumerate(tknzr.tokenize(tweet)):
        length = len(token)
        found = tweet.find(token, offset)
        if found == -1:
            # The token text does not occur verbatim in the rest of the tweet (e.g. the
            # tokenizer rewrote it). Record it at the current cursor instead of -1 and
            # leave the cursor where it is, so the following tokens are still searched
            # from the right place rather than from the start of the tweet.
            output.append([offset, length, tags[i]])
            continue
        output.append([found, length, tags[i]])
        # Continue searching right after this match.
        offset = found + length
    return output

In [12]:
def categorize_words(tweet):
    '''Label every character of `tweet` with the part-of-speech tag of its token.

    The tweet is tokenized with TweetTokenizer, tagged with nltk.pos_tag and the tokens
    are located with token_position. The result has one entry per character of `tweet`:
    the tag of the current token, or '.' for characters before the current token and
    for the character located exactly at its end offset.

    NOTE(review): the current token advances by at most one per character, so runs of
    directly adjacent tokens can lag behind - confirm whether this is intended.
    '''
    tknzr = TweetTokenizer()

    token = tknzr.tokenize(tweet)
    tag = nltk.pos_tag(token)
    tweet_tokenized = token_position(tweet, tag)
    n_tokens = len(tweet_tokenized)

    word_position = 0
    tags = []

    for i in range(len(tweet)):
        # Move on to the next token once i is strictly past the end of the current one.
        if word_position < n_tokens:
            start, length, _ = tweet_tokenized[word_position]
            if i > start + length:
                word_position += 1

        # No token left (no tokens at all, or characters trailing after the last
        # token): label as separator instead of indexing past the token list.
        if word_position >= n_tokens:
            tags.append('.')
            continue

        start, length, tagged_token = tweet_tokenized[word_position]
        if start <= i and i != start + length:
            # tagged_token is the (token, tag) pair produced by nltk.pos_tag.
            tags.append(tagged_token[1])
        else:
            tags.append('.')
    return tags

In [13]:
# categorize words
# Only the first n tweets of the dataset are analysed.
n = 150
tweets = DATASET[:n]

# For every tweet: decode it to text, then get one tag per character.
tweet_syntax = [categorize_words(tweet2string(t, ACCEPTED_CHARACTERS)) for t in tweets]

In [14]:
# get the model prediction
one_hot_batch = batch2onehot(tweets, D)
# Cut the encoding down to the model's input shape (T-1 time steps, D-1 features),
# using the same constants as the model construction instead of a hard-coded 160.
one_hot_batch = one_hot_batch[:, 0:(T-1), 0:(D-1)]

# One output vector of the last LSTM per tweet and time step.
out = model.predict(one_hot_batch)
out.shape

(150, 160, 512)

In [15]:
# for all tweets, separate the words and get their categories
# word        : one array per extracted word, stacking the model outputs of its characters
# word_syntax : the tag of each extracted word, in the same order as `word`
word = []
word_syntax = []
    
for t in range(out.shape[0]):
    tmp_word = []  # model outputs collected for the word currently being read
    on = False     # True while inside a word

    last_c = ''
    # NOTE(review): `out` has 160 time steps (see its shape above) while tweet_syntax[t]
    # has one entry per character; presumably no tweet is longer than that - confirm.
    for i, c in enumerate(tweet_syntax[t]):
        if last_c != c and on:
            # The tag changed while inside a word: store the finished word and its tag.
            word.append(np.asarray(tmp_word))
            word_syntax.append(last_c)
            tmp_word = []
            on = False
        elif last_c != c and not on:
            on = True

        if c == '.':
            # '.' marks a character that does not belong to a word.
            on = False
        else:
            tmp_word.append(out[t][i])
        last_c = c
    # NOTE(review): a word still open when the tweet ends is never appended (there is
    # no flush after the inner loop) - confirm whether this is intended.

# NOTE(review): the words have different lengths, so this presumably yields an object
# array of 2-D arrays - confirm with the numpy version in use.
word = np.asarray(word)        

In [16]:
# get gram matrices for each word
# For a word w with one row per character, w^T w has the same square shape whatever
# the length of the word, which lets the results be stacked into a single array.
gram = np.asarray([np.dot(w.T, w) for w in word])

In [17]:
# First axis: one entry per extracted word; the last two axes are the square Gram matrix.
gram.shape

(2098, 512, 512)

In [18]:
# Flatten every Gram matrix into one row, then transpose so that each word becomes
# one column.
out2 = gram.reshape((gram.shape[0], gram.shape[1] * gram.shape[2])).T
out2.shape

(262144, 2098)

In [None]:
from sklearn.decomposition import PCA

# PCA with 100 components, fitted on out2.
# NOTE(review): the earlier wording "from 10000 to 100 dimensions" does not match the
# data: out2 has shape (262144, 2098) according to the output above, so the rows fed to
# PCA are the Gram-matrix entries and the columns are the words - confirm this
# orientation is intended.
pca = PCA(n_components = 100)
pca.fit(out2)
pca.components_.shape

In [None]:
# Transpose the principal axes so that the first axis indexes the columns of out2
# (presumably one row of 100 values per word - confirm).
components = np.transpose(pca.components_)

In [None]:
from sklearn.manifold import TSNE

# Embed the rows of `components` in 2 dimensions for plotting; random_state is fixed
# to 0 so that the embedding is the same on every run.
tsne_model = TSNE(n_components=2, random_state = 0)
data = tsne_model.fit_transform(components)

In [None]:
# Group the words by the first character of their tag.
# category : the distinct alphabetic first characters in order of first appearance,
#            preceded by '?' which collects every tag not starting with a letter.
# colors   : for each word, the index of its group in `category`.
category = ['?']
colors = []

for w in word_syntax:
    initial = w[0]
    if not initial.isalpha():
        colors.append(category.index('?'))
        continue
    if initial not in category:
        category.append(initial)
    colors.append(category.index(initial))

#colors

In [None]:
colors = np.array(colors)

# One scatter call per category so that each one gets its own legend entry.
n_categories = len(category)
for i in range(n_categories):
    # Boolean mask selecting the points whose category index is i.
    selected = colors == i
    # Evenly spaced colormap colours: reproducible from run to run (the previous random
    # colours were not) and passed through `color=` as a single unambiguous RGBA value.
    rgba = plt.cm.rainbow(float(i) / max(n_categories - 1, 1))
    plt.scatter(data[selected, 0], data[selected, 1], color=rgba, label=category[i])
plt.title("t-SNE embedding of the words, coloured by first letter of their tag")
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.legend(loc='center left', bbox_to_anchor = (1,0.5))
plt.show()