In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import itertools
import operator
from utils import *
import sys
from datetime import datetime

In [2]:
data = pd.read_csv('iranian_tweets.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122936 entries, 0 to 1122935
Data columns (total 31 columns):
tweetid                     1122936 non-null int64
userid                      1122936 non-null object
user_display_name           1122936 non-null object
user_screen_name            1122936 non-null object
user_reported_location      887669 non-null object
user_profile_description    995845 non-null object
user_profile_url            434954 non-null object
follower_count              1122936 non-null int64
following_count             1122936 non-null int64
account_creation_date       1122936 non-null object
account_language            1122936 non-null object
tweet_language              1117307 non-null object
tweet_text                  1122936 non-null object
tweet_time                  1122936 non-null object
tweet_client_name           1100078 non-null object
in_reply_to_tweetid         339350 non-null float64
in_reply_to_userid          440244 non-null object
quoted_twe

In [4]:
df = data[['userid','tweet_language','tweet_text','is_retweet']]

In [5]:
df = df[(df.is_retweet==False)&(df.tweet_language=='en')]

In [6]:
unknown_token = "UNKNOWNTOKEN"
sentence_start_token = "SENTENCESTART"
sentence_end_token = "SENTENCEEND"

In [7]:
def sentence_tokenize(tweet):
    return ['{} {} {}'.format(sentence_start_token,sentence,sentence_end_token) for sentence in nltk.sent_tokenize(tweet)]

In [8]:
df['sentences'] = df['tweet_text'].apply(lambda x: sentence_tokenize(x))

In [9]:
df.sentences.values[0:3]

array([list(['SENTENCESTART @ParkerLampe An inquiry by congress confirms that ISIS is indeed a CIA creation http://t.co/eFRmFwYZTV SENTENCEEND']),
       list(['SENTENCESTART @hadeelhmaidi @wordpressdotcom CIA predict third terrorist attack after Sidney and Pakiistan in USA in 3 days http://t.co/IrPx7M223N SENTENCEEND']),
       list(['SENTENCESTART @irfhabib why boko haram come europe,legally and easily? SENTENCEEND', 'SENTENCESTART http://t.co/on2vzPqEPH SENTENCEEND'])],
      dtype=object)

In [10]:
df['tokens'] = df.sentences.apply(lambda x: [nltk.word_tokenize(i) for i in x])

In [11]:
df.tokens.values[0:3]

array([list([['SENTENCESTART', '@', 'ParkerLampe', 'An', 'inquiry', 'by', 'congress', 'confirms', 'that', 'ISIS', 'is', 'indeed', 'a', 'CIA', 'creation', 'http', ':', '//t.co/eFRmFwYZTV', 'SENTENCEEND']]),
       list([['SENTENCESTART', '@', 'hadeelhmaidi', '@', 'wordpressdotcom', 'CIA', 'predict', 'third', 'terrorist', 'attack', 'after', 'Sidney', 'and', 'Pakiistan', 'in', 'USA', 'in', '3', 'days', 'http', ':', '//t.co/IrPx7M223N', 'SENTENCEEND']]),
       list([['SENTENCESTART', '@', 'irfhabib', 'why', 'boko', 'haram', 'come', 'europe', ',', 'legally', 'and', 'easily', '?', 'SENTENCEEND'], ['SENTENCESTART', 'http', ':', '//t.co/on2vzPqEPH', 'SENTENCEEND']])],
      dtype=object)

In [12]:
def concat_sentences(sentences):
    output = []
    for sentence in sentences:
        for i in sentence:
            output.append(i)
    return output

In [13]:
df['tokens1'] = df.tokens.apply(lambda x: concat_sentences(x))

In [14]:
df.tokens1.values[0:3]

array([list(['SENTENCESTART', '@', 'ParkerLampe', 'An', 'inquiry', 'by', 'congress', 'confirms', 'that', 'ISIS', 'is', 'indeed', 'a', 'CIA', 'creation', 'http', ':', '//t.co/eFRmFwYZTV', 'SENTENCEEND']),
       list(['SENTENCESTART', '@', 'hadeelhmaidi', '@', 'wordpressdotcom', 'CIA', 'predict', 'third', 'terrorist', 'attack', 'after', 'Sidney', 'and', 'Pakiistan', 'in', 'USA', 'in', '3', 'days', 'http', ':', '//t.co/IrPx7M223N', 'SENTENCEEND']),
       list(['SENTENCESTART', '@', 'irfhabib', 'why', 'boko', 'haram', 'come', 'europe', ',', 'legally', 'and', 'easily', '?', 'SENTENCEEND', 'SENTENCESTART', 'http', ':', '//t.co/on2vzPqEPH', 'SENTENCEEND'])],
      dtype=object)

In [15]:
word_freqs = nltk.FreqDist(itertools.chain(* df.tokens1.values))

In [16]:
word_freqs.most_common

<bound method Counter.most_common of FreqDist({':': 247742, 'SENTENCESTART': 217886, 'SENTENCEEND': 217886, '#': 188963, '@': 115497, 'https': 99736, 'http': 86404, 'to': 58554, 'the': 55400, 'in': 52264, ...})>

In [17]:
vocab_size = 6000

In [23]:
vocab = word_freqs.most_common(vocab_size-1)

In [24]:
index_to_word = [x[0] for x in vocab]

In [25]:
index_to_word.append(unknown_token)

In [30]:
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])


In [33]:
df['replaced_tokens'] = df.tokens1.apply(lambda x: [word if word in index_to_word else unknown_token for word in x])

In [35]:
df.replaced_tokens.values[0:6]

array([list(['SENTENCESTART', '@', 'UNKNOWNTOKEN', 'An', 'inquiry', 'by', 'congress', 'confirms', 'that', 'ISIS', 'is', 'indeed', 'a', 'CIA', 'creation', 'http', ':', '//t.co/eFRmFwYZTV', 'SENTENCEEND']),
       list(['SENTENCESTART', '@', 'UNKNOWNTOKEN', '@', 'UNKNOWNTOKEN', 'CIA', 'predict', 'third', 'terrorist', 'attack', 'after', 'Sidney', 'and', 'Pakiistan', 'in', 'USA', 'in', '3', 'days', 'http', ':', '//t.co/IrPx7M223N', 'SENTENCEEND']),
       list(['SENTENCESTART', '@', 'UNKNOWNTOKEN', 'why', 'boko', 'haram', 'come', 'europe', ',', 'legally', 'and', 'easily', '?', 'SENTENCEEND', 'SENTENCESTART', 'http', ':', '//t.co/on2vzPqEPH', 'SENTENCEEND']),
       list(['SENTENCESTART', '@', 'UNKNOWNTOKEN', 'ISIS', 'militants', ',', 'plan', 'to', 'target', 'Western', 'capitals', 'http', ':', '//t.co/890VDVPE6o', 'SENTENCEEND']),
       list(['SENTENCESTART', '@', 'UNKNOWNTOKEN', 'Turkish', 'intelligence', 'chief', ':', 'ISIS', 'is', 'a', 'reality', 'and', 'we', 'are', 'optimistic', 'about

In [36]:
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in df.replaced_tokens.values])

In [38]:
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in df.replaced_tokens.values])

In [41]:
class RNNNumpy:
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

In [49]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [50]:
def forward_propagation(self, x):
    # The total number of time steps
    T = len(x)
    # During forward propagation we save all hidden states in s because need them later.
    # We add one additional element for the initial hidden, which we set to 0
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    # The outputs at each time step. Again, we save them for later.
    o = np.zeros((T, self.word_dim))
    # For each time step...
    for t in np.arange(T):
        # Note that we are indxing U by x[t]. This is the same as multiplying U with a one-hot vector.
        s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]


In [51]:
RNNNumpy.forward_propagation = forward_propagation

In [52]:
def predict(self, x):
    # Perform forward propagation and return index of the highest score
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict

In [53]:
np.random.seed(10)
model = RNNNumpy(vocab_size)
o, s = model.forward_propagation(X_train[10])
print (o.shape)
print (o)

(26, 6000)
[[0.00016729 0.00016697 0.00016586 ... 0.00016592 0.0001684  0.00016555]
 [0.00016781 0.00016574 0.00016578 ... 0.00016656 0.00016567 0.00016533]
 [0.00016646 0.00016792 0.00016709 ... 0.00016597 0.00016691 0.00016718]
 ...
 [0.0001651  0.00016728 0.00016629 ... 0.00016597 0.00016895 0.00016765]
 [0.00016599 0.00016787 0.00016766 ... 0.00016686 0.00016576 0.00016732]
 [0.00016677 0.00016667 0.00016543 ... 0.00016857 0.00016669 0.0001666 ]]


In [58]:
def calculate_total_loss(self, x, y):
    L = 0
    # For each sentence...
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        # We only care about our prediction of the "correct" words
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        # Add to the loss based on how off we were
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(self, x, y):
    # Divide the total loss by the number of training examples
    N = np.sum((len(y_i) for y_i in y))
    return self.calculate_total_loss(x,y)/N

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [60]:

# Limit to 1000 examples to save time
print ("Expected Loss for random predictions: {}".format( np.log(vocab_size)))
print ("Actual loss: {}".format( model.calculate_loss(X_train[:1000], y_train[:1000])))

Expected Loss for random predictions: 8.699514748210191
Actual loss: 8.699896541234262


In [61]:
def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step-1])              
            dLdU[:,x[bptt_step]] += delta_t
            # Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

In [67]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    # Calculate the gradients using backpropagation. We want to checker if these are correct.
    bptt_gradients = model.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the mode, e.g. model.W
        parameter = operator.attrgetter(pname)(self)
        print ("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            gradplus = model.calculate_total_loss([x],[y])
            parameter[ix] = original_value - h
            gradminus = model.calculate_total_loss([x],[y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            # Reset parameter to original value
            parameter[ix] = original_value
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # calculate The relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is to large fail the gradient check
            if relative_error > error_threshold:
                print ("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print ("+h Loss: %f" % gradplus)
                print ("-h Loss: %f" % gradminus)
                print ("Estimated_gradient: %f" % estimated_gradient)
                print ("Backpropagation gradient: %f" % backprop_gradient)
                print ("Relative Error: %f" % relative_error)
                return 
            it.iternext()
        print ("Gradient check for parameter %s passed." % (pname))

RNNNumpy.gradient_check = gradient_check

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000.




Gradient check for parameter U passed.
Performing gradient check for parameter V with size 1000.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [68]:
# Performs one step of SGD.
def numpy_sdg_step(self, x, y, learning_rate):
    # Calculate the gradients
    dLdU, dLdV, dLdW = self.bptt(x, y)
    # Change parameters according to gradients and learning rate
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW

RNNNumpy.sgd_step = numpy_sdg_step

In [71]:
# Outer SGD Loop
# - model: The RNN model instance
# - X_train: The training data set
# - y_train: The training data labels
# - learning_rate: Initial learning rate for SGD
# - nepoch: Number of times to iterate through the complete dataset
# - evaluate_loss_after: Evaluate the loss after this many epochs
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print ("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
            # Adjust the learning rate if loss increases
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5  
                print ("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [73]:
np.random.seed(10)
model = RNNNumpy(vocab_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

70.5 ms ± 290 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [77]:
np.random.seed(10)
# Train on a small subset of the data to see what happens
model = RNNNumpy(vocab_size)
losses = train_with_sgd(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)

2018-12-17 12:48:22: Loss after num_examples_seen=0 epoch=0: 8.700294
2018-12-17 12:48:28: Loss after num_examples_seen=100 epoch=1: 8.645224
2018-12-17 12:48:34: Loss after num_examples_seen=200 epoch=2: 6.918774
2018-12-17 12:48:39: Loss after num_examples_seen=300 epoch=3: 6.063775
2018-12-17 12:48:45: Loss after num_examples_seen=400 epoch=4: 5.696343
2018-12-17 12:48:51: Loss after num_examples_seen=500 epoch=5: 5.466472
2018-12-17 12:48:57: Loss after num_examples_seen=600 epoch=6: 5.324985
2018-12-17 12:49:02: Loss after num_examples_seen=700 epoch=7: 5.210804
2018-12-17 12:49:08: Loss after num_examples_seen=800 epoch=8: 5.092974
2018-12-17 12:49:14: Loss after num_examples_seen=900 epoch=9: 4.967416


https://github.com/dennybritz/rnn-tutorial-rnnlm/blob/master/RNNLM.ipynb