Loading Data

In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import TweetTokenizer

from tqdm import tqdm

# Enable the tqdm integration for pandas so that `progress_map` (used by
# `postprocess`) is available on Series objects.
tqdm.pandas(desc='progress-bar')

# Single tokenizer instance shared by every call to `tokenize`.
tokenizer = TweetTokenizer()

# `LabeledSentence` is a deprecated gensim alias that newer releases no longer
# ship. `TaggedDocument` carries the same `(words, tags)` fields, so the
# historical name is bound to it and the rest of the notebook stays unchanged.
LabeledSentence = gensim.models.doc2vec.TaggedDocument

Load the CSV and keep only the two columns used downstream:

* `Sentiment`: binary label (the raw values 0 and 4 are remapped to 0 and 1)
* `SentimentText`: the tweet text (string)

In [2]:
def ingest(path='training.1600000.processed.noemoticon.csv'):
    """Load the tweet CSV and reduce it to the label and text columns.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the file name this notebook
        reads from the working directory, so `ingest()` keeps working.

    Returns
    -------
    pandas.DataFrame
        Frame without the `ItemID`, `Date`, `Blank` and `SentimentSource`
        columns, with `Sentiment` remapped from {0, 4} to {0, 1}, rows with a
        missing `SentimentText` removed, and a fresh 0..n-1 index.
    """
    raw = pd.read_csv(path)
    # Build new frames instead of mutating in place so the function has no
    # hidden effects on intermediate objects.
    data = raw.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1)
    # Raw labels are 0 and 4; any other value maps to NaN.
    data['Sentiment'] = data['Sentiment'].map(int).map({4: 1, 0: 0})
    data = data[data['SentimentText'].notnull()].reset_index(drop=True)
    print('Data Shape: {}'.format(data.shape))
    return data

# Load the dataset once; the preview is left as the cell's last expression so
# the notebook renders the first five rows as a table.
data = ingest()
data.head()

Data Shape: (1600000, 2)


Unnamed: 0,Sentiment,SentimentText
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [3]:
def tokenize(tweet):
    """Lower-case and tokenize one tweet, dropping mentions, hashtags and URLs.

    Parameters
    ----------
    tweet : str or bytes
        Raw tweet text. Bytes are decoded as UTF-8 first; text strings are
        used as they are.

    Returns
    -------
    list of str, or the string 'NC'
        Tokens that do not start with '@', '#' or 'http'. The sentinel 'NC'
        is returned when the input is not text (for example a missing value)
        or cannot be decoded; `postprocess` filters those rows out.
    """
    try:
        # Only byte strings need decoding. Calling `.decode` on a text string
        # fails on Python 3, which previously turned every tweet into 'NC'.
        if isinstance(tweet, bytes):
            tweet = tweet.decode('utf-8')
        tokens = tokenizer.tokenize(tweet.lower())
    except (AttributeError, TypeError, UnicodeError):
        # Deliberately narrow: unrelated failures (e.g. a missing tokenizer)
        # should surface instead of silently discarding the whole dataset.
        return 'NC'
    # `str.startswith` accepts a tuple, so one pass replaces three filters.
    return [t for t in tokens if not t.startswith(('@', '#', 'http'))]

def postprocess(data):
    """Tokenize every tweet and keep only the rows that tokenized cleanly.

    A `tokens` column is added to the frame that is passed in. Rows whose
    tokenization produced the 'NC' sentinel are excluded from the returned
    frame, which is re-indexed from 0.
    """
    data['tokens'] = data['SentimentText'].progress_map(tokenize)
    tokenized_ok = data['tokens'] != 'NC'
    return data[tokenized_ok].reset_index(drop=True)

# Tokenize the whole dataset (rows that could not be tokenized are dropped by
# `postprocess`) and write the result to the working directory.
processed = postprocess(data)
processed.to_csv(path_or_buf='train_processed.csv', index=False)

progress-bar: 100%|██████████| 1600000/1600000 [05:08<00:00, 5189.16it/s]


In [4]:
# Hold out 20% of the tweets for evaluation. The split is seeded so that a
# fresh "Restart & Run All" yields the same train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(processed['tokens']),
    np.array(processed['Sentiment']),
    test_size=0.2,
    random_state=0,
)

In [5]:
def labelize_tweets(tweets, label_type):
    """Wrap each token list in a `LabeledSentence` carrying a unique tag.

    The tag of the i-th tweet is '<label_type>_<i>' (for example 'TRAIN_0').
    Returns the wrapped tweets as a list, in input order.
    """
    return [
        LabeledSentence(words, ['{}_{}'.format(label_type, position)])
        for position, words in tqdm(enumerate(tweets))
    ]

# Replace the raw token arrays with their labelled counterparts.
# NOTE: both names are rebound to the wrapped form, so re-running this cell
# without re-running the split would wrap already-labelled items again.
x_train = labelize_tweets(x_train, label_type='TRAIN')
x_test = labelize_tweets(x_test, label_type='TEST')

1278970it [00:16, 75467.31it/s] 
319743it [00:02, 127230.20it/s]


In [6]:
# Dimensionality of the word embeddings (also the size of the tweet vectors
# built from them later in the notebook).
n_dim = 200

# Materialise the training corpus once and reuse it for both the vocabulary
# scan and the training pass, instead of rebuilding the same list twice.
train_corpus = [x.words for x in tqdm(x_train)]

tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab(train_corpus)
tweet_w2v.train(train_corpus,
                total_examples=tweet_w2v.corpus_count,
                epochs=tweet_w2v.iter)

100%|██████████| 1278970/1278970 [00:01<00:00, 647976.44it/s]
100%|██████████| 1278970/1278970 [00:01<00:00, 822819.50it/s]


67818627

In [7]:
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()

# Vocabulary entries to project. The slice is computed once so the vectors,
# the hover labels and the figure title always refer to the same words.
n_plot_words = 2500
plot_words = list(tweet_w2v.wv.vocab.keys())[:n_plot_words]

plot_tfidf = figure(plot_width=700, plot_height=600,
                    title='A map of {:,} word vectors'.format(len(plot_words)),
                    tools='pan,wheel_zoom,box_zoom,reset,hover,previewsave', x_axis_type=None,
                    y_axis_type=None, min_border=1)

# Look vectors up through `.wv`, matching the `.wv.vocab` access above.
word_vectors = [tweet_w2v.wv[w] for w in plot_words]

# dimensionality reduction
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = plot_words

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
# Show the word under the cursor when hovering over a point.
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={'word': '@words'}
show(plot_tfidf)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2500 samples in 0.027s...
[t-SNE] Computed neighbors for 2500 samples in 2.620s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2500
[t-SNE] Computed conditional probabilities for sample 2000 / 2500
[t-SNE] Computed conditional probabilities for sample 2500 / 2500
[t-SNE] Mean sigma: 0.293814
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.084854
[t-SNE] Error after 1000 iterations: 2.494616


In [8]:
# The tweets are already tokenized, so the analyzer is the identity function:
# each document is passed to the vectorizer as a ready-made list of tokens.
vectorizer = TfidfVectorizer(analyzer = lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])

# `get_feature_names` is removed in newer scikit-learn releases in favour of
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Map each vocabulary token to its inverse-document-frequency weight.
tfidf = dict(zip(feature_names, vectorizer.idf_))
print('Vocab Size: {}'.format(len(tfidf)))

Vocab Size: 30606


In [9]:
def build_word_vector(tokens, size):
    """Build one tweet vector as an IDF-weighted mean of its word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single tweet.
    size : int
        Dimensionality of the word vectors held by `tweet_w2v`.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, size). Tokens missing from either the embedding
        vocabulary or the `tfidf` table are skipped; if no token contributes
        the result is all zeros.
    """
    vec = np.zeros((1, size))
    count = 0.
    for word in tokens:
        try:
            # Look the vector up through `.wv`: indexing the model object
            # directly is deprecated and fails with a TypeError (not the
            # KeyError handled here) on newer gensim releases.
            weighted = tweet_w2v.wv[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            continue
        vec += weighted
        count += 1
    if count != 0:
        # Divide by the number of contributing tokens, not the sum of weights.
        vec /= count
    return vec

from sklearn.preprocessing import StandardScaler, scale

# One (1, n_dim) vector per tweet, stacked into a (n_tweets, n_dim) matrix.
train_vecs_w2v = np.concatenate([build_word_vector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
test_vecs_w2v = np.concatenate([build_word_vector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])

# Standardise with statistics estimated on the training set only and apply the
# same transform to the test set. Scaling each split independently would give
# the two splits different feature scalings and leak test-set statistics.
w2v_scaler = StandardScaler().fit(train_vecs_w2v)
train_vecs_w2v = w2v_scaler.transform(train_vecs_w2v)
test_vecs_w2v = w2v_scaler.transform(test_vecs_w2v)

100%|██████████| 1278970/1278970 [06:09<00:00, 3462.73it/s]
100%|██████████| 319743/319743 [01:33<00:00, 3427.13it/s]


In [21]:
from keras.models import Sequential
from keras.layers import Dense

In [22]:
# Small feed-forward classifier: one hidden ReLU layer, then a sigmoid output
# for the binary sentiment label.
model = Sequential()
# The input width must match the tweet-vector size, so it is tied to `n_dim`
# rather than repeating the literal 200.
model.add(Dense(32, activation='relu', input_dim=n_dim))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=9, batch_size=32, verbose=2)
# `evaluate` returns [loss, accuracy] given the metrics compiled above;
# report the held-out accuracy.
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print(score[1])

Epoch 1/9
233s - loss: 0.4562 - acc: 0.7849
Epoch 2/9
262s - loss: 0.4445 - acc: 0.7922
Epoch 3/9
269s - loss: 0.4411 - acc: 0.7943
Epoch 4/9
270s - loss: 0.4394 - acc: 0.7952
Epoch 5/9
269s - loss: 0.4383 - acc: 0.7960
Epoch 6/9
271s - loss: 0.4377 - acc: 0.7960
Epoch 7/9
265s - loss: 0.4372 - acc: 0.7962
Epoch 8/9
261s - loss: 0.4368 - acc: 0.7967
Epoch 9/9
248s - loss: 0.4367 - acc: 0.7964
0.793893845989
