# Import Cleaned Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

data = pd.read_csv('./data/cleaned.csv', encoding='utf-8')

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

def tokenize(tweet):
    try:
        tokens = tokenizer.tokenize(tweet)
        return tokens
    except:
        return 'NC'


from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

# Post Process

In [2]:
# post process
data['clean_text'] = data['clean_text'].progress_map(tokenize)
data = data[data.clean_text != 'NC']
data.reset_index(inplace=True)
data.drop('index', inplace=True, axis=1)
# print(data.head())

progress-bar: 100%|██████████| 1600000/1600000 [00:30<00:00, 52802.96it/s]


# Split into Train and Test Data

In [17]:
x_train, x_test, y_train, y_test = train_test_split(np.array(data['clean_text']), np.array(data['target']),
                                                    test_size=0.2)
x_test_pure = x_test

# Word2Vec

In [4]:
import gensim
from gensim.models.word2vec import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
print(x_train[0:2])
x_test = labelizeTweets(x_test, 'TEST')

print("Building word2vec matrix")
n_dim = 200
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.epochs)

  if __name__ == '__main__':
1269600it [00:05, 234273.78it/s]


[LabeledSentence(words=['knoww', 'crazi', 'coupl', 'day', 'thank', 'tweet', 'rescu'], tags=['TRAIN_0']), LabeledSentence(words=['ouch', 'least', 'carniv'], tags=['TRAIN_1'])]


317400it [00:00, 347372.13it/s]


Building word2vec matrix


100%|██████████| 1269600/1269600 [00:00<00:00, 2360530.79it/s]
100%|██████████| 1269600/1269600 [00:00<00:00, 2193451.54it/s]


(36988349, 43740285)

# TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
print("Building tf-idf matrix")
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print("Vocab size: ", len(tfidf))

Building tf-idf matrix
vocab size:  20216


# Scale the Data

In [6]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec


from sklearn.preprocessing import scale
print("Scaling train vector")
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

print("Scaling test vector")
test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

Scaling train vector


  
1269600it [01:23, 15296.04it/s]


Scaling test vector


  
317400it [00:20, 15253.48it/s]


# Build Keras Based Model

In [7]:
# build keras based model
from keras.models import Sequential
from keras.layers import Dense

print("Making model")
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Using TensorFlow backend.


Making model
Instructions for updating:
Colocations handled automatically by placer.


# Train Model

In [8]:
print("Training model")
model.fit(train_vecs_w2v, y_train, epochs=9, batch_size=32, verbose=2)

Training model
Instructions for updating:
Use tf.cast instead.
Epoch 1/9
 - 51s - loss: -1.7511e+01 - acc: 0.1024
Epoch 2/9
 - 52s - loss: -1.7647e+01 - acc: 0.0984
Epoch 3/9
 - 50s - loss: -1.7638e+01 - acc: 0.0944
Epoch 4/9
 - 50s - loss: -1.7620e+01 - acc: 0.0931
Epoch 5/9
 - 51s - loss: -1.7601e+01 - acc: 0.0913
Epoch 6/9
 - 49s - loss: -1.7585e+01 - acc: 0.0902
Epoch 7/9
 - 47s - loss: -1.7573e+01 - acc: 0.0908
Epoch 8/9
 - 45s - loss: -1.7566e+01 - acc: 0.0902
Epoch 9/9
 - 51s - loss: -1.7562e+01 - acc: 0.0909


<keras.callbacks.History at 0x1e619115b00>

# Evaluate and Analyze Model Performance

In [12]:
print("Evaluating model")
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print(score[1])

Evaluating model
0.10874606175173283


In [23]:
from textblob import TextBlob

def get_tweet_sentiment(tweet): 
        ''' 
        Utility function to classify sentiment of passed tweet 
        using textblob's sentiment method 
        '''
        # create TextBlob object of passed tweet text 
        analysis = TextBlob(tweet) 
        # set sentiment 
        if analysis.sentiment.polarity > 0: 
            return 0
        elif analysis.sentiment.polarity == 0: 
            return 2
        else: 
            return 4

print("Evaluating TextBlob")
right = 0
for i, tweet in tqdm(enumerate(x_test_pure)):
    if(get_tweet_sentiment(" ".join(tweet)) == y_test[i]):
        right += 1
print(right/len(x_test_pure))

Evaluating TextBlob


317400it [00:43, 7230.23it/s]


0.15982041587901702
