In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
from nltk import word_tokenize
from gensim.models import word2vec

In [2]:
posts_df = pd.read_csv('data/posts.csv')
posts_df.head()

Unnamed: 0.1,Unnamed: 0,age,comment,likes/views,link,promo,user
0,0,2 DAYS AGO,found my new favorite park!,"405,059 likes",/BytNlrQhRx8/,0,chrissyteigen
1,1,3 DAYS AGO,Happy bebe!,1739218,/Byqz8uZh73s/,0,chrissyteigen
2,2,5 DAYS AGO,coated in a paste of fresh garlic and filled w...,2931603,/BymErW1B9eL/,0,chrissyteigen
3,3,5 DAYS AGO,this kid,"371,095 likes",/Byl-aHjBXFX/,0,chrissyteigen
4,4,JUNE 8,home tomorrow 😩,"859,039 likes",/ByduG0BB_A3/,0,chrissyteigen


In [22]:
posts_df.shape

(1065, 7)

In [3]:
target = posts_df.promo
data = posts_df['comment'].map(word_tokenize).values

In [4]:
total_vocabulary = set(word for comment in data for word in comment)
print("There are {} unique tokens in our dataset.".format(len(total_vocabulary)))

There are 8994 unique tokens in our dataset.


In [5]:
glove = {}
with open('data/glove_data/glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [6]:
glove['cool']

array([-0.65575 ,  0.45659 , -0.16748 , -0.58345 , -0.23073 , -0.78348 ,
       -0.23166 , -0.022452, -0.57968 ,  0.526   , -0.2214  ,  0.17614 ,
        0.46513 ,  0.79142 ,  0.017403,  1.0879  ,  0.24418 ,  0.27523 ,
       -0.26452 , -1.0389  ,  0.014045,  0.68459 ,  0.98151 ,  0.21561 ,
        0.36278 , -0.51819 , -0.40552 ,  1.349   ,  1.5399  ,  0.60541 ,
        2.6604  ,  0.074535, -0.076292,  0.12501 , -0.026268,  0.16843 ,
       -0.41844 ,  0.44505 ,  0.25033 , -1.1557  ,  0.24575 ,  0.41847 ,
       -0.10633 , -0.28433 ,  0.51215 ,  0.51371 ,  0.53004 , -0.889   ,
        0.054744,  0.78793 ], dtype=float32)

#### Create Mean Word Embeddings

In [7]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # It can't be used in a sklearn Pipeline. 
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

#### Using Pipelines

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

rf =  Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
              ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
              ('Logistic Regression', LogisticRegression())])

In [9]:
models = [('Random Forest', rf),
          ("Support Vector Machine", svc),
          ("Logistic Regression", lr)]

In [10]:
scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model, in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [11]:
scores

[('Random Forest', 0.6281722128962179),
 ('Support Vector Machine', 0.5624321121753728),
 ('Logistic Regression', 0.5877851288634344)]

#### Deep Learning with word embeddings

In [13]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

In [14]:
y = pd.get_dummies(target).values

In [15]:
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(posts_df.comment))
list_tokenized_headlines = tokenizer.texts_to_sequences(posts_df.comment)
X_t = sequence.pad_sequences(list_tokenized_headlines, maxlen=100)

In [16]:
from tensorflow.python.framework import ops
ops.reset_default_graph()

In [17]:
embedding_size = 128
input_ = Input(shape=(100,))
x = Embedding(20000, embedding_size)(input_)
x = LSTM(25, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
# There are 2 different possible classes, so we use 2 neurons in our output layer
x = Dense(2, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 25)           15400     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 25)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                1300      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [20]:
model.fit(X_t, y, epochs=2, batch_size=32, validation_split=0.1)

Train on 958 samples, validate on 107 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1a31760390>

In [21]:
model.fit(X_t, y, epochs=5, batch_size=32, validation_split=0.1)

Train on 958 samples, validate on 107 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a32c8fcf8>

Traceback (most recent call last):
  File "/Users/sherzyang/anaconda3/lib/python3.7/site-packages/pkg_resources/__init__.py", line 583, in _build_master
    ws.require(__requires__)
  File "/Users/sherzyang/anaconda3/lib/python3.7/site-packages/pkg_resources/__init__.py", line 900, in require
    needed = self.resolve(parse_requirements(requirements))
  File "/Users/sherzyang/anaconda3/lib/python3.7/site-packages/pkg_resources/__init__.py", line 791, in resolve
    raise VersionConflict(dist, req).with_context(dependent_req)
pkg_resources.VersionConflict: (pip 19.1.1 (/Users/sherzyang/anaconda3/lib/python3.7/site-packages), Requirement.parse('pip==18.1'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/sherzyang/anaconda3/bin/pip3", line 6, in <module>
    from pkg_resources import load_entry_point
  File "/Users/sherzyang/anaconda3/lib/python3.7/site-packages/pkg_resources/__init__.py", line 3241, in

In [5]:
s = u'\U0001f600'
import emoji
from emoji.unicode_codes import UNICODE_EMOJI

print (UNICODE_EMOJI[s])


ModuleNotFoundError: No module named 'emoji'