In [4]:
% matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import warnings
import numpy as np
from collections import defaultdict
from operator import itemgetter
import re

from utils import f1, text_to_wordlist, user_id
from utils import clean_reaction, sub_user, encoder_predict, user_dict, channel_mapping


# Suppress all warnings for the rest of the session to keep cell output readable.
warnings.filterwarnings("ignore")

# Main message store: the pickle unpacks into exactly two items, of which only
# the first (the message collection) is kept.
with open('input_data/dan_bot_messages.pkl', 'rb') as f:
    message_list, _ = pickle.load(f)

# Report how many messages were loaded.
print(len(message_list))

10975


In [5]:
# Build `store`: one record per cleaned message text, holding the reactions that
# `user_id` added to it, plus `emoji_count`, a tally of those reaction names.
store = {}
emoji_count = defaultdict(int)

for msg in message_list:
    # Events without a 'message' payload (e.g. file events) have nothing to record.
    if 'message' not in msg:
        continue
    try:
        msg_type, msg_info, channel = itemgetter('type', 'message', 'channel')(msg)
        # Every listed key must exist; a missing key raises KeyError and the
        # message is skipped by the handler below.
        msg_info_type, msg_text, msg_reactions, msg_time = itemgetter('type', 'text', 'reactions', 'ts')(msg_info)

        # Skip empty / whitespace-only messages. (The original compared the bound
        # method `msg_text.strip` to '', which is always unequal.)
        if msg_text.strip() == '':
            continue

        clean_msg = text_to_wordlist(sub_user(msg_text))
        # Keep only the reactions that `user_id` took part in.
        reactions = [
            clean_reaction(reaction['name'])
            for reaction in msg_reactions
            if user_id in reaction['users']
        ]

        # Assemble the complete entry before inserting it, so a failure part-way
        # through never leaves a half-populated record in `store`.
        entry = {
            'reactions': reactions,
            'time': msg_time,
            'type': msg_type,
            'joined_reactions': '|'.join(reactions),
            'channel': channel,
        }
        if 'user' in msg_info:
            entry['user'] = msg_info['user']
        elif 'bot_id' in msg_info:
            entry['user'] = msg_info['bot_id']
        store[clean_msg] = entry

        # Tally the reaction names themselves (the original iterated over the
        # entry's dict keys, counting 'reactions', 'time', ... instead).
        for emoji in reactions:
            emoji_count[emoji] += 1
    except Exception as e:
        # Best-effort parsing: report the problem and carry on with the next message.
        print(e)

# Flatten `store` into one record per (message, reaction) pair. Channels missing
# from `channel_mapping` are labelled 'private'; users missing from `user_dict`
# are labelled 'bot'.
long_store = [
    {
        'comment': comment,
        'emoji': reaction,
        'channel': channel_mapping.get(info['channel'], 'private'),
        'time': float(info['time']),
        'user': user_dict.get(info.get('user', 'None'), 'bot'),
        'type': info['type'],
    }
    for comment, info in store.items()
    for reaction in info['reactions']
]

# Interpret the numeric timestamps as seconds since the epoch, then keep only
# rows dated after 2016.
long_data = pd.DataFrame(long_store)
long_data['time'] = pd.to_datetime(long_data['time'], unit='s')
recent_rows = long_data['time'].dt.year > 2016
long_data = long_data[recent_rows]

print(long_data.shape)
long_data.tail()

'U9BT72G5T|tristan.findlay'
'U029T9GDF|brendon'
(7841, 6)


Unnamed: 0,channel,comment,emoji,time,type,user
7838,tech_articles,<http lifepluslinux blogspot co za 2017 01 ...,amasin,2017-01-26 12:58:05.000271082,message,kingori
7839,blockers,i threw my laptop out the window got a new on...,joy,2017-01-26 11:40:05.000296116,message,brendon
7840,blockers,i threw my laptop out the window got a new on...,raised_hands,2017-01-26 11:40:05.000296116,message,brendon
7841,london,the physicists will fight dirty,fire,2017-01-25 17:15:09.002577066,message,helen
7842,london,the physicists will fight dirty,gun,2017-01-25 17:15:09.002577066,message,helen


In [11]:
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

# Dropdown offering every channel present in the data plus an 'all' entry,
# which is the initial selection.
channel_choices = long_data['channel'].unique().tolist()
channel_choices.append('all')
w = widgets.Dropdown(
    description='Channel:',
    options=channel_choices,
    value='all',
    disabled=False,
)

def my_plot(w):
    """Draw a bar chart of the 35 most frequent emojis for channel `w`.

    `w` is the selected channel name; the value 'all' plots every channel.
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    if w == 'all':
        tmp = long_data
    else:
        tmp = long_data[long_data['channel'] == w]
    # Rows per emoji, largest first, limited to the top 35.
    counts = tmp.groupby('emoji').count()['time'].rename('count')
    top_counts = counts.sort_values(ascending=False).head(35)
    top_counts.plot(grid=True, ax=ax, kind='bar')
    ax.set_title('Emoji counts for channel: %s' % w)
    plt.show()

# Re-run `my_plot` whenever the dropdown selection changes.
interactive_plot = interactive(my_plot, w=w)
interactive_plot

A Jupyter Widget

In [8]:
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

# Dropdown offering every emoji present in the data plus an 'all' entry,
# which is the initial selection.
w = widgets.Dropdown(
    options=long_data['emoji'].unique().tolist() + ['all'],
    value='all',
    description='Emoji:',
    disabled=False,
)

def my_plot(w):
    """Plot the smoothed monthly usage count of emoji `w`.

    `w` is the selected emoji name; the value 'all' plots every emoji together.
    Counts are grouped per month and smoothed with a 2-period rolling mean.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    tmp = long_data if w == 'all' else long_data[long_data['emoji'] == w]
    monthly_counts = tmp.groupby(pd.Grouper(key='time', freq='M')).count()['emoji']
    # Draw on the axes created above (the same axes that receive the title)
    # rather than relying on pyplot's implicit "current axes".
    monthly_counts.rolling(2).mean().plot(grid=True, color='b', ax=ax)
    ax.set_title('Emoji Count Over Time')
    plt.show()

# Re-run `my_plot` whenever the dropdown selection changes.
interactive_plot = interactive(my_plot, w=w)
interactive_plot


A Jupyter Widget

In [9]:
# Number of rows per emoji (counting only the 'comment' column).
agg = long_data.groupby('emoji')['comment'].count()

# Keep emojis that occur more than 20 times and restrict the data to them.
emoji_counts = pd.DataFrame(agg.sort_values(ascending=False))
filtered_emojis = emoji_counts[emoji_counts['comment'] > 20]
df = long_data[long_data['emoji'].isin(filtered_emojis.index)]
print('filtering comments with most common emojis, down to ', df.shape)
filtered_emojis.head(10)

filering comments with most commonn emojis, down to  (5374, 6)


Unnamed: 0_level_0,comment
emoji,Unnamed: 1_level_1
joy,840
+1,566
true_story,450
fire,324
notsureif,279
squanchy,188
amasin,167
trophy,141
facepalm,134
100,132


In [10]:
# One row per distinct (comment, time, channel, type, user) combination, with one
# indicator column per emoji summed across that combination's rows.
message_keys = ['comment', 'time', 'channel', 'type', 'user']
emoji_indicators = pd.get_dummies(df, columns=['emoji'])
formatted_table = emoji_indicators.groupby(message_keys).sum().reset_index()
print(formatted_table.shape)
formatted_table.head()

(4633, 66)


Unnamed: 0,comment,time,channel,type,user,emoji_+1,emoji_100,emoji_amasin,emoji_beers,emoji_brendan,...,emoji_tinfoilhat,emoji_trollface,emoji_trophy,emoji_true_story,emoji_trump,emoji_wat,emoji_watchout_badass,emoji_whip,emoji_wow,emoji_wow_savage
0,,2017-04-03 09:01:27.502793073,general,message,tommy,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,thinking that one above t...,2018-10-22 12:50:34.000099897,productowners,message,sam,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,yes,2018-06-06 09:06:01.000616074,random,message,stuart,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1 pry main gt; x = a 2 b c 3 ...,2017-03-28 13:26:11.006568909,tricks_of_the_trade,message,thom,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,begin if stevebissett = = nil raise arg...,2017-05-22 10:59:04.779649973,capetown,message,eliza,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import OneHotEncoder
import keras
from utils import f1

max_words = 10000

# TF-IDF text features, with the vocabulary capped at `max_words` terms
vectorizer = TfidfVectorizer(max_features=max_words)
X = vectorizer.fit_transform(formatted_table.comment.tolist())

# Sentiment feature: the 'compound' polarity score of each comment, as a column vector
analyser = SentimentIntensityAnalyzer()
sentiment = formatted_table.comment.apply(lambda x: analyser.polarity_scores(x)['compound']).values.reshape(-1, 1)
print(sentiment.shape)

# Channel feature (each encoder is fitted once; the earlier duplicate fit was redundant)
channel_encoder = OneHotEncoder()
one_hot_channels = channel_encoder.fit_transform(formatted_table['channel'].values.reshape(-1, 1))

# User feature
user_encoder = OneHotEncoder()
one_hot_users = user_encoder.fit_transform(formatted_table['user'].values.reshape(-1, 1))

# Concat features & split train/test.
# Column order is: TF-IDF terms, sentiment, channel one-hot, user one-hot.
X = np.concatenate((X.toarray(), sentiment, one_hot_channels.toarray(), one_hot_users.toarray()), axis=1)
# Label columns are the emoji indicator columns; match on the 'emoji_' prefix so
# an unrelated column that merely contains the word 'emoji' is never picked up.
y_cols = [i for i in formatted_table.columns if i.startswith('emoji_')]
Y = formatted_table[y_cols]

print(X.shape, Y.shape)


# Fixed random_state keeps the 80/20 split reproducible between runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=1000)
print(Xtrain.shape, Ytrain.shape)
print(Xtest.shape, Ytest.shape)

(4633, 1)
(4633, 8594) (4633, 61)
(3706, 8594) (3706, 61)
(927, 8594) (927, 61)


In [25]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM
from keras.optimizers import SGD
from keras import regularizers
from keras.callbacks import EarlyStopping

# Single hidden-layer classifier: 512 ReLU units, 30% dropout, then one softmax
# output per label column.
# NOTE(review): the label columns are per-comment sums of emoji indicators, so a
# row can presumably carry more than one positive label; softmax with
# categorical_crossentropy assumes a single class per row -- confirm this is intended.
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(Xtrain.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(Ytrain.shape[1]))
model.add(Activation('softmax'))

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which emitted a stray "None" line).
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', f1])

batch_size = 32
epochs = 25

# Early stopping is constructed but intentionally not passed to fit(); add
# callbacks=[early_stopping] to the fit() call below to enable it.
early_stopping = EarlyStopping(patience=3)
# NOTE(review): the test split is also used as validation data here.
history = model.fit(Xtrain, Ytrain,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(Xtest, Ytest),
                   )
# evaluate() returns [loss, accuracy, f1], following the metrics order given to compile().
score = model.evaluate(Xtest, Ytest, batch_size=batch_size, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('Test F1', score[2])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 512)               4400640   
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 61)                31293     
_________________________________________________________________
activation_4 (Activation)    (None, 61)                0         
Total params: 4,431,933
Trainable params: 4,431,933
Non-trainable params: 0
_________________________________________________________________
None
Train on 3706 samples, validate on 927 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch

In [26]:
def process_pred(sentence, channel, user):
    """Return the emoji name the model scores highest for one message.

    The feature row is assembled in the same column order used for training:
    TF-IDF terms, compound sentiment score, encoded channel, encoded user.
    The returned string is the winning label column with its 'emoji_' prefix
    removed.
    """
    text_features = vectorizer.transform([sentence]).toarray()
    compound_score = analyser.polarity_scores(sentence)['compound']
    sentiment_feature = np.array([compound_score]).reshape(-1, 1)
    channel_features = encoder_predict(channel_encoder, channel).toarray()
    user_features = encoder_predict(user_encoder, user).toarray()

    new_x = np.concatenate(
        (text_features, sentiment_feature, channel_features, user_features),
        axis=1,
    )
    pred = model.predict(new_x)
    best_index = np.argmax(pred)
    return y_cols[best_index].split('emoji_')[-1]

# Hand-written example messages used to eyeball the model's predictions.
test_sentences = [
    'brexit makes me sad',
    'great job on getting autocoding out, you massive nerds',
    'on leave that week',
    'production is down',
    'just be better',
    'work harder',
    'rocket to production',
    'thats just wrong',
    'windows over mac',
    'you are a bell end',
    "it's not unreasonable to have a w9am meeting",
    "My understanding from talking to different folks is the issue is due to the different text length",
    '@steven.perianen IBM is loving the new verbatim auto coding!',
    'heyhey @daniel.baark -> https://zigroup.atlassian.net/browse/SP-5320',
    "The new DS review time clashes with another meeting",
    "It's not like me to skip meals",
    "There has been a complaint about people using the putney office and keeping the door propped open. Can people make sure the door isn't kept open when it shouldn't be.",
    "Ahh we call them a Microsoft Product Team",
]

# Print each sentence next to its predicted emoji, always using the same
# channel ('london') and user ('fish').
for sent in test_sentences:
    predicted_emoji = process_pred(sent, 'london', 'fish')
    print(sent, '->', predicted_emoji)

brexit makes me sad -> joy
great job on getting autocoding out, you massive nerds -> joy
on leave that week -> +1
production is down -> notsureif
just be better -> true_story
work harder -> joy
rocket to production -> eggplant
thats just wrong -> fire
windows over mac -> wat
you are a bell end -> fire
it's not unreasonable to have a w9am meeting -> joy
My understanding from talking to different folks is the issue is due to the different text length -> got_wood
@steven.perianen IBM is loving the new verbatim auto coding! -> trophy
heyhey @daniel.baark -> https://zigroup.atlassian.net/browse/SP-5320 -> +1
The new DS review time clashes with another meeting -> joy
It's not like me to skip meals -> got_wood
There has been a complaint about people using the putney office and keeping the door propped open. Can people make sure the door isn't kept open when it shouldn't be. -> notsureif
Ahh we call them a Microsoft Product Team -> joy


In [39]:
# Persist the fitted preprocessing objects and the label column names.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically (the original left four open file objects unclosed).
with open("input_data/tfidf.pickle", "wb") as fh:
    pickle.dump(vectorizer, fh)
with open("input_data/channel_enc.pickle", "wb") as fh:
    pickle.dump(channel_encoder, fh)
with open("input_data/user_enc.pickle", "wb") as fh:
    pickle.dump(user_encoder, fh)
with open("input_data/y_cols.pickle", "wb") as fh:
    pickle.dump(y_cols, fh)

# Save the trained model in HDF5 format next to the notebook.
model.save('my_model.h5')