In [1]:
import math
import string
import re
from collections import defaultdict
import random
import gc
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import pyLDAvis.gensim

import os
import tempfile
import logging

TEMP_FOLDER = tempfile.gettempdir()
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))



Folder "C:\Users\GyorgyM\AppData\Local\Temp" will be used to save temporary dictionary and corpus.


In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL)

In [3]:
tweets_csv = pd.read_csv("Data/twcs.csv")

# nltk.download("stopwords")
# nltk.download("wordnet")

tweets_csv.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [4]:
tweets = tweets_csv.copy()

first_mask = tweets.in_response_to_tweet_id.isnull()
first_tweet_ids = tweets.loc[first_mask, "tweet_id"].values
tweet_conversation = pd.DataFrame({"tweet_id": first_tweet_ids, 
                                   "conversation_id": np.arange(first_tweet_ids.size) + 1}, 
                                  dtype="int64")

unmatched_tweets = tweets.loc[~first_mask, ["tweet_id", "in_response_to_tweet_id"]].astype("int64")

results = tweet_conversation.copy()

while not tweet_conversation.empty:
    merged = pd.merge(unmatched_tweets, tweet_conversation, how="left", left_on="in_response_to_tweet_id", right_on="tweet_id", 
                      suffixes=("", "_parent"))
    matched_mask = merged.tweet_id_parent.notnull()
    tweet_conversation = merged.loc[matched_mask, tweet_conversation.columns]
    results = results.append(tweet_conversation.copy(), ignore_index=True)
    unmatched_tweets = merged.loc[~matched_mask, unmatched_tweets.columns]

results["conversation_id"] = results.conversation_id.astype("int64")

tweets = pd.merge(tweets, results, on="tweet_id")

channels = tweets[~tweets.author_id.str.match(r"[0-9]+$")].groupby("conversation_id")["author_id"].first().to_frame("channel")
tweets = pd.merge(tweets, channels, left_on="conversation_id", right_index=True)

tweets.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,conversation_id,channel
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0,1,sprintcare
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0,1,sprintcare
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0,1,sprintcare
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0,1,sprintcare
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0,1,sprintcare


In [5]:
tweets.groupby("channel").count().sort_values("conversation_id", ascending=False)

Unnamed: 0_level_0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,conversation_id
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AmazonHelp,370222,370222,370222,370222,370222,251311,288378,370222
AppleSupport,237443,237443,237443,237443,237443,149034,157129,237443
Uber_Support,127801,127801,127801,127801,127801,78613,86038,127801
SpotifyCares,91459,91459,91459,91459,91459,58757,63278,91459
Delta,86553,86553,86553,86553,86553,50899,60595,86553
AmericanAir,85822,85822,85822,85822,85822,54815,59757,85822
TMobileHelp,79242,79242,79242,79242,79242,49899,56762,79242
Tesco,71824,71824,71824,71824,71824,39682,55236,71824
comcastcares,71468,71468,71468,71468,71468,42432,47660,71468
VirginTrains,65147,65147,65147,65147,65147,44670,50412,65147


In [6]:
channel_tweets = tweets[tweets.channel == "Uber_Support"]
channel_conversations = channel_tweets.groupby("conversation_id")["text"].apply(lambda s: s.str.cat(sep=" "))
channel_conversations.head()

conversation_id
113    @115872 Happy to follow up! Contact us via htt...
114    @115874 We're here to help! Send us a note her...
115    @115875 We’re here to help, Travis! Send us a ...
116    @115876 For more info about UberEats availabil...
117    @115878 We have received your DM and will foll...
Name: text, dtype: object

In [76]:
logging.getLogger().setLevel(logging.CRITICAL)

tokenizer = RegexpTokenizer(r"[\w-]+")
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
url_pattern = re.compile(r"https\:\/\/t\.co\/[a-zA-Z0-9]+")
num_pattern = re.compile(r"[0-9-]+$")

def get_cleaned_doc(doc):
    cleaned = url_pattern.sub("?", doc)
    cleaned = cleaned.lower()
    cleaned = tokenizer.tokenize(cleaned)
    cleaned = [word for word in cleaned if num_pattern.match(word) is None]
    cleaned = [lemmatizer.lemmatize(word) for word in cleaned]
    cleaned = [word for word in cleaned if word not in stop]
    return cleaned

docs = [get_cleaned_doc(doc) for doc in channel_conversations]

# bigram = models.Phrases(docs, min_count=20)
# for idx in range(len(docs)):
#     for token in bigram[docs[idx]]:
#         if "_" in token:
#             docs[idx].append(token)

token_freq = defaultdict(int)
for doc in docs:
    for token in doc:
        token_freq[token] += 1

print("number of unique tokens: " + str(len(token_freq)))
print("tokens used only once: " + str(sum(freq == 1 for freq in token_freq.values())))

docs = [[token for token in doc if token_freq[token] > 1] for doc in docs]

dictionary = corpora.Dictionary(docs)
dictionary.filter_extremes(no_below=40)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]

number of unique tokens: 29231
tokens used only once: 16275


In [19]:
rand_ind = np.random.randint(len(docs))
print(rand_ind)
print(channel_conversations.iloc[rand_ind])
print(docs[rand_ind])

20410
@465511 Happy to help. Please go here https://t.co/e8HVorX5rh, so we can take  a look. @Uber_Support Had a driver start the trip and head towards my destination, but never picked me up!
['happy', 'help', 'please', 'go', 'take', 'look', 'uber_support', 'driver', 'start', 'trip', 'head', 'towards', 'destination', 'never', 'picked']


In [22]:
logging.getLogger().setLevel(logging.DEBUG)

tfidf = models.TfidfModel(doc_term_matrix, dictionary=dictionary)
lda = models.LdaModel(doc_term_matrix, id2word=dictionary, num_topics=20, alpha='auto', iterations=200, passes=1)


In [None]:
prepared = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary, mds="tsne")
pyLDAvis.display(prepared)

In [29]:
def eval_config(corpus, texts, n_topics, k=10, iterations=100, passes=1, multicore=False):
    def get_lda_model(train_corpus):
        if multicore:
            return models.LdaMulticore(train_corpus, id2word=dictionary, num_topics=n_topics, alpha='asymmetric', eta='auto', 
                                       iterations=iterations, passes=passes)
        return models.LdaModel(train_corpus, id2word=dictionary, num_topics=n_topics, alpha='auto', eta='auto', 
                               iterations=iterations, passes=passes)
    
    random.shuffle(corpus)
    perps = []
    for i in range(k):
        val_from = round(i / k * len(corpus)) if k > 1 else 0
        val_to = round((i + 1) / k * len(corpus)) if k > 1 else 0
        train = corpus[:val_from] + corpus[val_to:]
        val = corpus[val_from:val_to]
        
        lda = get_lda_model(train)
        perps.append(lda.log_perplexity(val if k > 1 else train))
        print("fold " + str(i + 1) + "/" + str(k) + " finished")
    
    lda = get_lda_model(corpus)
    umass = models.CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
    cv = models.CoherenceModel(model=lda, texts=texts, coherence='c_v')
    
    return (perps, umass.get_coherence(), cv.get_coherence(), lda)

In [None]:
logging.getLogger().setLevel(logging.CRITICAL)
topics = [2, 5, 10, 20, 40]
results = [eval_config(doc_term_matrix, docs, n_topics) for n_topics in topics]


In [87]:
def wglobal(docfreq, totaldocs):
    return totaldocs / docfreq

tfidf = models.TfidfModel(doc_term_matrix, dictionary=dictionary, normalize=False, wglobal=wglobal)
lda = models.LdaModel(tfidf[doc_term_matrix], id2word=dictionary, num_topics=10, alpha='auto', iterations=400, passes=1)
prepared = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary, mds="tsne")
pyLDAvis.display(prepared)

In [13]:
lda.show_topics(num_topics=20)

[(0,
  '0.013*"minute" + 0.013*"note" + 0.011*"charged" + 0.011*"wa" + 0.011*"cancel" + 0.011*"driver" + 0.011*"connect" + 0.010*"min" + 0.010*"fee" + 0.010*"team"'),
 (1,
  '0.017*"krispy" + 0.015*"kreme" + 0.006*"ya" + 0.006*"overcharged" + 0.005*"debit" + 0.005*"bill" + 0.005*"driver" + 0.005*"nov" + 0.005*"starting" + 0.005*"event"'),
 (2,
  '0.018*"phone" + 0.011*"left" + 0.010*"account" + 0.009*"contact" + 0.009*"help" + 0.009*"lost" + 0.008*"number" + 0.008*"password" + 0.008*"uber" + 0.007*"driver"'),
 (3,
  '0.010*"wa" + 0.009*"driver" + 0.009*"charged" + 0.008*"note" + 0.008*"demand" + 0.007*"trip" + 0.007*"rating" + 0.007*"twice" + 0.007*"dozen" + 0.007*"team"'),
 (4,
  '0.014*"forgot" + 0.013*"mcdonalds" + 0.009*"mcdonald" + 0.008*"iphone" + 0.008*"city" + 0.007*"arriving" + 0.007*"drink" + 0.007*"window" + 0.007*"fry" + 0.007*"create"'),
 (5,
  '0.027*"fun" + 0.017*"delivered" + 0.014*"hungry" + 0.013*"food" + 0.011*"smh" + 0.010*"ordering" + 0.010*"order" + 0.009*"accepte

In [34]:
gc.collect()

511