In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL)

In [2]:
import nltk
# nltk.download("stopwords")
# nltk.download("wordnet")

In [3]:
import pandas as pd
tweets_csv = pd.read_csv("data/twcs.csv")
tweets_csv.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [4]:
import numpy as np

tweets = tweets_csv.copy()

first_mask = tweets.in_response_to_tweet_id.isnull()
first_tweet_ids = tweets.loc[first_mask, "tweet_id"].values
tweet_conversation = pd.DataFrame({"tweet_id": first_tweet_ids, 
                                   "conversation_id": np.arange(first_tweet_ids.size) + 1}, 
                                  dtype="int64")

unmatched_tweets = tweets.loc[~first_mask, ["tweet_id", "in_response_to_tweet_id"]].astype("int64")

results = tweet_conversation.copy()

while not tweet_conversation.empty:
    merged = pd.merge(unmatched_tweets, tweet_conversation, how="left", left_on="in_response_to_tweet_id", right_on="tweet_id", 
                      suffixes=("", "_parent"))
    matched_mask = merged.tweet_id_parent.notnull()
    tweet_conversation = merged.loc[matched_mask, tweet_conversation.columns]
    results = results.append(tweet_conversation.copy(), ignore_index=True)
    unmatched_tweets = merged.loc[~matched_mask, unmatched_tweets.columns]

results["conversation_id"] = results.conversation_id.astype("int64")

tweets = pd.merge(tweets, results, on="tweet_id")

channels = tweets[~tweets.author_id.str.match(r"[0-9]+$")].groupby("conversation_id")["author_id"].first().to_frame("channel")
tweets = pd.merge(tweets, channels, left_on="conversation_id", right_index=True)

tweets.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,conversation_id,channel
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0,1,sprintcare
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0,1,sprintcare
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0,1,sprintcare
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0,1,sprintcare
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0,1,sprintcare


In [5]:
tweets.groupby("channel").count().sort_values("conversation_id", ascending=False)

Unnamed: 0_level_0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,conversation_id
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AmazonHelp,370222,370222,370222,370222,370222,251311,288378,370222
AppleSupport,237443,237443,237443,237443,237443,149034,157129,237443
Uber_Support,127801,127801,127801,127801,127801,78613,86038,127801
SpotifyCares,91459,91459,91459,91459,91459,58757,63278,91459
Delta,86553,86553,86553,86553,86553,50899,60595,86553
AmericanAir,85822,85822,85822,85822,85822,54815,59757,85822
TMobileHelp,79242,79242,79242,79242,79242,49899,56762,79242
Tesco,71824,71824,71824,71824,71824,39682,55236,71824
comcastcares,71468,71468,71468,71468,71468,42432,47660,71468
VirginTrains,65147,65147,65147,65147,65147,44670,50412,65147


In [8]:
channel_tweets = tweets.groupby("channel")["text"].apply(lambda s: s.str.cat(sep=" "))
channel_tweets.head()

channel
ATT               @118611 Glad you are getting excited! Be sure ...
ATVIAssist        @115753 It appears this has been removed. Than...
AWSSupport        @123643 I've added your +1 for wanting to see ...
AdobeCare         @115767 Sorry that this isn't more simple... l...
AirAsiaSupport    @115797 Hi Loreen,kindly DM us your booking nu...
Name: text, dtype: object

In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from collections import defaultdict
from gensim.corpora import Dictionary

tokenizer = RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
url_pattern = re.compile(r"https\:\/\/t\.co\/[a-zA-Z0-9]+")
num_pattern = re.compile(r"[0-9]+$")

def get_cleaned_doc(doc):
    cleaned = url_pattern.sub("?", doc)
    cleaned = cleaned.lower()
    cleaned = tokenizer.tokenize(cleaned)
    cleaned = [word for word in cleaned if num_pattern.match(word) is None]
    cleaned = [lemmatizer.lemmatize(word) for word in cleaned]
    cleaned = [word for word in cleaned if word not in stop]
    return cleaned

docs = [get_cleaned_doc(doc) for doc in channel_tweets]

token_freq = defaultdict(int)
for doc in docs:
    for token in doc:
        token_freq[token] += 1

print("number of unique tokens: " + str(len(token_freq)))
print("tokens used only once: " + str(sum(freq == 1 for freq in token_freq.values())))

docs = [[token for token in doc if token_freq[token] > 1] for doc in docs]

dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=1, keep_n=None)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]



number of unique tokens: 332278
tokens used only once: 212504


NameError: name 'corpora' is not defined

In [20]:
from gensim.models import TfidfModel
tfidf = TfidfModel(doc_term_matrix, dictionary=dictionary)
tfidf[doc_term_matrix[0]]

[(0, 0.0016039378922466023),
 (1, 0.0005907894686011224),
 (2, 0.0002872155960091963),
 (3, 0.0003609220130371575),
 (4, 0.000552176601691905),
 (5, 0.0008358000153539123),
 (6, 0.0018366571369273655),
 (7, 0.00022268985356619156),
 (8, 0.00023449422563480077),
 (9, 0.0005907894686011224),
 (10, 0.0011534823914110166),
 (11, 0.0016716000307078246),
 (12, 0.0008885213857283079),
 (13, 0.0006122190456424552),
 (14, 0.000718552902536512),
 (15, 0.0010021763161985192),
 (16, 0.0002872155960091963),
 (17, 0.0006880861224886041),
 (18, 0.0005907894686011224),
 (19, 0.0007526118649316089),
 (20, 0.000660525567509399),
 (21, 0.000760413910157694),
 (22, 0.000718552902536512),
 (23, 0.00031768237605710425),
 (24, 0.0006508907640752898),
 (25, 0.0008885213857283079),
 (26, 0.00042312511680589536),
 (27, 0.0009530471281713128),
 (28, 0.0011534823914110166),
 (29, 0.002306964782822033),
 (30, 0.0015052237298632178),
 (31, 0.0011534823914110166),
 (32, 0.0008358000153539123),
 (33, 0.00025965504102

In [18]:
from gensim.models import LdaModel
logging.getLogger().setLevel(logging.DEBUG) # checking convergence
lda = LdaModel(doc_term_matrix, id2word=dictionary, num_topics=108, iterations=100, passes=10)
logging.getLogger().setLevel(logging.CRITICAL)

2018-08-10 16:36:41,893 : INFO : using symmetric alpha at 0.009259259259259259
2018-08-10 16:36:41,895 : INFO : using symmetric eta at 0.009259259259259259
2018-08-10 16:36:41,917 : INFO : using serial LDA version on this node
2018-08-10 16:36:43,423 : INFO : running online (multi-pass) LDA training, 108 topics, 10 passes over the supplied corpus of 108 documents, updating model once every 108 documents, evaluating perplexity every 108 documents, iterating 100x with a convergence threshold of 0.001000
2018-08-10 16:36:44,180 : DEBUG : bound: at document #0
2018-08-10 16:36:52,778 : INFO : -19.517 per-word bound, 750016.3 perplexity estimate based on a held-out corpus of 108 documents with 5390316 words
2018-08-10 16:36:52,780 : INFO : PROGRESS: pass 0, at document #108/108
2018-08-10 16:36:52,781 : DEBUG : performing inference on a chunk of 108 documents
2018-08-10 16:36:58,843 : DEBUG : 0/108 documents converged within 100 iterations
2018-08-10 16:36:58,890 : DEBUG : updating topics
2

2018-08-10 16:38:12,563 : INFO : topic #50 (0.009): 0.224*"argoshelpers" + 0.049*"argo" + 0.018*"marksandspencer" + 0.015*"postcode" + 0.009*"reservation" + 0.009*"catalogue" + 0.006*"georgia" + 0.006*"alyson" + 0.005*"xbox" + 0.005*"alice"
2018-08-10 16:38:12,567 : INFO : topic #54 (0.009): 0.130*"oppo" + 0.095*"oppocarein" + 0.057*"rishi" + 0.029*"f3" + 0.025*"pooja" + 0.016*"f1s" + 0.014*"f5" + 0.014*"nougat" + 0.012*"oppof5" + 0.011*"marshmallow"
2018-08-10 16:38:12,571 : INFO : topic #33 (0.009): 0.089*"cafe" + 0.074*"bakery" + 0.057*"askpanera" + 0.040*"boostcare" + 0.024*"sandwich" + 0.022*"salad" + 0.017*"chicken" + 0.015*"soup" + 0.013*"bread" + 0.013*"panera"
2018-08-10 16:38:12,575 : INFO : topic #55 (0.009): 0.010*"azuresupport" + 0.008*"askebay" + 0.006*"sainsburys" + 0.005*"gwrhelp" + 0.004*"seller" + 0.003*"airtel_care" + 0.003*"ebay" + 0.003*"safaricom_care" + 0.003*"azure" + 0.002*"arbyscares"
2018-08-10 16:38:12,636 : INFO : topic diff=inf, rho=0.408248
2018-08-10 16:

2018-08-10 16:39:43,294 : INFO : topic #49 (0.009): 0.160*"adobecare" + 0.059*"adobe" + 0.036*"lightroom" + 0.028*"sv" + 0.025*"aj" + 0.025*"raj" + 0.024*"photoshop" + 0.019*"looping" + 0.018*"nr" + 0.017*"premiere"
2018-08-10 16:39:43,298 : INFO : topic #45 (0.009): 0.134*"fargo" + 0.103*"ask_wellsfargo" + 0.049*"eb" + 0.031*"tl" + 0.015*"mh" + 0.014*"cl" + 0.014*"wellsfargo" + 0.013*"nc" + 0.013*"lc" + 0.012*"banker"
2018-08-10 16:39:43,301 : INFO : topic #58 (0.009): 0.010*"centurylinkhelp" + 0.008*"askplaystation" + 0.005*"asktarget" + 0.004*"applesupport" + 0.003*"centurylink" + 0.003*"walmart" + 0.002*"yngwie" + 0.002*"lexie" + 0.002*"ps4" + 0.002*"askpaypal"
2018-08-10 16:39:43,306 : INFO : topic #22 (0.009): 0.247*"tesco" + 0.026*"supplier" + 0.014*"barcode" + 0.010*"postcode" + 0.010*"packaging" + 0.006*"slot" + 0.006*"clubcard" + 0.006*"grocery" + 0.005*"chicken" + 0.005*"moneycard"
2018-08-10 16:39:43,362 : INFO : topic diff=inf, rho=0.301511


In [19]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary, mds="tsne")