In [1]:
# This notebook groups tweets into conversations and groups the
# conversations by company and saves this info in "data/comp_cll_df.csv"

# You need to first download twcs.zip (the raw data file containing
# all the tweets) from 
# https://www.kaggle.com/thoughtvector/customer-support-on-twitter/downloads/twcs.zip/10.
# You may have to create a free Kaggle account to do this.
# Extract twcs.csv to the data/ folder.

In [3]:
## importing modules

%matplotlib notebook
import os
from collections import defaultdict
from fnmatch import fnmatch
from glob import glob
from pprint import pprint
from shutil import rmtree

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dill import load, dump


In [6]:
## read in data

# NOTE: nrows=1000 means only the first 1000 rows of the raw file are read.
df_raw = pd.read_csv('data/twcs.csv', nrows=1000)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
tweet_id                   1000 non-null int64
author_id                  1000 non-null object
inbound                    1000 non-null bool
created_at                 1000 non-null object
text                       1000 non-null object
response_tweet_id          679 non-null object
in_response_to_tweet_id    747 non-null float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 47.9+ KB
None


Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [7]:
## clean data a bit and SAVE

# Parse the raw 'created_at' strings into timestamps, keep only the columns
# used later (with 'time' first), order the tweets chronologically and index
# them by tweet_id -- all in one chain, so no intermediate frame is mutated.
parsed_time = pd.to_datetime(df_raw['created_at'], infer_datetime_format=True)
kept_cols = ['time', 'author_id', 'text', 'tweet_id', 'response_tweet_id',
             'in_response_to_tweet_id', 'inbound']
df = (
    df_raw
    .assign(time=parsed_time)[kept_cols]
    .sort_values(by="time")
    .set_index('tweet_id')
)

# collapse every run of whitespace (newlines included) into a single space
# to avoid problems with saving as csv
df["text"] = df["text"].str.replace(r"\s+", " ", regex=True)

# save
df.to_csv("data/twcs_clean.csv")

In [8]:
### ENTRY POINT. load clean csv

# tweet_id is used as the index straight from the read
df = pd.read_csv('data/twcs_clean.csv', index_col="tweet_id")
df["time"] = pd.to_datetime(df["time"], infer_datetime_format=True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 348 to 276
Data columns (total 6 columns):
time                       1000 non-null datetime64[ns]
author_id                  1000 non-null object
text                       1000 non-null object
response_tweet_id          679 non-null object
in_response_to_tweet_id    747 non-null float64
inbound                    1000 non-null bool
dtypes: bool(1), datetime64[ns](1), float64(1), object(3)
memory usage: 47.9+ KB
None


Unnamed: 0_level_0,time,author_id,text,response_tweet_id,in_response_to_tweet_id,inbound
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
348,2011-08-29 03:20:05,115798,"Lost your booking number? No worries, just dro...","349,350,351,352,347,353,354,355,356,357,358,35...",,True
611,2016-08-06 01:31:50,115818,@DELTA i booked my flight using delta amex car...,609,,True
609,2016-08-06 01:44:03,Delta,"@115818 Glad to check. Pls, DM your confirmati...",610,611.0,False
752,2017-10-12 10:11:50,115867,@AppleSupport my Apple TV works fine with my p...,751,,True
293,2017-10-18 14:07:45,115769,Whoa! Come along with Lightroom’s own Ben Ward...,292,,True


In [9]:
## make tweet chains for individual conversations

# save start_locs of convos
# a convo starts when 'in_response_to' is nan
def is_start_of_convo(rowtuple):
    """Tell whether a row tuple (as yielded by ``itertuples``) opens a conversation.

    A tweet is treated as a conversation starter when its
    ``in_response_to_tweet_id`` attribute is not a finite number
    (e.g. NaN, i.e. it does not reply to any tweet).

    Returns
    -------
    bool
        True for a conversation starter, False otherwise.
    """
    replied_to = rowtuple.in_response_to_tweet_id
    return not np.isfinite(replied_to)
    
def get_start_locs(df = None):
    """Return the index labels of all conversation-opening tweets.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Tweet table to scan; each row is tested with ``is_start_of_convo``.
        When omitted, the notebook-level ``df`` is looked up at call time
        instead of being frozen when this function is defined, so re-running
        a cell that rebinds ``df`` is picked up without re-defining the
        function.

    Returns
    -------
    list
        Index labels of the rows that start a conversation, in row order.
    """
    if df is None:
        # the parameter shadows the global of the same name, hence globals()
        df = globals()["df"]

    print("Computing convo_start_locs ...")

    # keep the index label of every row that qualifies as a starter
    return [row.Index for row in df.itertuples()
            if is_start_of_convo(row)]
    
# index labels of every conversation-opening tweet, computed once for reuse
convo_start_locs = get_start_locs()

n_convos = len(convo_start_locs)
print('Total convos =', n_convos)
# print('convo_start_locs =', convo_start_locs)

# get the locs (and company) for a conversation given a start_loc
def get_convo_locs_n_comp(convo_start_loc, df = None):
    """Collect the tweet ids of one conversation and the company involved.

    Starting from ``convo_start_loc`` the ids listed in each tweet's
    ``response_tweet_id`` entry (a comma-separated string such as "45,65")
    are followed level by level. Every id is visited at most once, so a
    cyclic or repeated reference cannot cause endless work or duplicates.

    Parameters
    ----------
    convo_start_loc : int
        Index label of the tweet that opens the conversation.
    df : pandas.DataFrame, optional
        Tweet table indexed by tweet id with the columns ``time``,
        ``author_id`` and ``response_tweet_id``. When omitted, the
        notebook-level ``df`` is looked up at call time instead of being
        frozen when this function is defined.

    Returns
    -------
    (list, object)
        The conversation's index labels that exist in ``df``, ordered by
        ``time``, and the first ``author_id`` (in that order) that cannot be
        converted with ``int``; ``np.nan`` if every author id converts.
    """
    if df is None:
        # the parameter shadows the global of the same name, hence globals()
        df = globals()["df"]

    def get_response_locs(cur_loc):
        """Return the reply ids listed for ``cur_loc`` (empty list if none)."""
        if cur_loc not in df.index:
            return []
        resp_tw_ids_entry = df.loc[cur_loc, 'response_tweet_id']
        if not isinstance(resp_tw_ids_entry, str):
            # missing entry (NaN) -> no replies recorded
            return []
        # splits up something like "45,65" into a list of locs: [45, 65]
        return [int(tok) for tok in resp_tw_ids_entry.split(",")]

    # iterative breadth-first walk over the reply tree
    seen = {convo_start_loc}
    convo_locs = [convo_start_loc]
    frontier = [convo_start_loc]
    while frontier:
        next_frontier = []
        for cur_loc in frontier:
            for next_loc in get_response_locs(cur_loc):
                if next_loc not in seen:
                    seen.add(next_loc)
                    next_frontier.append(next_loc)
        convo_locs += next_frontier
        frontier = next_frontier

    # take out non-existent locs (ids referenced but absent from df)
    convo_locs = [loc for loc in convo_locs if loc in df.index]

    # make df and sort by time (sort returns a new frame; nothing is
    # mutated in place on the selection)
    convo_df = df.loc[convo_locs].sort_values(by = 'time')
    convo_locs = list(convo_df.index)

    # get company: the first author id that int() rejects
    company = np.nan
    for author_id in convo_df["author_id"]:
        try:
            # converts with int -> keep hunting
            int(author_id)
        except (ValueError, TypeError):
            # does not convert -> save
            company = author_id
            break

    return convo_locs, company

start_loc = 18
convo_locs, company = get_convo_locs_n_comp(start_loc)
# company is np.nan when no company author was found in the conversation;
# format() renders that, whereas "..." + company would raise a TypeError
print("\nExample convo for \"{}\":".format(company))
# print(df.loc[convo_locs][["author_id", "time", "text"]])

# temporary
print(df.loc[convo_locs, ["text"]].values)

Computing convo_start_locs ...
Total convos = 253

Example convo for "sprintcare":
[['@115714 y’all lie about your “great” connection. 5 bars LTE, still won’t load something. Smh.']
 ["@115713 H there! We'd definitely like to work with you on this, how long have you been experiencing this issue? -AA"]
 ['@sprintcare Since I signed up with you....Since day 1']
 ["@115713 We understand your concerns and we'd like for you to please send us a Direct Message, so that we can further assist you. -AA"]
 ['@sprintcare You gonna magically change your connectivity for me and my whole family ? 🤥 💯']
 ['@115713 This is saddening to hear. Please shoot us a DM, so that we can look into this for you. -KC']]


In [10]:
## SLOW! get convo_loc_lists for all start_locs

# cll: convo_loc_list
# one (cll, company) tuple per conversation-opening tweet
cll_comp_tups = [get_convo_locs_n_comp(start) for start in convo_start_locs]

preview = cll_comp_tups[:5]
pprint(preview)
print(" ...")

[([348,
   347,
   343,
   334,
   344,
   375,
   374,
   332,
   333,
   335,
   336,
   337,
   338,
   339,
   340,
   341,
   342],
  'AirAsiaSupport'),
 ([611, 609, 610], 'Delta'),
 ([752, 751, 750], 'AppleSupport'),
 ([293, 292, 291], 'AdobeCare'),
 ([614, 612, 613], 'McDonalds')]
 ...


In [11]:
### group convos by company

# making a dict where item is (company: clll)
# clll: cll_list i.e. list of conversations ("cll"s)
# (defaultdict comes from the notebook's import cell)

comp_clll_dict = defaultdict(list)

# append each conversation to the list kept under its company
for cll, comp in cll_comp_tups:
    comp_clll_dict[comp].append(cll)

comp_clll_dict

defaultdict(list,
            {'AirAsiaSupport': [[348,
               347,
               343,
               334,
               344,
               375,
               374,
               332,
               333,
               335,
               336,
               337,
               338,
               339,
               340,
               341,
               342],
              [371, 369, 370, 368],
              [367, 366, 364, 362, 363, 365]],
             'Delta': [[611, 609, 610],
              [801, 799, 796, 794, 795, 797, 798],
              [814, 813, 811, 810, 809, 807, 808, 804, 805, 806],
              [817,
               815,
               816,
               818,
               819,
               820,
               821,
               833,
               832,
               847,
               823,
               822,
               829,
               834,
               828,
               835,
               827,
               826,
               825,
   

In [13]:
### tool to save comp_clll_dict to csv

def save_locs_dict_to_csv(comp_clll_dict = None, path = "data/comp_cll_df.csv"):
    """Write a {company: list of conversations} mapping to a two-column csv.

    Each company becomes one row: ``company`` holds the key and ``clll_str``
    holds its conversations encoded as text -- ids of one conversation are
    joined with "," and conversations are joined with " ".

    Parameters
    ----------
    comp_clll_dict : dict, optional
        Mapping of company to a list of conversations, each conversation
        being a list of ids. When omitted, the notebook-level
        ``comp_clll_dict`` is looked up at call time instead of being frozen
        when this function is defined.
    path : str, optional
        Destination csv file; defaults to "data/comp_cll_df.csv".

    Returns
    -------
    str
        The path that was written.
    """
    if comp_clll_dict is None:
        # the parameter shadows the global of the same name, hence globals()
        comp_clll_dict = globals()["comp_clll_dict"]

    # [[1, 2], [3, 4, 5]] to "1,2 3,4,5"
    def list_list_to_str(ll):
        return " ".join(",".join(map(str, l)) for l in ll)

    # make df: one row per company
    comp_cll_df = pd.DataFrame({
        "company": list(comp_clll_dict.keys()),
        "clll_str": [list_list_to_str(clll)
                     for clll in comp_clll_dict.values()]
        })

    comp_cll_df.to_csv(path, index = False)

    return path

print("comp_cll_df to csv:")
# save, then read the file back to show what was written
saved_path = save_locs_dict_to_csv()
print(pd.read_csv(saved_path).head())


comp_cll_df to csv:
          company                                           clll_str
0  AirAsiaSupport  348,347,343,334,344,375,374,332,333,335,336,33...
1           Delta  611,609,610 801,799,796,794,795,797,798 814,81...
2    AppleSupport  752,751,750 747,746,745,744,740,738,739,741,74...
3       AdobeCare  293,292,291 290,289,288,287,285,284 283,282 27...
4       McDonalds            614,612,613 409,410,408,487,599,598,407


In [14]:
### ENTRY POINT. tool to load comp_clll_dict from csv

def load_locs_dict_from_csv(path = "data/comp_cll_df.csv"):
    """Read a csv with ``company`` and ``clll_str`` columns back into a dict.

    ``clll_str`` is expected to hold conversations separated by whitespace,
    each conversation being comma-separated integer ids ("1,2 3,4,5").

    Parameters
    ----------
    path : str, optional
        csv file to read; defaults to "data/comp_cll_df.csv".

    Returns
    -------
    dict
        Mapping of company to a list of conversations, each conversation a
        list of ``int`` ids. An empty ``clll_str`` cell gives an empty list.
    """
    # "1,2 3,4,5" to [[1, 2], [3, 4, 5]]
    def str_to_ll(s):
        if not isinstance(s, str) or not s.strip():
            # empty cell (read back as NaN) -> no conversations
            return []
        # ids are converted to int so they can be used as index labels again
        return [[int(loc) for loc in convo.split(",")]
                for convo in s.split()]

    # read csv; force clll_str to text so that a column consisting only of
    # lone ids is not parsed as numbers
    table = pd.read_csv(path, dtype = {"clll_str": str})

    # turn to dict
    return dict(zip(table["company"], map(str_to_ll, table["clll_str"])))

# rebuild the company -> conversations mapping from the saved csv
comp_clll_dict = load_locs_dict_from_csv()

print("comp_clll_dict_len:")
n_companies = len(comp_clll_dict)
pprint(n_companies)

comp_clll_dict_len:
43
