In [1]:
## importing modules

%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from shutil import rmtree
from glob import glob
from fnmatch import fnmatch
from dill import load, dump
from pprint import pprint


In [2]:
## read in data

# nrows=None reads the whole file; pass a small integer for a quick trial run
df_raw = pd.read_csv('twcs.csv', nrows=None)

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (doing so emitted a stray "None" line after the summary)
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811774 entries, 0 to 2811773
Data columns (total 7 columns):
tweet_id                   int64
author_id                  object
inbound                    bool
created_at                 object
text                       object
response_tweet_id          object
in_response_to_tweet_id    float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 131.4+ MB
None


Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [116]:
## clean data a bit and SAVE

# parse 'created_at' into a datetime column named 'time', keep only the
# columns of interest (in display order), sort chronologically and index
# the frame by tweet_id
df = (
    df_raw
    .assign(time = pd.to_datetime(df_raw['created_at'], infer_datetime_format=True))
    [['time', 'author_id', 'text', 'tweet_id', 'response_tweet_id',
      'in_response_to_tweet_id', 'inbound']]
    .sort_values(by = "time")
    .set_index('tweet_id')
)

# collapse every run of whitespace (including newlines) into a single space
# to avoid problems with saving as csv
df["text"] = df["text"].str.replace(r"\s+", " ", regex = True)

# write the cleaned frame to disk
df.to_csv("twcs_clean.csv")

In [None]:
## destroy df_raw
# rebind the name to None so this notebook no longer holds on to the raw
# frame; its memory can be reclaimed once nothing else references it
df_raw = None

In [3]:
### ENTRY POINT. load clean csv

# read the cleaned tweets back, indexed by tweet_id, with 'time' as datetimes
df = pd.read_csv('twcs_clean.csv').set_index("tweet_id")
df["time"] = pd.to_datetime(df["time"], infer_datetime_format=True)

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (doing so emitted a stray "None" line after the summary)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2811774 entries, 790341 to 589938
Data columns (total 6 columns):
time                       datetime64[ns]
author_id                  object
text                       object
response_tweet_id          object
in_response_to_tweet_id    float64
inbound                    bool
dtypes: bool(1), datetime64[ns](1), float64(1), object(3)
memory usage: 131.4+ MB
None


Unnamed: 0_level_0,time,author_id,text,response_tweet_id,in_response_to_tweet_id,inbound
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
790341,2008-05-08 20:13:59,SouthwestAir,@34622 Have FUN at the lecture tonight! Tell P...,790326,790342.0,False
790326,2008-05-08 21:04:16,308466,@SouthwestAir Can you pls enter the HI market ...,"790327,790328,790325,790329,790330,790331,7903...",790341.0,True
1757947,2010-02-16 15:49:47,529256,KTAR.com - Foreclosures still big problem in V...,1757946,,True
2291020,2010-03-31 15:24:29,665443,@665445 Do you know if Carl's Jr serves lunch ...,2291018,,True
2291018,2010-03-31 16:53:27,CarlsJr,@665443 We serve lunch all day!,2291019,2291020.0,False


In [4]:
## make tweet chains for individual conversations

# save start_locs of convos
# a convo starts when 'in_response_to' is nan
def is_start_of_convo(rowtuple):
    """Tell whether a row tuple is the first tweet of a conversation.

    A row whose "in_response_to_tweet_id" is a finite number is a reply and
    therefore not a conversation starter; any non-finite value (e.g. NaN)
    marks the row as a starter.

    Returns a plain bool.
    """
    replies_to = rowtuple.in_response_to_tweet_id
    return not np.isfinite(replies_to)
    
def get_start_locs(df = df):
    """Collect the index labels of every conversation-starting row.

    Iterates over `df` row by row and keeps the index label of each row
    that `is_start_of_convo` accepts, preserving the frame's row order.
    Prints a progress line first because the scan visits every row.
    """
    print("Computing convo_start_locs ...")

    return [
        row.Index
        for row in df.itertuples()
        if is_start_of_convo(row)
    ]
    
# index labels of the first tweet of every conversation in df
convo_start_locs = get_start_locs()

print("Total convos = {}".format(len(convo_start_locs)))
# print('convo_start_locs =', convo_start_locs)

# get the locs (and company) for a conversation given a start_loc
def get_convo_locs_n_comp(convo_start_loc, df = df):
    """Collect the tweets of one conversation and the company involved.

    Starting from `convo_start_loc`, follows the comma-separated ids stored
    in 'response_tweet_id' breadth-first to gather every tweet reachable
    from the starter, ignores ids that are not present in `df.index`, and
    orders what remains by 'time'.

    Parameters
    ----------
    convo_start_loc : index label of the conversation's first tweet.
    df : frame indexed by tweet id that has 'response_tweet_id', 'time'
        and 'author_id' columns.

    Returns
    -------
    (convo_locs, company)
        convo_locs : list of index labels, ordered by 'time'. Each tweet
            appears once even if it is reachable along several paths.
        company : the first 'author_id' (in time order) that cannot be
            converted to an integer -- taken to be the company's handle --
            or np.nan when every author id is numeric.
    """

    # Iterative breadth-first walk over the reply links. A `seen` set makes
    # the walk terminate on cyclic links and keeps each tweet only once; not
    # recursing avoids Python's recursion limit on very long reply chains.
    seen = {convo_start_loc}
    reachable_locs = [convo_start_loc]
    frontier = [convo_start_loc]

    while frontier:
        next_frontier = []

        for cur_loc in frontier:

            # ids that point outside the dataset have no row to expand
            if cur_loc not in df.index:
                continue

            resp_tw_ids_entry = df.loc[cur_loc, 'response_tweet_id']

            # a missing entry is NaN (a float): this tweet has no responses
            if not isinstance(resp_tw_ids_entry, str):
                continue

            # split something like "45,65" into the locs 45 and 65
            for next_loc in map(int, resp_tw_ids_entry.split(",")):
                if next_loc not in seen:
                    seen.add(next_loc)
                    reachable_locs.append(next_loc)
                    next_frontier.append(next_loc)

        frontier = next_frontier

    # take out non-existent locs
    convo_locs = [loc for loc in reachable_locs if loc in df.index]

    # make df and sort by time (non-mutating: never sort a selection in place)
    convo_df = df.loc[convo_locs].sort_values(by = 'time')
    convo_locs = list(convo_df.index)

    # get company: the first author whose id is not an integer
    company = np.nan
    for author_id in convo_df['author_id']:

        try:
            # if int, keep hunting
            int(author_id)

        except (ValueError, TypeError, OverflowError):
            # if not int, save (only the errors int() raises are caught, so
            # KeyboardInterrupt and genuine bugs are no longer swallowed)
            company = author_id
            break

    return convo_locs, company

# show one conversation as a worked example
start_loc = 18
convo_locs, company = get_convo_locs_n_comp(start_loc)

# format() instead of "+" so the header still prints when no company was
# found: company is then np.nan, which cannot be concatenated to a str
print("\nExample convo for \"{}\":".format(company))
# print(df.loc[convo_locs][["author_id", "time", "text"]])

# temporary
# (rows and column selected in a single .loc call rather than chained indexing)
print(df.loc[convo_locs, ["text"]].values)

Computing convo_start_locs ...
Total convos = 794335

Example convo for "sprintcare":
[['@115714 y’all lie about your “great” connection. 5 bars LTE, still won’t load something. Smh.']
 ["@115713 H there! We'd definitely like to work with you on this, how long have you been experiencing this issue? -AA"]
 ['@sprintcare Since I signed up with you....Since day 1']
 ["@115713 We understand your concerns and we'd like for you to please send us a Direct Message, so that we can further assist you. -AA"]
 ['@sprintcare You gonna magically change your connectivity for me and my whole family ? 🤥 💯']
 ['@115713 This is saddening to hear. Please shoot us a DM, so that we can look into this for you. -KC']
 ["@115713 Hi, my name is Shantel, I'm a resolution supervisor here with Sprint. Your issues was brought to my attention. 1/2 -ResolutionSup SR"]
 ["@115713 I would really like to work with you to have this resolved. Kindly send us a DM. I'm here for you! -ResolutionSup SR"]]


In [121]:
## SLOW! get convo_loc_lists for all start_locs

# cll: convo_loc_list
# builds one (cll, company) tuple per conversation starter
cll_comp_tups = [
    get_convo_locs_n_comp(start)
    for start in convo_start_locs
]
pprint(cll_comp_tups[:5])
print(" ...")

[([1757947, 1757946, 1757945], 'Ask_WellsFargo'),
 ([2291020, 2291018, 2291019], 'CarlsJr'),
 ([2651976, 2651975, 2651974, 2651973], 'JetBlue'),
 ([359955,
   359954,
   359953,
   359960,
   796821,
   359981,
   2163309,
   359983,
   2163310,
   2163311,
   2179300,
   2163312,
   359984,
   2944620,
   2944621,
   2944622,
   359957,
   492818],
  'AskPlayStation'),
 ([1985354, 1985353, 1985352], 'Ask_Spectrum')]
 ...


In [122]:
### group convos by company

from collections import defaultdict

# comp_clll_dict maps company -> clll
# clll: cll_list, i.e. the list of that company's conversations ("cll"s),
# each cll being the list of tweet locs of one conversation
comp_clll_dict = defaultdict(list)

for cll, comp in cll_comp_tups:
    # a company seen for the first time starts from an empty list
    comp_clll_dict[comp] += [cll]

# comp_clll_dict

In [123]:
### tool to save comp_clll_dict to csv

def save_locs_dict_to_csv(comp_clll_dict = comp_clll_dict, path = "comp_cll_df.csv"):
    """Write a {company: clll} dict to a two-column csv and return its path.

    Each company's list of conversations is flattened into one string: the
    locs of a conversation are joined by "," and the conversations by " ",
    e.g. [[1, 2], [3, 4, 5]] -> "1,2 3,4,5".

    Parameters
    ----------
    comp_clll_dict : mapping of company -> list of conversations, where a
        conversation is a list of tweet locs.
    path : destination csv file. Defaults to "comp_cll_df.csv", which is
        also the default path of load_locs_dict_from_csv.

    Returns
    -------
    The path that was written, so the call can be fed straight to a reader.
    """

    # [[1, 2], [3, 4, 5]] to "1,2 3,4,5"
    def list_list_to_str(ll):
        return " ".join(",".join(map(str, l)) for l in ll)

    # one row per company: its name and its flattened conversations
    comp_cll_df = pd.DataFrame({
        "company": list(comp_clll_dict.keys()),
        "clll_str": [list_list_to_str(clll) for clll in comp_clll_dict.values()]
        })

    comp_cll_df.to_csv(path, index = False)

    return path

# round-trip check: write the csv, then read it straight back and preview
# its first rows
print("comp_cll_df to csv:")
print(pd.read_csv(save_locs_dict_to_csv()).head())


comp_cll_df.csv:
          company                                           clll_str
0  Ask_WellsFargo  1757947,1757946,1757945 2984140,2984139,298413...
1         CarlsJr  2291020,2291018,2291019 2206745,2206744,220674...
2         JetBlue  2651976,2651975,2651974,2651973 2631045,263104...
3  AskPlayStation  359955,359954,359953,359960,796821,359981,2163...
4    Ask_Spectrum  1985354,1985353,1985352 151800,151799,151798 1...


In [127]:
### ENTRY POINT. tool to load comp_clll_dict from csv

def load_locs_dict_from_csv(path = "comp_cll_df.csv"):
    """Rebuild the {company: clll} dict from the csv at `path`.

    The file must have a "company" column and a "clll_str" column in which
    locs are joined by "," within a conversation and conversations are
    separated by whitespace, e.g. "1,2 3,4,5".

    Parameters
    ----------
    path : csv file to read.

    Returns
    -------
    dict mapping each company to its list of conversations, every
    conversation being a list of int locs. A company whose "clll_str" cell
    is empty maps to an empty list.
    """

    # "1,2 3,4,5" to [[1, 2], [3, 4, 5]]
    def str_to_ll(s):
        # an empty cell is read back as NaN (a float), not as a str
        if not isinstance(s, str):
            return []
        # locs are converted back to int so they can be used as index labels
        return [
            [int(loc) for loc in convo.split(",")]
            for convo in s.split()
            ]

    # read csv; dtype=str keeps "clll_str" textual even when every cell holds
    # a single id (pandas would otherwise infer a numeric column)
    comp_cll_df = pd.read_csv(path, dtype = {"clll_str": str})

    # turn to dict
    return dict(zip(
        comp_cll_df["company"].values,
        map(str_to_ll, comp_cll_df["clll_str"])
        ))

# reload the per-company conversation lists from disk and report how many
# companies were read
comp_clll_dict = load_locs_dict_from_csv()
print("comp_clll_dict_len:")
print(len(comp_clll_dict))

comp_clll_dict_len:
108
