In [1]:
# create a conversations (convo) table where each row represents
# a convo and each column some metric like average customer sentiment.

from compconvlistdict_tofrom_csv import load_comp_convlist_from_csv

# comp_d is used below as a mapping: company name -> list of conversations,
# where each conversation is a list of tweet ids.
comp_d = load_comp_convlist_from_csv()

In [2]:
# Peek at the first company and its first conversation as a sanity check.
# next(iter(...)) takes the first item directly instead of building a
# throwaway list of every (company, conversations) pair.
comp, conv_list = next(iter(comp_d.items()))

conv_locs = conv_list[0]
comp, conv_locs

('Ask_WellsFargo', [1757947, 1757946, 1757945])

In [3]:
#SLOW! load tweets

import pandas as pd

# Read the sentiment-annotated tweets, using the tweet_id column as the
# index so conversations can be looked up by tweet id with .loc.
tweets_df = pd.read_csv("data/tweets_w_senti.csv", index_col = "tweet_id")

In [4]:
# Display the tweets of the sample conversation selected above (by tweet id).
tweets_df.loc[conv_locs]

Unnamed: 0_level_0,time,author_id,text,response_tweet_id,in_response_to_tweet_id,inbound,sentiment
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1757947,2010-02-16 15:49:47,529256,KTAR.com - Foreclosures still big problem in V...,1757946.0,,True,-0.5
1757946,2017-10-17 16:01:31,529255,@529256 @41702 Wells Fargo customer care is cu...,1757945.0,1757947.0,True,-0.3
1757945,2017-10-17 20:41:46,Ask_WellsFargo,"@529255 I'm sorry you feel this way about us, ...",,1757946.0,False,-0.5


In [5]:
# function that takes a (conv_locs, company) tuple
# and returns the 8 cols associated with that conversation:
#     conv_id (num), company (str), tweet_ids (str), time (timestamp str),
#     avg_cust_sent (num), cust_sent_improvement (num), comp_resp_timehrs (num hours)
#     issue_was_resolved (num bool)

from collections import defaultdict
import numpy as np
from pprint import pprint

def get_conv_metrics(conv_company_tup, tweets_df=None):
    """Compute the per-conversation metrics for one (conv_locs, company) pair.

    Parameters
    ----------
    conv_company_tup : tuple
        ``(conv_locs, company)`` where ``conv_locs`` is the list of tweet ids
        making up the conversation (looked up in ``tweets_df`` with ``.loc``)
        and ``company`` is the company's ``author_id`` string.
    tweets_df : pandas.DataFrame, optional
        Tweets indexed by tweet id with at least the columns ``author_id``,
        ``time`` and ``sentiment``. When omitted, the notebook-level
        ``tweets_df`` is looked up at call time, so re-running the loading
        cell is picked up without re-defining this function.

    Returns
    -------
    dict or None
        ``None`` when the conversation is empty or was initiated by the
        company. Otherwise a dict with the keys ``conv_id``, ``company``,
        ``tweet_ids``, ``time``, ``avg_cust_sent``, ``cust_sent_improvement``,
        ``issue_was_resolved`` and ``comp_resp_timehrs``.

    Raises
    ------
    NameError
        If ``tweets_df`` is not passed and no notebook-level ``tweets_df``
        exists yet.
    """
    if tweets_df is None:
        # Late lookup instead of a definition-time default: a default of
        # `tweets_df = tweets_df` would freeze whatever frame existed when
        # this cell ran.
        try:
            tweets_df = globals()["tweets_df"]
        except KeyError:
            raise NameError(
                "tweets_df is not defined; run the tweet-loading cell "
                "or pass tweets_df explicitly"
            ) from None

    conv_locs, company = conv_company_tup

    # nothing to measure for a conversation without tweets
    if len(conv_locs) == 0:
        return None

    conv_df = tweets_df.loc[conv_locs]
    conv_aut_ids = list(conv_df["author_id"].values)

    # don't process if conv initiated by company
    if conv_aut_ids[0] == company:
        return None

    res = {}

    # some low hanging fruit
    res["conv_id"] = conv_locs[0]
    res["company"] = company
    res["tweet_ids"] = ",".join(str(loc) for loc in conv_locs)
    res["time"] = conv_df["time"].iloc[0]

    ## cust related ones

    # positions of the tweets not written by the company; position 0 is
    # always included because the initiator is not the company here
    cust_ilocs = [
        ix for ix, aut_id in enumerate(conv_aut_ids)
        if aut_id != company
    ]
    cust_df = conv_df.iloc[cust_ilocs]

    # avg_cust_sent
    sents = cust_df["sentiment"].values
    res["avg_cust_sent"] = np.mean(sents)

    # cust_sent_improvement: last minus first customer sentiment; when the
    # customer sent a single tweet, that tweet's sentiment is the change
    if len(sents) == 1:
        imp = sents[-1]
    else:
        imp = sents[-1] - sents[0]
    res["cust_sent_improvement"] = imp

    # issue_was_resolved if last customer tweet was positive
    res["issue_was_resolved"] = int(sents[-1] > 0)

    # company response time: hours between the first customer tweet and the
    # first company tweet; stays at infinity when the company never replied
    comp_resp_time = np.inf
    if company in conv_aut_ids:
        first_resp_pos = conv_aut_ids.index(company)
        comp_resp_timestamp = pd.to_datetime(
            conv_df["time"].iloc[first_resp_pos])
        cust_init_timestamp = pd.to_datetime(cust_df["time"].iloc[0])
        comp_resp_time = (
            comp_resp_timestamp - cust_init_timestamp
        ) / pd.Timedelta("1 hour")
    res["comp_resp_timehrs"] = comp_resp_time

    return res

# Try the function on one sample conversation (position 50 in conv_list).
conv_locs = conv_list[50]
get_conv_metrics((conv_locs, comp))

defaultdict(None,
            {'conv_id': 242309,
             'company': 'Ask_WellsFargo',
             'tweet_ids': '242309,242308,242307',
             'time': '2017-09-27 19:16:35',
             'avg_cust_sent': -0.3499999940395355,
             'cust_sent_improvement': -0.699999988079071,
             'issue_was_resolved': 0,
             'comp_resp_timehrs': 186.55694444444444})

In [6]:
# get conv, comp tups to feed to the metric getting function above

# Built with a comprehension so the iteration variables stay local: a
# top-level `for comp, ...` loop would overwrite the notebook-level `comp`
# that the sample cells rely on.
conv_comp_tups = [
    (conv, company)
    for company, convlist in comp_d.items()
    for conv in convlist
]

print("len(conv_comp_tups) =", len(conv_comp_tups))
conv_comp_tups[:2]

len(conv_comp_tups) = 794335


[([1757947, 1757946, 1757945], 'Ask_WellsFargo'),
 ([2984140, 2984139, 2984138], 'Ask_WellsFargo')]

In [None]:
# SLOW! getting list of rows for conversations_df

# get_conv_metrics returns None for conversations it skips; keep only
# the conversations that produced a metrics record.
conv_metric_ds = [
    metrics
    for metrics in map(get_conv_metrics, conv_comp_tups)
    if metrics is not None
]

convs_df = pd.DataFrame.from_records(conv_metric_ds)[[
    "conv_id", "company", "tweet_ids", "time", "avg_cust_sent",
    "cust_sent_improvement", "comp_resp_timehrs", "issue_was_resolved"
    ]]
# Saved under data/ because the following cell reads data/conversations.csv;
# writing to the working directory would leave that cell reading a stale file.
convs_df.to_csv("data/conversations.csv", index = False)
convs_df

In [10]:
# Preview the first 5 rows of the saved conversations table.
pd.read_csv("data/conversations.csv", nrows = 5)

Unnamed: 0,conv_id,company,tweet_ids,time,avg_cust_sent,cust_sent_improvement,comp_resp_timehrs,issue_was_resolved
0,1757947,Ask_WellsFargo,175794717579461757945,2010-02-16 15:49:47,-0.4,0.2,67204.866389,0
1,2984140,Ask_WellsFargo,298414029841392984138,2011-10-17 18:00:43,-0.45,-0.9,53423.852222,0
2,255331,Ask_WellsFargo,"255331,255335,1769251,255339,255330,677776,677...",2012-09-25 20:04:10,-0.553846,-1.2,24.86,0
3,595098,Ask_WellsFargo,595098595096595097595099,2016-02-24 18:32:12,-0.3,0.6,1.870833,0
4,2153951,Ask_WellsFargo,21539512153949215394721539482153950,2016-03-22 23:19:04,-0.466667,-0.2,7032.025833,0
