In [1]:
import pandas as pd 
import os 
from dotenv import load_dotenv
load_dotenv()

os.chdir("..")


In [2]:
import re
# universal functions
def cleanText (text):
    """ 
    process: 
    - lowercase
    - remove trailing spaces
    - remove special characters and punction
    """
    text = text.lower()
    text = text.strip()
    text_ = re.sub(r"[^a-zA-Z0-9 ]+","", text)
    
    return text_

In [3]:
# file for category mapping
cat_file = "./data/Rocks_N_Ropes_Chat_2022-06-21v2.csv"
chat_file = "./data/RnR Chat 2021-11-11.csv"


In [4]:
cat_df = pd.read_csv(cat_file,sep=",")
chat_df = pd.read_csv(chat_file,sep=",")

display(cat_df.head())
display(chat_df.head())

Unnamed: 0,Id,Category,Topic,Keyword
0,0,Emotion,Mad,I'm _ mad
1,1,Emotion,Mad,sucks
2,2,Emotion,Mad,I hate
3,3,Emotion,Mad,not _ happy
4,4,Emotion,Mad,pisses me off


Unnamed: 0,CustomerID,contact_type,site_down,sale_accept,site_down_sentiment,product_name,product_family,order_identifier,carrier,customer_state,...,c_address_response,a_shipping_prompt_2,c_shipping_response_2,a_order_confirmation,a_return_instructions,c_customer_email,a_label_notice,a_agent_closing_start,c_customer_closing_response,a_agent_closing_wrap
0,1966,Sales Order,0,1,9,Fiction jacket,clothing,email,FedEx,Ohio,...,7 Mitchell Parkway Columbus Ohio 43231,Use the same address for mailing?,"Yes, thanks",You will receive a confirmation email in the n...,9,9,9,Is that all you needed today?,"No, and I won't be shopping here again","Well, thanks for contacting R and R"
1,1403,Sales Order,0,1,9,UltraSham heavy jacket,clothing,order number,USPS,North Carolina,...,491 Thierer Alley Charlotte North Carolina 28210,Use the same address for shipping?,Yes,You will receive a confirmation email in the n...,9,9,9,Is there anything else I can do to assist you?,"No, thanks",Thank you for contacting Rocks and Ropes
2,2402,Return,0,0,9,Mountbank vest,clothing,order number,UPS,Florida,...,9,9,9,9,"To process your return, I'm going to send you ...",rbiggins97@ocn.ne.jp,Thank you for that. You should receive the ret...,Do you need any other assistance?,"No, thanks for doing such a great job","Well, thank you for contacting RnR"
3,1209,Return,0,1,9,Fiction jacket,clothing,order number,UPS,California,...,9,9,9,9,"To process your return, I'm going to send you ...",msumsion7c@liveinternet.ru,Thank you for that. You should receive the ret...,Do you need any other assistance?,"Yes, but you can't fix it",9
4,2447,Sales Order,0,1,9,Charlatan sleeping bag,sleeping bag,order number,UPS,Indiana,...,46 Melrose Junction Indianapolis Indiana 46266,Would you like us to ship to that address too?,They are,You will receive a confirmation email in the n...,9,9,9,Can I help with anything else?,"No, I appreciated your assistance",Thanks for contacting Rocks and Ropes


In [5]:
# get relevant columns to rebuild the transcript 
"""
any column with a_ or c_ will be the column with words if not 9

assuming the column orders are serial in nature in building - then the transcript should be in order - tho it will not mater for the purpose of identifying the key words
"""
chat_cols = chat_df.columns
agent_trans_cols = [col for col in chat_cols if "a_" in col]
cust_trans_cols = [col for col in chat_cols if "c_" in col]

# build the full text ? from the row ?  kind of slow but easier process to write

def buildTranscripts(df,cols):
    full_array = []
    for i, row in df.iterrows():
        text = []
        for col in cols:
            if row[col] !="9": 
                text.append(row[col])
            else:
                continue
        
        full_array.append(" ".join(text)) # join it back
    return full_array

agent_array = buildTranscripts(chat_df,agent_trans_cols)
customer_array =  buildTranscripts(chat_df,cust_trans_cols)
chat_df["agent_transcripts"] = agent_array # processed in serial order should align
chat_df["customer_transcripts"] = customer_array # processed in serial order should align
chat_df.head()

# save off so jeff doesn't have to reprocess this in the future
chat_df.to_csv("./data/rnr_chat_w_transcript.csv", sep=",")


In [6]:
cat_df.head()

Unnamed: 0,Id,Category,Topic,Keyword
0,0,Emotion,Mad,I'm _ mad
1,1,Emotion,Mad,sucks
2,2,Emotion,Mad,I hate
3,3,Emotion,Mad,not _ happy
4,4,Emotion,Mad,pisses me off


In [7]:
# clean category df

cat_df["Keyword_clean"] = cat_df["Keyword"].apply(lambda x: x.strip().lower()) # don't remove special as it will be indicator later
cat_df["keyword_list"] = cat_df["Keyword_clean"].apply(lambda x: list(t.strip() for t in x.split("_")) if "_" in x else [])

cat_df.head()

Unnamed: 0,Id,Category,Topic,Keyword,Keyword_clean,keyword_list
0,0,Emotion,Mad,I'm _ mad,i'm _ mad,"[i'm, mad]"
1,1,Emotion,Mad,sucks,sucks,[]
2,2,Emotion,Mad,I hate,i hate,[]
3,3,Emotion,Mad,not _ happy,not _ happy,"[not, happy]"
4,4,Emotion,Mad,pisses me off,pisses me off,[]


# Assumptions

exact match except some special characters are ignored ( . , !, etc.)  2. More than 1 Topic can be applied to a conversation if multiple Topics result in keyword hits. 3. The results are not applied based on Frequency. 4) the scan is performed across the entire conversation (transcript) and the results are reported at the conversation level. The app does highlight all the instances of the keyword / topic hits within the conversation when reviewing the transcript


# process flow 
- build transcript based on columns 
- for each transcript
    - preprocess 
    - tag topic if initial key word match is found w

- append transcript 
- create tag table 
    - transcript Id 
    - chat_id 
    - tag 
    (eessentially for sake of easy processing explode the df on tag array so can process in excel more easily for tying metrics back to CXI)
    


In [92]:
from dask.distributed import Client, LocalCluster

cluster = LocalCluster("127.0.0.1:8786", n_workers=8, threads_per_worker=8)
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 54906 instead


In [97]:
# build tag array for each transcript 
import re
import dask.dataframe as dd
from dask.dataframe import utils
import itertools
import numpy as np

"""
category process is taking awhile so will parallelize this
"""

# if this takes awhile - write this for parallel processing
def buildTags(id,transcript,mapping_df):
    id_array = [] # collect the - will map back later
    def topicMatch(trans, t_row):
        if "_" in t_row.Keyword: 
            counter = 0
            for token in t_row.keyword_list:
                if token in trans.split():
                    counter = counter + 1
            
            if counter >= len(t_row.Keyword):
                id_array.append(t_row.Id)
        else:
            if re.search(r'\b{}\b'.format(t_row.Keyword),trans):
                id_array.append(t_row.Id)

        return id_array
    
    topic_id_array = mapping_df.apply(lambda x: topicMatch(transcript,x),axis=1) # apply across the category map
    topic_array = list(set(itertools.chain.from_iterable(topic_id_array))) # set to consolidated list 

    if len(topic_array) == 0:
        return None
    else:
        id_ = [id] * len(topic_array)
        df_ = pd.DataFrame(data=zip(id_,topic_array),columns=["chat_id","topic_id_array"])
        return df_


chat_df["customer_transcripts_clean"] = chat_df.customer_transcripts.apply(lambda x: cleanText(x))

# create distributable dataframe 
chat_dd = dd.from_pandas(chat_df.head(10),npartitions=8)

df_list = []

meta = [("chat_id",int),("topic_id_array",int)]
chat_dd_ = chat_dd.map_partitions(lambda df: df.apply(lambda x: df_list.append(buildTags(x.chat_number,x.customer_transcripts_clean,cat_df)),axis=1),meta=meta)
chat_dd_.compute(scheduler="processes")
print('completed customer transcripts')
#chat_dd.persist()
#chat_df[["chat_number","customer_transcripts_clean"]].head(1000).apply(lambda x: df_list.append(buildTags(x.chat_number,x.customer_transcripts_clean,cat_df)),axis=1)



TypeError: can only concatenate str (not "traceback") to str

In [91]:
blah = pd.concat(df_list)
blah.shape

(3886, 2)

In [94]:
client.close()