### Create Multi-Label Dataset

In [1]:
from pprint import pprint
import os
import math
import pandas as pd

In [11]:
# Intents attributed to the agent side of the call.
AGENT_LABELS = [
    "Agent_CallPurpose",
    "Agent_ChildEducation",
    "Agent_CloseDeal",
    "Agent_CompanyIntroduction",
    "Agent_EMI",
    "Agent_Event",
    "Agent_ExplainDocsRequired",
    "Agent_ExplainMoneyTranferProcess",
    "Agent_FamilyShopping",
    "Agent_GoodClose",
    "Agent_Greetings",
    "Agent_HouseFixing",
    "Agent_InformCallBack",
    "Agent_InsuranceFee",
    "Agent_InterestRate",
    "Agent_ListeningSkill",
    "Agent_LoanAmount",
    "Agent_LoanDuration",
    "Agent_MentioningDisclaimer",
    "Agent_OH_AmountLess",
    "Agent_OH_BadExperience",
    "Agent_OH_CustAskForCallingBack",
    "Agent_OH_DiscussWithFamily",
    "Agent_OH_HighInterest",
    "Agent_OH_HighLoanDuration",
    "Agent_OH_NotRequireInsurance",
    "Agent_OH_NotRequireLoan",
    "Agent_RunBusiness",
    "Agent_SelfIntroduction",
    "Agent_Summarize",
    "Agent_Thanks",
    "Agent_Vacation",
    "Agent_VerifyCustomerName",
]

# Intents attributed to the client side of the call.
CLIENT_LABELS = [
    "Client_AgreeToTakeLoan",
    "Client_AmountLess",
    "Client_AskForCallingBack",
    "Client_BadExperience",
    "Client_Confirmation",
    "Client_DiscussWithFamily",
    "Client_HighInterest",
    "Client_HighLoanDuration",
    "Client_NotRequireLoan",
]

# Full set of accepted intent labels: agent labels first, then client labels.
PROPER_LABELS = AGENT_LABELS + CLIENT_LABELS

print(f"Number of labels: {len(PROPER_LABELS)}")

Number of labels: 42


In [9]:
# Load the annotated call transcripts from the "100 calls" worksheet.
df = pd.read_excel(
    "./data/telesale/100_calls.xlsx",
    sheet_name="100 calls",
    engine="openpyxl",
)

In [10]:
# Preview the first five rows of the raw sheet.
df.head(n=5)

Unnamed: 0,fileName,start,end,speaker,sentiment,text,Intent,Unnamed: 7,Unnamed: 8
0,VUQC83EOO969H66VRLO5RMEPC8089Q00_2021-07-16_11...,0.81,1.27,0.0,3.0,a lô,,,
1,,1.86,2.76,0.0,3.0,anh đơn hả anh,Agent_VerifyCustomerName,,
2,,4.62,7.86,0.0,3.0,ờ anh em là hương nè em gọi đến cho anh từ bên...,Agent_Self Introduction,Agent_CompanyIntroduction,
3,,8.04,34.32,0.0,3.0,a lô cho em hỏi xíu ha là vợ anh chị lép á còn...,Agent_VerifyCustomerName,,
4,,34.68,35.07,0.0,3.0,anh anh,,,


In [15]:
""" Get samples (text-based utterance)
"""
# Keep only the utterance text and expose it under the column name "samples".
# The rename is chained directly onto the selection: the previous code called
# `texts.rename(...)`, but no variable named `texts` exists, so the cell
# raised NameError on a fresh kernel.
texts_df = df[["text"]].rename(columns={"text": "samples"})
texts_df.head()

Unnamed: 0,samples
0,a lô
1,anh đơn hả anh
2,ờ anh em là hương nè em gọi đến cho anh từ bên...
3,a lô cho em hỏi xíu ha là vợ anh chị lép á còn...
4,anh anh


In [105]:
""" Get labels
"""
# Every column from position 6 onward holds an intent annotation; give those
# columns uniform names and mark empty cells with the sentinel -1.
labels_df = (
    df.iloc[:, 6:]
    .rename(
        columns={
            "Intent": "label 1",
            "Unnamed: 7": "label 2",
            "Unnamed: 8": "label 3",
        }
    )
    .fillna(-1)
)
labels_df.head()

Unnamed: 0,label 1,label 2,label 3
0,-1,-1,-1
1,Agent_VerifyCustomerName,-1,-1
2,Agent_Self Introduction,Agent_CompanyIntroduction,-1
3,Agent_VerifyCustomerName,-1,-1
4,-1,-1,-1


In [127]:
""" Remove OOD uterrances from original data
"""
# Pair every utterance with the list of values from its label columns.
samples = texts_df["samples"].tolist()
labels = [row.tolist() for _, row in labels_df.iterrows()]

print(f"# samples: {len(samples)} (w/ OOD text)")

# Keep a row only when its text is a string and its first label slot names an
# Agent/Client intent; the -1 sentinels are dropped from the kept label lists.
kept_samples = []
kept_labels = []
for utterance, slots in zip(samples, labels):
    primary = slots[0]
    if primary == -1 or not isinstance(utterance, str):
        continue
    if not ("Agent" in primary or "Client" in primary):
        continue
    kept_samples.append(utterance)
    kept_labels.append([slot for slot in slots if slot != -1])

samples, labels = kept_samples, kept_labels
assert len(samples) == len(labels)

print(f"# samples: {len(samples)} (w/o OOD text)")

# samples: 9368 (w/ OOD text)
# samples: 1117 (w/o OOD text)


In [149]:
""" Correct labels
"""
# Misspelled annotation labels mapped to the names used in PROPER_LABELS.
label_fixes = {
    "Agent_Self Introduction": "Agent_SelfIntroduction",
    "Agent_ThanksClosing": "Agent_Thanks",
}

for label in labels:
    # Slice assignment rewrites each per-sample list in place.
    label[:] = [label_fixes.get(l, l) for l in label]

In [150]:
""" Count number of multilabel samples in dataset
"""
# A sample is multi-label when its label list has more than one entry.
mlt_samples = sum(1 for label in labels if len(label) > 1)
print(f"# multi-label samples: {mlt_samples}")

# multi-label samples: 107


In [167]:
""" Sanity checking (labels)
"""
# Collect every distinct label present in the data.  Sorting (by string form,
# so a stray non-string value cannot break the sort) makes the printed list
# stable across kernel restarts instead of depending on set iteration order.
unique_labels = sorted({l for label in labels for l in label}, key=str)

# Use a plain membership test rather than `assert` inside a bare `except:`:
# asserts are removed when Python runs with -O (the check would then always
# pass), and a bare except also swallows unrelated errors.
unknown_labels = [label for label in unique_labels if label not in PROPER_LABELS]
for label in unknown_labels:
    print(f"FAIL: {label}")

error_flag = bool(unknown_labels)
if error_flag:
    print("FAIL\n")
else:
    print("PASS\n")

pprint(f"Unique labels: {unique_labels}")
print(f"\n# Labels: {len(unique_labels)}")

PASS

("Unique labels: ['Client_BadExperience', 'Agent_FamilyShopping', "
 "'Client_AmountLess', 'Agent_CompanyIntroduction', 'Agent_LoanAmount', "
 "'Agent_OH_HighInterest', 'Agent_ChildEducation', 'Agent_EMI', "
 "'Agent_HouseFixing', 'Agent_GoodClose', 'Agent_CallPurpose', "
 "'Client_DiscussWithFamily', 'Agent_OH_CustAskForCallingBack', "
 "'Agent_MentioningDisclaimer', 'Agent_VerifyCustomerName', 'Agent_Greetings', "
 "'Agent_SelfIntroduction', 'Agent_OH_NotRequireInsurance', "
 "'Agent_InterestRate', 'Client_HighInterest', 'Agent_OH_BadExperience', "
 "'Agent_RunBusiness', 'Agent_LoanDuration', 'Client_NotRequireLoan', "
 "'Agent_OH_HighLoanDuration', 'Client_AskForCallingBack', "
 "'Agent_ExplainMoneyTranferProcess', 'Client_HighLoanDuration', "
 "'Agent_OH_NotRequireLoan', 'Agent_OH_AmountLess', 'Agent_InformCallBack', "
 "'Agent_OH_DiscussWithFamily', 'Agent_Thanks', 'Agent_ListeningSkill', "
 "'Agent_InsuranceFee', 'Agent_ExplainDocsRequired', 'Agent_Summarize']")

# Labels: 

In [173]:
""" Need to add samples for these labels
"""
# Predefined labels that no sample uses yet, in PROPER_LABELS order.
# Filtering (instead of calling list.remove once per data label) cannot raise
# ValueError when `unique_labels` contains a label outside PROPER_LABELS.
seen_labels = set(unique_labels)
tmp = [label for label in PROPER_LABELS if label not in seen_labels]

print("Labels don't have samples:")
pprint(tmp)

Labels don't have samples:
['Agent_Event',
 'Agent_Vacation',
 'Client_AgreeToTakeLoan',
 'Client_Confirmation']


In [161]:
""" Write to disk
"""
text_path = "./data/telesale/processed_100_calls/texts.txt"
label_path = "./data/telesale/processed_100_calls/labels.txt"

# Create the output *directory* (both files share it) so the cell also works
# when the folder does not exist yet.
os.makedirs(os.path.dirname(text_path), exist_ok=True)

# One utterance per line.  The encoding is explicit because the utterances
# contain non-ASCII characters and the platform default may not be UTF-8.
with open(text_path, "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(sample)
        f.write("\n")

# One comma-separated label list per line, in the same order as the texts.
with open(label_path, "w", encoding="utf-8") as f:
    for label in labels:
        f.write(",".join(label) + "\n")

### Add More Data

In [3]:
""" Load data from local file
"""
# Read the single-intent samples workbook.
data_path = "./data/telesale/single_intent.xlsx"
df = pd.read_excel(
    data_path,
    sheet_name=3,  # integer sheet_name selects a sheet by zero-based position
    engine="openpyxl",
)
df.head()

Unnamed: 0,No.,Sample,Intent,Entity,Notes
0,1.0,em tên là hà,Agent_SelfIntroduction,,
1,2.0,em tên là hải,Agent_SelfIntroduction,,
2,3.0,em hà đây,Agent_SelfIntroduction,,
3,4.0,em quý anh ơi,Agent_SelfIntroduction,,
4,5.0,em là hương nhân viên bên,Agent_SelfIntroduction,,


In [22]:
def label_checking(real_labels, true_labels):
    """Print whether every label found in the data is a predefined label.

    Args:
        real_labels: Iterable of labels found in the data.
        true_labels: Container of accepted labels (tested with ``in``).

    Prints ``FAIL [...]`` listing the labels that are missing from
    ``true_labels`` (in iteration order of ``real_labels``), or ``PASS`` when
    there are none.  Returns ``None``.
    """
    # Explicit membership test instead of `assert` inside a bare `except:`:
    # asserts are stripped under `python -O` (the check would silently pass)
    # and a bare except would also hide unrelated errors.
    labels = [label for label in real_labels if label not in true_labels]
    if labels:
        print(f"FAIL {labels}\n")
    else:
        print("\tPASS\n")

In [23]:
""" Get texts and corresponding labels
"""
# Pull the utterances and their single intent label out of the sheet.
samples, labels = (df[column].tolist() for column in ("Sample", "Intent"))

# Distinct labels present in this sheet.
unique_labels = {*labels}

print("=== LABEL CHECKING ===")
label_checking(unique_labels, PROPER_LABELS)

=== LABEL CHECKING ===
	PASS



In [27]:
""" Write to disk
"""
texts_path = "./data/telesale/single_intent/texts.txt"
labels_path = "./data/telesale/single_intent/labels.txt"

# Both files live in the same directory; create it if needed.
os.makedirs(os.path.dirname(texts_path), exist_ok=True)

# One utterance per line.  The encoding is explicit because the utterances
# contain non-ASCII characters and the platform default may not be UTF-8.
with open(texts_path, "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(sample)
        f.write("\n")

# One label per line, in the same order as the texts.
with open(labels_path, "w", encoding="utf-8") as f:
    for label in labels:
        f.write(label)
        f.write("\n")

### Merge Single-Label and Multi-Label Data

In [38]:
multi_samples_path = "./data/telesale/processed_100_calls/texts.txt"
multi_labels_path = "./data/telesale/processed_100_calls/labels.txt"
single_samples_path = "./data/telesale/single_intent/texts.txt"
single_labels_path = "./data/telesale/single_intent/labels.txt"


def _read_lines(path):
    """Return the lines of the text file at ``path`` with surrounding whitespace stripped."""
    # Explicit UTF-8: the files hold non-ASCII text and the platform default
    # encoding may differ.
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


def _write_lines(path, lines):
    """Write every item of ``lines`` to ``path`` as UTF-8, one item per line."""
    with open(path, "w", encoding="utf-8") as f:
        for line in lines:
            f.write(line)
            f.write("\n")


single_samples = _read_lines(single_samples_path)
single_labels = _read_lines(single_labels_path)
multi_samples = _read_lines(multi_samples_path)
multi_labels = _read_lines(multi_labels_path)

# Check each source on its own: comparing only the merged totals can hide a
# misalignment between texts and labels inside one source.
assert len(single_samples) == len(single_labels)
assert len(multi_samples) == len(multi_labels)

# merge (single-intent rows first, then multi-label rows)
all_samples = [*single_samples, *multi_samples]
all_labels = [*single_labels, *multi_labels]
assert len(all_samples) == len(all_labels)


all_samples_path = "./data/telesale/merged_data/texts.txt"
all_labels_path = "./data/telesale/merged_data/labels.txt"

os.makedirs(os.path.dirname(all_samples_path), exist_ok=True)

_write_lines(all_samples_path, all_samples)
_write_lines(all_labels_path, all_labels)