In [128]:
from pprint import pprint
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pyvi import ViTokenizer

In [149]:
""" Load data
"""
samples_path = "./data/telesale/merged_data/texts.txt"
labels_path = "./data/telesale/merged_data/labels.txt"

samples = []
labels = []

with open(samples_path, "r") as f:
    lines = f.readlines()
    for line in lines:
        samples.append(line.strip())

with open(labels_path, "r") as f:
    lines = f.readlines()
    for line in lines:
        labels.append(line.strip())

In [150]:
""" Feature Extraction
"""
def _tokenize(text):
    return ViTokenizer.tokenize(text).split()

vectorizer = TfidfVectorizer(
    token_pattern=r"(?u)\b\w+\b",
    tokenizer=_tokenize,
    ngram_range=(1, 2),
    sublinear_tf=True
)
X = vectorizer.fit_transform(samples)

In [136]:
""" Prepare multilabel format for training with Sklearn
"""
# Get unique labels from dataset
classes = set()
for label in labels:
    tmp_lbl = label.split(",")
    for t in tmp_lbl:
        classes.add(t)
classes = list(classes)

# List[List] --> List[Tuple]
# create a copy of `labels` variable
mlb_labels = []
for label in labels:
    tmp_labels = label.split(",")
    mlb_labels.append(tuple(tmp_labels))

mlb = MultiLabelBinarizer(classes=classes)
y = mlb.fit_transform(mlb_labels)

In [146]:
""" Build classifier which enables multilabel capability
"""
RANDOM_STATE = 42
NUM_CPU = 8
estimator = SVC(kernel='linear',
                random_state=RANDOM_STATE,
                verbose=1, probability=True)
clf = MultiOutputClassifier(estimator, n_jobs=NUM_CPU)

In [153]:
clf.fit(X, y)

*
optimization finished, #iter = 217
obj = -20.172842, rho = 0.977893
nSV = 152, nBSV = 15
Total nSV = 152
*
optimization finished, #iter = 250
obj = -20.793150, rho = 0.969284
nSV = 184, nBSV = 13
Total nSV = 184
*
optimization finished, #iter = 529
obj = -69.775221, rho = 1.175727
nSV = 290, nBSV = 52
Total nSV = 290
*
optimization finished, #iter = 413
obj = -37.925555, rho = 0.993085
nSV = 284, nBSV = 26
Total nSV = 284
*
optimization finished, #iter = 614
obj = -60.479949, rho = 0.962397
nSV = 376, nBSV = 42
Total nSV = 376
*
optimization finished, #iter = 401
obj = -55.978730, rho = 1.065576
nSV = 248, nBSV = 47
Total nSV = 248
*
optimization finished, #iter = 174
obj = -22.492233, rho = 0.989986
nSV = 133, nBSV = 18
Total nSV = 133
*
optimization finished, #iter = 566
obj = -44.180222, rho = 0.962205
nSV = 372, nBSV = 32
Total nSV = 372
*
optimization finished, #iter = 361
obj = -25.770421, rho = 1.070202
nSV = 226, nBSV = 18
Total nSV = 226
*
optimization finished, #iter = 828


*
optimization finished, #iter = 243
obj = -26.199909, rho = -1.091932
nSV = 156, nBSV = 19
Total nSV = 156
*
optimization finished, #iter = 313
obj = -25.378313, rho = 1.021224
nSV = 228, nBSV = 17
Total nSV = 228
*
optimization finished, #iter = 329
obj = -42.182420, rho = 1.124406
nSV = 195, nBSV = 34
Total nSV = 195
*
optimization finished, #iter = 332
obj = -27.531613, rho = 1.073535
nSV = 218, nBSV = 22
Total nSV = 218
*
optimization finished, #iter = 259
obj = -15.393689, rho = 0.997574
nSV = 186, nBSV = 10
Total nSV = 186
*
optimization finished, #iter = 915
obj = -96.729837, rho = -0.938394
nSV = 547, nBSV = 69
Total nSV = 547
*
optimization finished, #iter = 386
obj = -15.532390, rho = 0.983431
nSV = 285, nBSV = 11
Total nSV = 285
*
optimization finished, #iter = 358
obj = -15.691388, rho = 0.983302
nSV = 265, nBSV = 11
Total nSV = 265
*
optimization finished, #iter = 252
obj = -21.470126, rho = -1.004607
nSV = 178, nBSV = 18
Total nSV = 178
*
optimization finished, #iter = 3

*
optimization finished, #iter = 593
obj = -72.901448, rho = 1.134648
nSV = 337, nBSV = 56
Total nSV = 337
*
optimization finished, #iter = 557
obj = -67.196092, rho = 0.991841
nSV = 376, nBSV = 56
Total nSV = 376
*
optimization finished, #iter = 201
obj = -22.787825, rho = -1.035777
nSV = 138, nBSV = 18
Total nSV = 138
*
optimization finished, #iter = 581
obj = -27.565009, rho = 0.976489
nSV = 409, nBSV = 19
Total nSV = 409
*
optimization finished, #iter = 308
obj = -53.381787, rho = 1.173744
nSV = 190, nBSV = 47
Total nSV = 190
*
optimization finished, #iter = 412
obj = -37.675896, rho = -0.991476
nSV = 275, nBSV = 29
Total nSV = 275
*
optimization finished, #iter = 607
obj = -40.705617, rho = 0.965170
nSV = 374, nBSV = 28
Total nSV = 374
*
optimization finished, #iter = 342
obj = -55.337014, rho = 1.191103
nSV = 200, nBSV = 49
Total nSV = 200
*
optimization finished, #iter = 401
obj = -18.485329, rho = 0.983020
nSV = 282, nBSV = 12
Total nSV = 282
*
optimization finished, #iter = 26

*
optimization finished, #iter = 689
obj = -108.136933, rho = 1.341242
nSV = 376, nBSV = 85
Total nSV = 376
*
optimization finished, #iter = 807
obj = -73.918973, rho = -0.977918
nSV = 479, nBSV = 59
Total nSV = 479
*
optimization finished, #iter = 696
obj = -115.519673, rho = 1.351657
nSV = 409, nBSV = 94
Total nSV = 409
*
optimization finished, #iter = 795
obj = -130.178844, rho = -1.396409
nSV = 439, nBSV = 107
Total nSV = 439


MultiOutputClassifier(estimator=SVC(kernel='linear', probability=True,
                                    random_state=42, verbose=1),
                      n_jobs=8)

In [171]:
test_x = ["đúng rồi là mình góp á là hàng tháng chỉ góp là hai triệu ba trăm hai lăm ngàn em hỗ trợ cho mình góp trong vòng mười hai tháng thôi còn nếu mà chị đóng lãi là hàng tháng chị đóng là bảy trăm chín mươi hai ngàn đó nhưng mà em hỗ trợ cho mình nếu mà chị còn hợp đồng cũ thì em hỗ trợ cho mình góp trong vòng mười hai tháng còn nếu a em xin lỗi đóng lãi trong vòng mười hai tháng còn nếu mà chị dứt hợp đồng cũ rồi thì mình chuyển sang mình góp luôn nha chị ha"]
test_x = vectorizer.transform(test_x)
predicted_results = clf.predict(test_x)
mlb.inverse_transform(predicted_results)

[('Agent_EMI',)]

In [161]:
labels[1100:1150]

['Client_NotRequireLoan',
 'Agent_VerifyCustomerName',
 'Agent_Greetings,Agent_SelfIntroduction',
 'Agent_CallPurpose',
 'Agent_InterestRate',
 'Agent_EMI',
 'Agent_FamilyShopping,Agent_EMI',
 'Agent_ExplainDocsRequired',
 'Agent_FamilyShopping,Agent_RunBusiness',
 'Agent_FamilyShopping',
 'Agent_OH_NotRequireLoan',
 'Agent_OH_NotRequireLoan',
 'Agent_OH_NotRequireLoan',
 'Agent_OH_NotRequireLoan',
 'Agent_OH_NotRequireLoan',
 'Agent_OH_DiscussWithFamily',
 'Agent_InformCallBack',
 'Agent_InformCallBack',
 'Agent_OH_DiscussWithFamily',
 'Agent_ListeningSkill',
 'Agent_OH_DiscussWithFamily',
 'Agent_OH_BadExperience',
 'Agent_OH_BadExperience',
 'Agent_OH_BadExperience',
 'Agent_InformCallBack',
 'Agent_InformCallBack',
 'Client_DiscussWithFamily',
 'Client_DiscussWithFamily',
 'Client_BadExperience',
 'Agent_Greetings,Agent_SelfIntroduction,Agent_CallPurpose',
 'Agent_InterestRate',
 'Agent_RunBusiness',
 'Agent_RunBusiness',
 'Agent_Thanks,Agent_ListeningSkill',
 'Agent_VerifyCustomer

In [98]:
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np
X, y1 = make_classification(n_samples=10, n_features=100, n_informative=30, n_classes=3, random_state=1)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
Y = np.vstack((y1, y2, y3)).T
n_samples, n_features = X.shape # 10,100
n_outputs = Y.shape[1] # 3
n_classes = 3
forest = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(X, Y).predict(X)

<2170x11093 sparse matrix of type '<class 'numpy.float64'>'
	with 57227 stored elements in Compressed Sparse Row format>