In [28]:
from pprint import pprint
from pyvi import ViTokenizer

In [5]:
""" Load PhoATIS
"""
import os


data_path = "../../data/phoatis/"
train_path = os.path.join(data_path, "train")
val_path = os.path.join(data_path, "dev")
test_path = os.path.join(data_path, "test")


def load_file(path):
    lines = []
    with open(path, "r") as f:
        lines = f.readlines()

    return lines


def load_data(path):
    X, y = [], []
    X_path = os.path.join(path, "seq.in")
    y_path = os.path.join(path, "label")
    X = load_file(X_path)
    y = load_file(y_path)

    return X, y


def load_phoatis(data_root_path, split="all"):
    if split == "all":
        train_X, val_X, test_X = [], [], []
        train_y, val_y, test_y = [], [], []

        train_path = os.path.join(data_root_path, "train")
        val_path = os.path.join(data_root_path, "dev")
        test_path = os.path.join(data_root_path, "test")

        train_X, train_y = load_data(train_path)
        val_X, val_y = load_data(val_path)
        test_X, test_y = load_data(test_path)
        
        return (train_X, train_y), (val_X, val_y), (test_X, test_y)

In [15]:
# EXAMPLE
train, val, test = load_phoatis(data_path, split="all")
train_X, train_y = train[0], train[1]
val_X, val_y = val[0], val[1]
test_X, test_y = test[0], test[1]

In [None]:
""" Load VA-style based data
"""
import pandas as pd


def load_va_data(path):
    """ Load raw data
    
    Supported foramt: xlsx
    """
    X, y = [], []

    data = pd.read_excel(path, engine="openpyxl")
    X = data["Sample"].tolist()
    y = data["Intent"].tolist()

    return X, y

In [37]:
# EXAMPLE
PATH = "../../data/trungquan/trainset.xlsx"
X, y = load_va_data(PATH)

In [18]:
from emandai.utils import load_data_from_botid


data = load_data_from_botid("6268f7f89f455cd9eb292df4")
data.head(n=10)

INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token


Unnamed: 0,Sentence,0_bat_dau_cuoc_goi,2_dong_y_trao_doi,1_nghe_khong_ro,1_goi_lai_sau,2_2_doi_so_dien_thoai,2_tu_choi,1_nham_so,2_1_dong_y_gui_zalo,1_xac_nhan_dung,2_Noi_thang_Zalo,2_3_khong_dung_zalo
0,nói như điện tử thế nhở,0,0,1,0,0,0,0,0,0,0,0
1,ừ em là máy à,0,0,1,0,0,0,0,0,0,0,0
2,à em để tự động,0,0,1,0,0,0,0,0,0,0,0
3,tổng đài à,0,0,1,0,0,0,0,0,0,0,0
4,tổng đài tự động à,0,0,1,0,0,0,0,0,0,0,0
5,nói gì nghe không rõ,0,0,1,1,0,0,0,0,0,0,0
6,nói gì thế,0,0,1,0,0,0,0,0,0,0,0
7,xin chào,1,0,0,0,0,0,0,0,0,0,0
8,hi,1,0,0,0,0,0,0,0,0,0,0
9,alo,1,0,0,0,0,0,0,0,0,0,0


In [30]:
# remove duplicate
data = data.drop_duplicates("Sentence")

# lower
data["Sentence"] = data["Sentence"].map(lambda x: x.lower())
data.head(n=10)

# word segmenation
data["Sentence"] = data["Sentence"].map(ViTokenizer.tokenize)

In [34]:
data.sample(n=10)

Unnamed: 0,Sentence,0_bat_dau_cuoc_goi,2_dong_y_trao_doi,1_nghe_khong_ro,1_goi_lai_sau,2_2_doi_so_dien_thoai,2_tu_choi,1_nham_so,2_1_dong_y_gui_zalo,1_xac_nhan_dung,2_Noi_thang_Zalo,2_3_khong_dung_zalo
344,em ở đâu_đấy,0,0,0,0,0,0,0,0,1,0,0
460,điện lại được không,0,0,0,1,0,0,0,0,0,0,0
303,ừ chị đấy em,0,0,0,0,0,0,0,0,1,0,0
136,ô_kê em ơi,0,1,0,0,0,0,0,1,0,0,0
360,gửi meo đi em,0,0,0,0,0,0,0,0,0,0,1
349,chết_thật lại còn gọi tự_động chứ,0,0,0,0,0,0,0,0,1,0,0
307,ừ đây,0,0,0,0,0,0,0,0,1,0,0
383,tối gọi lại,0,0,0,1,0,0,0,0,0,0,0
843,đừg gửi qua đây tui xài da lô số mới_rồi,0,0,0,0,1,0,0,0,0,0,0
463,tôi đâu có rảnh mà nói_chuyện,0,0,0,1,0,0,0,0,0,0,0


In [40]:
data[data.columns[1:]].head(n=5)
# convert to List[List[Text]]

Unnamed: 0,0_bat_dau_cuoc_goi,2_dong_y_trao_doi,1_nghe_khong_ro,1_goi_lai_sau,2_2_doi_so_dien_thoai,2_tu_choi,1_nham_so,2_1_dong_y_gui_zalo,1_xac_nhan_dung,2_Noi_thang_Zalo,2_3_khong_dung_zalo
0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0


In [61]:
y = [row[row == 1].index.tolist() for _, row in data.iterrows()]
y

[['1_nghe_khong_ro'],
 ['1_nghe_khong_ro'],
 ['1_nghe_khong_ro'],
 ['1_nghe_khong_ro'],
 ['1_nghe_khong_ro'],
 ['1_nghe_khong_ro', '1_goi_lai_sau'],
 ['1_nghe_khong_ro'],
 ['0_bat_dau_cuoc_goi'],
 ['0_bat_dau_cuoc_goi'],
 ['0_bat_dau_cuoc_goi'],
 ['2_dong_y_trao_doi', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '2_1_dong_y_gui_zalo', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '2_1_dong_y_gui_zalo'],
 ['2_dong_y_trao_doi', '2_1_dong_y_gui_zalo', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '2_1_dong_y_gui_zalo', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '2_1_dong_y_gui_zalo'],
 ['2_dong_y_trao_doi', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '2_1_dong_y_gui_zalo', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '2_1_dong_y_gui_zalo', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '1_xac_nhan_dung'],
 ['2_dong_y_trao_doi', '2_1_d