In [None]:
!sudo apt-get install g++ openjdk-8-jdk python3-dev python3-pip curl
!python3 -m pip install --upgrade pip
!python3 -m pip install konlpy 
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [9]:
import pandas as pd
import numpy as np

from os.path import join as pjoin

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV   # Performing grid search
from sklearn.metrics import f1_score


In [7]:
from konlpy.tag import Mecab
from typing import List
from ast import literal_eval

# Shared MeCab tagger instance; analyze_syntactics() reads this module-level name.
mecab = Mecab()

In [4]:
def concat_pos(pos_list : List[tuple], concat_pos_list : List[tuple]) -> List[tuple]:
    """Fuse verb endings into the verb morpheme that precedes them.

    Walks ``pos_list`` left to right. Whenever the most recently collected
    pair has a tag starting with ``XSV`` or ``VV`` and the current pair's tag
    is exactly ``EC``, ``EP`` or ``EF``, the two are merged: the surfaces are
    concatenated and the tags are joined with ``+``. The merged tag still
    starts with ``XSV``/``VV``, so a run of consecutive endings collapses into
    a single entry.

    Implemented as a loop rather than recursion so that long inputs neither
    hit the interpreter's recursion limit nor pay for repeated list copies.

    Args:
        pos_list: ``(surface, tag)`` pairs still to be processed.
        concat_pos_list: Pairs that were already processed; used as the
            starting accumulator. The list passed in is not modified.

    Returns:
        A new list with the accumulated pairs followed by the processed
        ``pos_list`` entries, endings merged as described above.
    """
    verb_prefixes = ('XSV', 'VV')
    ending_tags = ('EC', 'EP', 'EF')

    # Work on a copy so the caller's accumulator is never mutated.
    merged = list(concat_pos_list)
    for item in pos_list:
        surface, tag = item[0], item[1]
        if merged and merged[-1][1].startswith(verb_prefixes) and tag in ending_tags:
            prev_surface, prev_tag = merged[-1][0], merged[-1][1]
            merged[-1] = (prev_surface + surface, f"{prev_tag}+{tag}")
        else:
            merged.append(item)
    return merged

def analyze_syntactics(sent : str) -> List[str]:
    """Return the core morphemes of a sentence as a list of surface strings.

    The sentence is tagged with the module-level ``mecab`` tagger, verb
    endings are fused into their verbs by ``concat_pos``, and only the
    entries whose tag starts with ``XSV`` or ``VV`` or is exactly ``NNG`` or
    ``NNP`` are kept.

    Args:
        sent: Sentence to analyze.

    Returns:
        Surface forms of the retained entries, in sentence order.
    """
    tagged = concat_pos(mecab.pos(sent), [])
    return [
        surface
        for surface, tag in tagged
        if tag.startswith(('XSV', 'VV')) or tag in ('NNG', 'NNP')
    ]

In [5]:
def is_nan(string):
    """Report whether ``string`` compares unequal to itself.

    A float NaN is not equal to itself, so the self-comparison yields True
    for NaN and False for ordinary strings and numbers.
    """
    differs_from_itself = string != string
    return differs_from_itself

In [6]:
# Directory containing train.csv / valid.csv / test.csv (tab-separated despite
# the extension). Relative path, so it resolves against the current working directory.
DATA_DIR = 'drive/MyDrive/Colab_Notebooks/Intellius/Emotional_Dialogue/proc_label_data'

In [10]:
# All three splits share one layout: tab-separated, with the 'human-utter'
# column stored as a Python literal that literal_eval parses back into an object.
utter_converters = {'human-utter': literal_eval}

train_path = pjoin(DATA_DIR, 'train.csv')
valid_path = pjoin(DATA_DIR, 'valid.csv')
test_path = pjoin(DATA_DIR, 'test.csv')

train = pd.read_csv(train_path, sep='\t', converters=utter_converters)
valid = pd.read_csv(valid_path, sep='\t', converters=utter_converters)
test = pd.read_csv(test_path, sep='\t', converters=utter_converters)

In [30]:
# Derive the text features for the train and validation frames in turn:
#   human-utter-flat : the entries of 'human-utter' joined with single spaces
#   query-pos        : core morphemes returned by analyze_syntactics
#   query-pos-str    : those morphemes joined with single spaces (vectorizer input)
for split_df in (train, valid):
    split_df['human-utter-flat'] = [' '.join(utterances) for utterances in split_df['human-utter']]
    split_df['query-pos'] = [analyze_syntactics(flat) for flat in split_df['human-utter-flat']]
    split_df['query-pos-str'] = [' '.join(morphs) for morphs in split_df['query-pos']]

In [31]:
# Stack the train and validation rows into one frame with a fresh 0..n-1 index.
dev = pd.concat([train, valid], ignore_index=True)

In [16]:
# Same three derived columns as for train/valid, computed for the test frame:
# flattened utterances -> core morphemes -> space-joined morpheme string.
test['human-utter-flat'] = [' '.join(utterances) for utterances in test['human-utter']]
test['query-pos'] = [analyze_syntactics(flat) for flat in test['human-utter-flat']]
test['query-pos-str'] = [' '.join(morphs) for morphs in test['query-pos']]

In [70]:
# Inputs (space-joined morphemes) and labels of the combined train+valid frame.
x_dev, y_dev = dev['query-pos-str'], dev['target']

In [32]:
# Inputs (space-joined morphemes) and labels for each individual split.
x_train, y_train = train['query-pos-str'], train['target']
x_valid, y_valid = valid['query-pos-str'], valid['target']
x_test, y_test = test['query-pos-str'], test['target']

In [33]:
# Bag-of-words counts: the vocabulary is learned from the training split only
# and then reused unchanged for the validation and test splits.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so one-character morphemes are dropped —
# confirm that this is intended for this data.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_valid_counts, x_test_counts = (
    count_vect.transform(split) for split in (x_valid, x_test)
)

x_train_counts.shape

(30119, 21225)

In [71]:
# Feature pipeline fitted on the combined train+valid data (dev); the test
# split is only transformed with the fitted vocabulary and IDF weights.
dev_cnt_vect = CountVectorizer()
dev_tfidf_transformer = TfidfTransformer()

# Fit on dev.
x_dev_counts = dev_cnt_vect.fit_transform(x_dev)
x_dev_tfidf = dev_tfidf_transformer.fit_transform(x_dev_counts)

# Apply to test.
x_dev_test_counts = dev_cnt_vect.transform(x_test)
x_dev_test_tfidf = dev_tfidf_transformer.transform(x_dev_test_counts)

In [34]:
# TF-IDF weighting: IDF statistics come from the training counts only and are
# then applied to the validation and test counts.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_valid_tfidf, x_test_tfidf = (
    tfidf_transformer.transform(counts) for counts in (x_valid_counts, x_test_counts)
)

x_train_tfidf.shape

(30119, 21225)

In [40]:
# Hyper-parameters for the 4-class XGBoost model. n_estimators is only an
# upper bound: fit() below stops early once the validation metric has not
# improved for 100 consecutive rounds.
xgb_params = dict(
    n_estimators=5000,
    learning_rate=0.1,
    max_depth=8,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=4,
    nthread=-1,
    scale_pos_weight=1,
    seed=42,
)
xgb_wrapper = XGBClassifier(**xgb_params)

# The validation split is the early-stopping monitor; it is not trained on.
evals = [(x_valid_tfidf, y_valid)]
xgb_wrapper.fit(
    x_train_tfidf,
    y_train,
    eval_set=evals,
    early_stopping_rounds=100,
    verbose=True,
)

ws100_preds = xgb_wrapper.predict(x_test_tfidf)

[0]	validation_0-merror:0.510887
Will train until validation_0-merror hasn't improved in 100 rounds.
[1]	validation_0-merror:0.469995
[2]	validation_0-merror:0.449549
[3]	validation_0-merror:0.443574
[4]	validation_0-merror:0.443176
[5]	validation_0-merror:0.441052
[6]	validation_0-merror:0.44291
[7]	validation_0-merror:0.443309
[8]	validation_0-merror:0.44291
[9]	validation_0-merror:0.440255
[10]	validation_0-merror:0.438794
[11]	validation_0-merror:0.436006
[12]	validation_0-merror:0.433617
[13]	validation_0-merror:0.432023
[14]	validation_0-merror:0.431758
[15]	validation_0-merror:0.431891
[16]	validation_0-merror:0.432687
[17]	validation_0-merror:0.431492
[18]	validation_0-merror:0.429368
[19]	validation_0-merror:0.427642
[20]	validation_0-merror:0.424456
[21]	validation_0-merror:0.423659
[22]	validation_0-merror:0.423261
[23]	validation_0-merror:0.423659
[24]	validation_0-merror:0.422862
[25]	validation_0-merror:0.4218
[26]	validation_0-merror:0.421402
[27]	validation_0-merror:0.4

In [41]:
# Test accuracy of the XGBoost model: fraction of predictions equal to the true label.
np.mean(ws100_preds == y_test)

0.6554657592513824

In [76]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion decision tree trained on the combined train+valid (dev)
# TF-IDF features, then used to predict the test split.
dt_wrapper = DecisionTreeClassifier(criterion='entropy', max_depth=500, random_state=42)
dt_preds = dt_wrapper.fit(x_dev_tfidf, y_dev).predict(x_dev_test_tfidf)

In [77]:
# Test accuracy of the decision tree: fraction of predictions equal to the true label.
np.mean(dt_preds == y_test)

0.5569970225435985

In [72]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier trained on the combined train+valid
# (dev) TF-IDF features, then used to predict the test split.
svc_clf = SVC(kernel='rbf', C=50, gamma=1)
svc_preds = svc_clf.fit(x_dev_tfidf, y_dev).predict(x_dev_test_tfidf)

In [73]:
# Test accuracy of the SVC: fraction of predictions equal to the true label.
np.mean(svc_preds == y_test)

0.6773713313483624

In [110]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN (k=300) on the dev TF-IDF features, evaluated on the test split.
# The TF-IDF matrices are sparse, and scikit-learn overrides any tree-based
# `algorithm` setting (and therefore `leaf_size`) with brute-force search when
# fitted on sparse input. Brute force is requested explicitly so that the
# configuration states what actually runs.
kn_clf = KNeighborsClassifier(
                        n_neighbors=300,
                        algorithm='brute',
                        n_jobs=-1)

kn_clf.fit(x_dev_tfidf, y_dev)
kn_preds = kn_clf.predict(x_dev_test_tfidf)



In [111]:
# Macro-averaged F1 of the k-NN predictions on the test split.
f1_score(y_test, kn_preds, average='macro')

0.478540692772221

In [112]:
# Test accuracy of the k-NN classifier: fraction of predictions equal to the true label.
np.mean(kn_preds == y_test)

0.5076563164610803

In [None]:
# Grid over tree depth and minimum child weight:
# max_depth in {3, 6, 9} x min_child_weight in {1, 3, 5} = 9 candidates,
# each scored by accuracy with 5-fold cross-validation on the training split.
param_test1 = {
    'max_depth': range(3, 10, 3),
    'min_child_weight': range(1, 6, 2),
}

# Base estimator; max_depth and min_child_weight here are placeholders that
# the grid overrides for every candidate.
# NOTE(review): both the estimator (nthread=-1) and the search (n_jobs=-1)
# request every core, which may oversubscribe the CPU — consider limiting one.
base_xgb = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=-1,
    scale_pos_weight=1,
    seed=42,
)
gsearch1 = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_test1,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=10,
)

gsearch1.fit(x_train_tfidf, y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_