In [1]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle

# Load data files

In [2]:
# NOTE(review): unpickling can execute arbitrary code; only load these files from a trusted source.
# Utterance tables for the train and test splits.
train_df = pd.read_pickle("../data/datasets/train_df.pkl")
test_df = pd.read_pickle("../data/datasets/test_df.pkl")

# Pre-computed step-1 feature tables.
# Presumably row-aligned with the utterance tables above (later cells drop rows
# from both using the same index labels) — TODO confirm.
train_features = pd.read_pickle("../data/gbdt_features/train_features_step1.pkl")
test_features= pd.read_pickle("../data/gbdt_features/test_features_step1.pkl")

In [3]:
# Sanity check: (rows, columns) of the utterance tables and the feature tables.
for frame in (train_df, test_df, train_features, test_features):
    print(frame.shape)

(1564, 3)
(194, 3)
(1564, 55)
(194, 55)


In [4]:
# Class balance of the test set: number of utterances carrying each label.
for label in ('SE', 'FT', 'PT'):
    print(len(test_df[test_df[2] == label]))

68
69
57


In [5]:
# Utterance ids in column 0 look like "<conversation>_<turn>"; rows whose turn
# number is 1 open a conversation and are excluded from classification.
turn_numbers = test_df[0].str.split("_").str[1].astype(int)
is_first_turn = turn_numbers == 1
drop_first_turn_test_lines = test_df.index[is_first_turn].tolist()
print("how many first utterance in test set?",len(drop_first_turn_test_lines))

# Remove the same index labels from the utterance table and the feature table.
test_df_no_first_turn = test_df.drop(drop_first_turn_test_lines)
test_features_no_first_turn = test_features.drop(drop_first_turn_test_lines)

print(test_features_no_first_turn.shape)

how many first utterance in test set? 20
(174, 55)


In [6]:
test_df_no_first_turn.head(5)

Unnamed: 0,0,1,2
1,31_2,Is it treatable?,FT
2,31_3,Tell me about lung cancer.,SE
3,31_4,What are its symptoms?,PT
4,31_5,Can it spread to the throat?,PT
5,31_6,What causes throat cancer?,SE


# Train classifier

In [7]:
# Feature columns: everything after the first two columns of the feature table.
cols = train_features.columns[2:]
print("Features: ",cols)


def _encode_se_labels(raw_labels):
    """Turn raw utterance labels into binary targets (1 = 'SE', 0 = 'FT' or 'PT').

    Surrounding whitespace is stripped before mapping, so variants such as
    'SE ', 'FT ' or 'PT ' are all accepted.

    Args:
        raw_labels: pandas Series of label strings.

    Returns:
        Integer Series with the same index as ``raw_labels``.

    Raises:
        ValueError: if any label is missing or is not one of SE / FT / PT.
            A plain ``Series.map`` would silently turn such rows into NaN.
    """
    cleaned = raw_labels.str.strip()
    encoded = cleaned.map({'SE': 1, 'FT': 0, 'PT': 0})
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            "Unexpected labels: {}".format(raw_labels[unmapped].unique().tolist())
        )
    return encoded.astype('int64')


# train
X_train_df = train_features[cols]
X_train = X_train_df.to_numpy()

train_labels = _encode_se_labels(train_df[2])
y_train_df = train_labels
y_train = train_labels.to_numpy()
print("TRAIN positive examples {} SE of total examples {}".format(sum(y_train), len(y_train)))


# test WITHOUT FIRST TURN
X_test_df = test_features_no_first_turn[cols]
X_test = X_test_df.to_numpy()

test_labels = _encode_se_labels(test_df_no_first_turn[2])
y_test_df = test_labels
y_test = test_labels.to_numpy()
# The number of removed first-turn utterances is derived rather than hard-coded.
n_first_turn = len(drop_first_turn_test_lines)
print("TEST positive examples {0} SE + {2} first utt SE of total examples {1} + {2} first utt".format(
    sum(y_test), len(y_test), n_first_turn))

Features:  Index(['utt_len', 'num_tokens', 'complete_sent', 'question_mark', 'ner',
       'ner_b', 'ner_tm_0', 'ner_tm_1', 'ner_tm_b', 'noun', 'noun_b', 'adj',
       'adj_b', 'adj_comp', 'adj_comp_b', 'adv', 'adv_b', 'adv_comp',
       'adv_comp_b', 'pron', 'pron_b', 'pron_3rd', 'pron_3rd_b', 'cue_ph',
       'cue_ph_b', 'cue_kw', 'cue_kw_b', 'cue_ex', 'cue_ex_b', 'cue_comp',
       'cue_comp_b', 'question', 'question_b', 'question_ph', 'question_ph_b',
       'what', 'where', 'when', 'who', 'why', 'which', 'how', 'how_much',
       'how_many', 'how_long', 'what_is', 'what_is_2', 'what_is_3',
       'tell_me_question', 'n_chunks', 'question_ph_2', 'question_ph_2_b',
       'ques_mark_it'],
      dtype='object')
TRAIN positive examples 468 SE of total examples 1564
TEST positive examples 48 SE + 20 first utt SE of total examples 174 + 20 first utt


In [8]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [9]:
# create dataset for lightgbm
# NOTE(review): lgb_train / lgb_eval are not referenced again in the visible part of
# this notebook; the classifier is fitted on the numpy arrays X_train / y_train instead.
lgb_train = lgb.Dataset(X_train_df, y_train_df)
lgb_eval = lgb.Dataset(X_test_df, y_test_df, reference=lgb_train)

In [10]:
# Hyper-parameters of the gradient-boosted tree classifier, collected in one dict
# so they can be inspected or tweaked in a single place.
print('Init classifier...')
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.01,
    random_state=42,
    num_leaves=128,
    n_estimators=1500,
    min_child_samples=10,
)
lgb_estimator = lgb.LGBMClassifier(**lgb_params)

Init classifier...


In [11]:
from sklearn.model_selection import GridSearchCV

# Utility function to report best scores
def report(results, n_top=3):
    """Print a short summary of the best-ranked grid-search candidates.

    Args:
        results: mapping with the keys 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params' (e.g. ``GridSearchCV.cv_results_``).
        n_top: highest rank to report; every candidate with rank 1..n_top is
            printed, including ties.

    Returns:
        None. The summary is written to stdout.
    """
    for rank in range(1, n_top + 1):
        # Indices of all candidates that share this rank (ties are all listed).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

            
# Exhaustive grid: every combination of these values is a candidate.
param_grid = dict(
    num_leaves=[64, 128, 256, 512],
    n_estimators=[1000, 1250, 1500],
    min_child_samples=[5, 10],
)


# The search is expensive, so it only runs when this switch is turned on.
run_grid_search = False
if run_grid_search:
    grid_search = GridSearchCV(lgb_estimator, param_grid=param_grid, cv=10)
    start = time()
    grid_search.fit(X_train, y_train)
    elapsed = time() - start

    n_candidates = len(grid_search.cv_results_['params'])
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (elapsed, n_candidates))
    report(grid_search.cv_results_)

In [12]:
# Fit the classifier configured above on the full training set.
# NOTE(review): the hyper-parameters are hard-coded in the estimator constructor;
# presumably they were copied from an earlier grid-search run — confirm.
print('Fitting the model...')
lgb_estimator.fit(X_train, y_train)

Fitting the model...


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.01, max_depth=-1,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1500, n_jobs=-1, num_leaves=128, objective='binary',
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [13]:
# Switch: set to True to write the trained booster to disk.
# Despite its name, this flag controls saving, not printing.
print_model = False

if print_model:
    print('Saving model...')
    # save model to file
    lgb_estimator.booster_.save_model('../data/gbdt_models/Step1_lightGBM.txt')

# Evaluation

In [14]:
# Predict class labels for the test utterances with first turns removed
# (X_test is built from test_features_no_first_turn).
print('Starting predicting...')
y_pred = lgb_estimator.predict(X_test)

Starting predicting...


In [15]:
# Switch: set to True to write the predicted labels to disk as a .npy file.
# Despite its name, this flag controls saving, not printing.
print_y_pred = False

if print_y_pred:
    np.save("../data/gbdt_models/step_1_y_pred_lightGBM.npy", y_pred)

In [16]:
from sklearn.metrics import mean_squared_error
# Both arrays hold 0/1 class labels, so the squared error of a row is 1 when the
# prediction is wrong and 0 otherwise: this value is the square root of the error rate.
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

The rmse of prediction is: 0.46732301954611777


In [17]:
from sklearn.metrics import accuracy_score
# Accuracy on the test utterances WITHOUT the first turns; normalize=True reports
# the fraction of correct predictions rather than the count.
print("Accuracy : ", accuracy_score(y_test, y_pred, normalize = True))

Accuracy :  0.7816091954022989


# Complete evaluation
We append the 20 first-turn utterances, all with label 1, to the end of both the predictions and the ground truth so that the evaluation covers all 194 utterances in the test set.

In [18]:
# First-turn utterances are not passed through the classifier; each one is
# counted as predicted SE (label 1). Their number is derived from the rows that
# were dropped instead of being hard-coded.
n_first_turn = len(drop_first_turn_test_lines)
all_y_pred = np.concatenate((y_pred, np.ones(n_first_turn, dtype=int)))

print(all_y_pred)
print(len(all_y_pred))

[0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
 1 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1]
194


In [19]:
# Ground truth for the first-turn utterances is appended as label 1 (SE).
# Check that assumption against the actual labels and derive the number of
# appended rows from the dropped rows instead of hard-coding it.
n_first_turn = len(drop_first_turn_test_lines)
first_turn_labels = test_df.loc[drop_first_turn_test_lines, 2].str.strip()
assert (first_turn_labels == 'SE').all(), "not every first-turn utterance is labelled SE"
all_y_test = np.concatenate((y_test, np.ones(n_first_turn, dtype=int)))

print(all_y_test)
print(len(all_y_test))

[0 1 0 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1 1 0 0 1
 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 1 1 0 1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1]
194


In [20]:
# WITH FIRST TURN
# The appended first-turn rows carry label 1 in both arrays, so each of them is
# counted as a correct positive; scores here are therefore higher than on the
# classified utterances alone.
from sklearn.metrics import classification_report, confusion_matrix

# Rows of the matrix are true classes, columns are predicted classes (order 0, 1).
print(confusion_matrix(all_y_test, all_y_pred))
print(classification_report(all_y_test, all_y_pred))

[[108  18]
 [ 20  48]]
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       126
           1       0.73      0.71      0.72        68

    accuracy                           0.80       194
   macro avg       0.79      0.78      0.78       194
weighted avg       0.80      0.80      0.80       194



In [21]:
test_index = list(test_df_no_first_turn.index)

# One (true label, predicted label, row index) triple per wrong prediction.
res = [triple for triple in zip(y_test, y_pred, test_index) if triple[0] != triple[1]]
print(lgb_estimator.__class__)
print("Missclassification ",len(res), "/", len(y_pred))
# A wrong prediction of 1 is a false positive, so summing the predicted labels counts them.
print("False positives ", sum(pred for _, pred, _ in res))

<class 'lightgbm.sklearn.LGBMClassifier'>
Missclassification  38 / 174
False positives  18


In [22]:
print("Misclassification cases -> go back to utterances")
print("y_test, y_pred, test_index (up to 194)")
# Column 0 holds the utterance id, column 1 the utterance text.
utterance_ids = test_df_no_first_turn[0]
utterance_texts = test_df_no_first_turn[1]
for item in res:
    row = item[2]
    print(item, utterance_ids.loc[row], utterance_texts.loc[row])

Misclassification cases -> go back to utterances
y_test, y_pred, test_index (up to 194)
(1, 0, 5) 31_6 What causes throat cancer?
(1, 0, 10) 32_2 Are sharks endangered?  If so, which species?
(0, 1, 12) 32_4 What is the largest ever to have lived on Earth?
(1, 0, 14) 32_6 What about for great whites?
(0, 1, 28) 33_9 What are the differences between the book and movies?
(1, 0, 29) 33_10 Did the horse Artax really die?
(1, 0, 33) 34_4 Who were the Sea Peoples?
(0, 1, 43) 37_5 What are other similar experiments?
(0, 1, 46) 37_8 What were the similarities and differences between the studies?
(1, 0, 47) 37_9 What about the BBC experiment?
(1, 0, 59) 40_9 How did Britpop change music?
(0, 1, 73) 50_3 What are the important classes of satellite?
(0, 1, 75) 50_5 What is the Galileo system and why is it important?
(1, 0, 84) 54_4 Is the Spy Museum free?
(0, 1, 85) 54_5 What is there to do in DC after the museums close?
(0, 1, 86) 54_6 What is the best time to visit the reflecting pools?
(0, 1, 

In [23]:
from operator import itemgetter
# (importance, feature name) pairs, most important feature first.
feat_importance = sorted(
    zip(lgb_estimator.feature_importances_, cols),
    key=itemgetter(0),
    reverse=True,
)
feat_importance

[(48523, 'utt_len'),
 (17454, 'num_tokens'),
 (13837, 'ner_tm_0'),
 (12082, 'ner_tm_1'),
 (10144, 'noun'),
 (8930, 'n_chunks'),
 (7632, 'complete_sent'),
 (6989, 'adj_b'),
 (6188, 'ner'),
 (4722, 'question_b'),
 (4694, 'question'),
 (3800, 'adv'),
 (2954, 'what_is'),
 (2877, 'who'),
 (2402, 'pron'),
 (2059, 'how'),
 (1432, 'pron_3rd'),
 (1282, 'why'),
 (963, 'question_mark'),
 (928, 'what_is_3'),
 (751, 'question_ph_2'),
 (740, 'which'),
 (739, 'where'),
 (635, 'adj_comp'),
 (590, 'when'),
 (590, 'what_is_2'),
 (566, 'cue_comp'),
 (438, 'cue_kw'),
 (369, 'ner_b'),
 (253, 'noun_b'),
 (204, 'question_ph'),
 (157, 'adv_comp'),
 (149, 'how_many'),
 (145, 'tell_me_question'),
 (37, 'adv_b'),
 (30, 'ques_mark_it'),
 (26, 'cue_ph'),
 (2, 'pron_3rd_b'),
 (1, 'pron_b'),
 (1, 'cue_comp_b'),
 (0, 'ner_tm_b'),
 (0, 'adj'),
 (0, 'adj_comp_b'),
 (0, 'adv_comp_b'),
 (0, 'cue_ph_b'),
 (0, 'cue_kw_b'),
 (0, 'cue_ex'),
 (0, 'cue_ex_b'),
 (0, 'question_ph_b'),
 (0, 'what'),
 (0, 'how_much'),
 (0, 'how_lo

# Classification results for cascade 

In [25]:
# Utterance ids (column 0) of every test row that is not a first turn, in row
# order. No filtering by label happens here.
qids = test_df_no_first_turn[0].to_numpy()
qids

array(['31_2', '31_3', '31_4', '31_5', '31_6', '31_7', '31_8', '31_9',
       '32_2', '32_3', '32_4', '32_5', '32_6', '32_7', '32_8', '32_9',
       '32_10', '32_11', '33_2', '33_3', '33_4', '33_5', '33_6', '33_7',
       '33_8', '33_9', '33_10', '34_2', '34_3', '34_4', '34_5', '34_6',
       '34_7', '34_8', '34_9', '37_2', '37_3', '37_4', '37_5', '37_6',
       '37_7', '37_8', '37_9', '37_10', '37_11', '37_12', '40_2', '40_3',
       '40_4', '40_5', '40_6', '40_7', '40_8', '40_9', '40_10', '49_2',
       '49_3', '49_4', '49_5', '49_6', '49_7', '49_8', '49_9', '49_10',
       '50_2', '50_3', '50_4', '50_5', '50_6', '50_7', '50_8', '50_9',
       '50_10', '54_2', '54_3', '54_4', '54_5', '54_6', '54_7', '54_8',
       '54_9', '56_2', '56_3', '56_4', '56_5', '56_6', '56_7', '56_8',
       '58_2', '58_3', '58_4', '58_5', '58_6', '58_7', '58_8', '59_2',
       '59_3', '59_4', '59_5', '59_6', '59_7', '59_8', '61_2', '61_3',
       '61_4', '61_5', '61_6', '61_7', '61_8', '61_9', '67_2', '67_3

In [26]:
# Split the predictions by predicted class. Each entry is
# (row index, utterance id, predicted label).
test_results_SE = [(i, a, b) for i, a, b in zip(test_index, qids, y_pred) if b == 1]
print(len(test_results_SE))
print(test_results_SE)
test_results_nonSE = [(i, a, b) for i, a, b in zip(test_index, qids, y_pred) if b == 0]
print(len(test_results_nonSE))
print(test_results_nonSE)

# Every prediction must land in exactly one of the two lists. Comparing against
# len(y_pred) instead of a hard-coded count also catches a zip() that was cut
# short by mismatched input lengths, and predictions other than 0 / 1.
assert len(test_results_SE) + len(test_results_nonSE) == len(y_pred)

# Utterance ids predicted as SE.
queries_class_SE = [a for i, a, b in test_results_SE]
print(queries_class_SE)

46
[(2, '31_3', 1), (11, '32_3', 1), (12, '32_4', 1), (15, '32_7', 1), (28, '33_9', 1), (43, '37_5', 1), (44, '37_6', 1), (46, '37_8', 1), (54, '40_4', 1), (56, '40_6', 1), (57, '40_7', 1), (64, '49_4', 1), (73, '50_3', 1), (75, '50_5', 1), (77, '50_7', 1), (82, '54_2', 1), (83, '54_3', 1), (85, '54_5', 1), (86, '54_6', 1), (95, '56_6', 1), (96, '56_7', 1), (100, '58_3', 1), (103, '58_6', 1), (104, '58_7', 1), (110, '59_5', 1), (112, '59_7', 1), (117, '61_4', 1), (118, '61_5', 1), (120, '61_7', 1), (124, '67_2', 1), (126, '67_4', 1), (132, '67_10', 1), (135, '68_2', 1), (137, '68_4', 1), (139, '68_6', 1), (140, '68_7', 1), (143, '68_10', 1), (149, '69_5', 1), (163, '75_9', 1), (168, '77_4', 1), (169, '77_5', 1), (172, '77_8', 1), (174, '77_10', 1), (181, '78_7', 1), (186, '79_2', 1), (188, '79_4', 1)]
128
[(1, '31_2', 0), (3, '31_4', 0), (4, '31_5', 0), (5, '31_6', 0), (6, '31_7', 0), (7, '31_8', 0), (8, '31_9', 0), (10, '32_2', 0), (13, '32_5', 0), (14, '32_6', 0), (16, '32_8', 0), (1

## Write the results of step 1 to a file for INDRI

In [28]:
# One (row index, utterance id, predicted label) triple per classified test utterance.
test_results_step1 = list(zip(test_index, qids, y_pred))
test_results_step1

[(1, '31_2', 0),
 (2, '31_3', 1),
 (3, '31_4', 0),
 (4, '31_5', 0),
 (5, '31_6', 0),
 (6, '31_7', 0),
 (7, '31_8', 0),
 (8, '31_9', 0),
 (10, '32_2', 0),
 (11, '32_3', 1),
 (12, '32_4', 1),
 (13, '32_5', 0),
 (14, '32_6', 0),
 (15, '32_7', 1),
 (16, '32_8', 0),
 (17, '32_9', 0),
 (18, '32_10', 0),
 (19, '32_11', 0),
 (21, '33_2', 0),
 (22, '33_3', 0),
 (23, '33_4', 0),
 (24, '33_5', 0),
 (25, '33_6', 0),
 (26, '33_7', 0),
 (27, '33_8', 0),
 (28, '33_9', 1),
 (29, '33_10', 0),
 (31, '34_2', 0),
 (32, '34_3', 0),
 (33, '34_4', 0),
 (34, '34_5', 0),
 (35, '34_6', 0),
 (36, '34_7', 0),
 (37, '34_8', 0),
 (38, '34_9', 0),
 (40, '37_2', 0),
 (41, '37_3', 0),
 (42, '37_4', 0),
 (43, '37_5', 1),
 (44, '37_6', 1),
 (45, '37_7', 0),
 (46, '37_8', 1),
 (47, '37_9', 0),
 (48, '37_10', 0),
 (49, '37_11', 0),
 (50, '37_12', 0),
 (52, '40_2', 0),
 (53, '40_3', 0),
 (54, '40_4', 1),
 (55, '40_5', 0),
 (56, '40_6', 1),
 (57, '40_7', 1),
 (58, '40_8', 0),
 (59, '40_9', 0),
 (60, '40_10', 0),
 (62, '49_2

In [30]:
# Print one "<utterance id>\t<label>" line per test utterance.
# First-turn utterances were not classified; a "<conversation>_1\tSE" line is
# emitted the first time each conversation appears, so it is produced even when
# a conversation's second utterance is missing from the results.
seen_conversations = set()
for _, conv_utt_id, predicted in test_results_step1:
    conv_id = str(conv_utt_id).split("_")[0]
    if conv_id not in seen_conversations:
        seen_conversations.add(conv_id)
        print(conv_id + "_1" + "\tSE")  # first utterance is always an SE
    label_expanded = "SE" if predicted == 1 else "noSE"
    print(str(conv_utt_id) + "\t" + label_expanded)

31_1	SE
31_2	noSE
31_3	SE
31_4	noSE
31_5	noSE
31_6	noSE
31_7	noSE
31_8	noSE
31_9	noSE
32_1	SE
32_2	noSE
32_3	SE
32_4	SE
32_5	noSE
32_6	noSE
32_7	SE
32_8	noSE
32_9	noSE
32_10	noSE
32_11	noSE
33_1	SE
33_2	noSE
33_3	noSE
33_4	noSE
33_5	noSE
33_6	noSE
33_7	noSE
33_8	noSE
33_9	SE
33_10	noSE
34_1	SE
34_2	noSE
34_3	noSE
34_4	noSE
34_5	noSE
34_6	noSE
34_7	noSE
34_8	noSE
34_9	noSE
37_1	SE
37_2	noSE
37_3	noSE
37_4	noSE
37_5	SE
37_6	SE
37_7	noSE
37_8	SE
37_9	noSE
37_10	noSE
37_11	noSE
37_12	noSE
40_1	SE
40_2	noSE
40_3	noSE
40_4	SE
40_5	noSE
40_6	SE
40_7	SE
40_8	noSE
40_9	noSE
40_10	noSE
49_1	SE
49_2	noSE
49_3	noSE
49_4	SE
49_5	noSE
49_6	noSE
49_7	noSE
49_8	noSE
49_9	noSE
49_10	noSE
50_1	SE
50_2	noSE
50_3	SE
50_4	noSE
50_5	SE
50_6	noSE
50_7	SE
50_8	noSE
50_9	noSE
50_10	noSE
54_1	SE
54_2	SE
54_3	SE
54_4	noSE
54_5	SE
54_6	SE
54_7	noSE
54_8	noSE
54_9	noSE
56_1	SE
56_2	noSE
56_3	noSE
56_4	noSE
56_5	noSE
56_6	SE
56_7	SE
56_8	noSE
58_1	SE
58_2	noSE
58_3	SE
58_4	noSE
58_5	noSE
58_6	SE
58_7	SE
58_8	noSE
5