In [1]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle

# Load data files

In [2]:
# Labelled utterance tables for the train and test splits.
# NOTE(review): read_pickle can execute arbitrary code -- only load trusted files.
train_df, test_df = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "../data/datasets/train_df.pkl",
        "../data/datasets/test_df.pkl",
    )
)

In [3]:
# Pre-computed feature tables for the train and test splits.
# NOTE(review): presumably row-aligned with train_df / test_df -- confirm.
train_features, test_features = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "../data/gbdt_features/train_features_step2_all_feat_isolation.pkl",
        "../data/gbdt_features/test_features_step2_all_feat_isolation.pkl",
    )
)

In [4]:
# Sanity check: one shape per line, utterance tables first, then feature tables.
print(
    train_df.shape,
    test_df.shape,
    train_features.shape,
    test_features.shape,
    sep="\n",
)

(1564, 3)
(194, 3)
(1564, 64)
(194, 64)


# DROP SE lines

In [5]:
# Collect the index labels of every training row whose label (column 2) is "SE",
# then remove those rows from both the utterance table and the feature table.
# NOTE(review): assumes train_features shares train_df's index -- confirm.
drop_train_lines = train_df.index[train_df[2].eq("SE")].tolist()
print(drop_train_lines)

s2_train_df = train_df.drop(index=drop_train_lines)
s2_train_features = train_features.drop(index=drop_train_lines)

[0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 23, 24, 25, 26, 28, 29, 34, 37, 43, 44, 53, 54, 55, 57, 59, 62, 63, 67, 73, 76, 77, 78, 79, 80, 83, 87, 88, 89, 91, 92, 93, 94, 95, 96, 98, 105, 106, 115, 126, 128, 130, 133, 135, 139, 140, 141, 142, 150, 151, 153, 154, 156, 158, 159, 160, 161, 168, 172, 173, 177, 179, 180, 181, 183, 188, 190, 193, 195, 196, 198, 202, 204, 206, 209, 210, 212, 214, 218, 222, 231, 234, 239, 246, 250, 253, 262, 269, 278, 285, 287, 289, 290, 296, 297, 299, 301, 302, 306, 308, 309, 312, 315, 316, 320, 322, 323, 327, 331, 334, 339, 342, 345, 347, 349, 350, 356, 357, 361, 362, 364, 368, 371, 373, 374, 383, 385, 388, 389, 392, 393, 396, 398, 402, 406, 408, 410, 412, 417, 419, 420, 422, 426, 427, 429, 436, 440, 446, 447, 450, 461, 465, 466, 467, 468, 471, 480, 483, 484, 486, 490, 497, 502, 508, 510, 512, 514, 517, 520, 522, 523, 532, 534, 540, 542, 544, 554, 559, 564, 569, 572, 574, 579, 584, 588, 589, 590, 591, 593, 594, 599, 600, 601, 603, 604, 609, 614, 618, 622, 626, 6

In [6]:
# Same SE filtering for the evaluation split: list the SE row labels, report how
# many there are, and drop them from the utterance and feature tables.
# NOTE(review): assumes test_features shares test_df's index -- confirm.
drop_test_lines = test_df.index[test_df[2].eq("SE")].tolist()
print(drop_test_lines, len(drop_test_lines), sep="\n")

s2_test_df = test_df.drop(index=drop_test_lines)
s2_test_features = test_features.drop(index=drop_test_lines)
print(s2_test_df)

[0, 2, 5, 9, 10, 11, 14, 15, 20, 29, 30, 33, 39, 44, 47, 51, 54, 56, 57, 59, 61, 64, 71, 77, 81, 82, 83, 84, 90, 96, 98, 104, 106, 110, 112, 113, 114, 119, 120, 123, 124, 127, 132, 134, 137, 138, 140, 143, 145, 149, 155, 156, 165, 166, 169, 170, 172, 174, 175, 177, 178, 181, 183, 185, 186, 188, 190, 193]
68
         0                                        1   2
1     31_2                         Is it treatable?  FT
3     31_4                   What are its symptoms?  PT
4     31_5             Can it spread to the throat?  PT
6     31_7            What is the first sign of it?  PT
7     31_8     Is it the same as esophageal cancer?  PT
..     ...                                      ...  ..
184  78_10        What is the best for weight loss?  PT
187   79_3    What is the role of positivism in it?  FT
189   79_5        How is his work related to Comte?  PT
191   79_7              What is its main criticism?  PT
192   79_8  How does it compare to conflict theory?  PT

[126 rows x 3 colu

In [7]:
# Feature-table shapes before and after removing SE rows: train pair, then test pair.
print(
    train_features.shape,     # incl. SE
    s2_train_features.shape,  # without SE
    test_features.shape,      # incl. SE
    s2_test_features.shape,   # without SE
    sep="\n",
)

(1564, 64)
(1096, 64)
(194, 64)
(126, 64)


# Train classifier

In [8]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [21]:
# clean the df
# Model inputs are every feature column except the first two.
# NOTE(review): the two skipped columns are not shown in this notebook --
# presumably identifier/text columns; confirm.
cols = train_features.columns[2:]
print(cols)

# Binary target encoding shared by both splits: FT -> 1, PT -> 0.
LABEL_TO_TARGET = {'FT': 1, 'PT': 0}

# train
# Features and labels are paired purely by row position once converted to numpy,
# so the two frames must list the same rows in the same order.
assert s2_train_features.index.equals(s2_train_df.index), \
    "train features and labels are not row-aligned"
X_train_df = s2_train_features[cols]
X_train = X_train_df.to_numpy()

train_labels = s2_train_df[2].map(LABEL_TO_TARGET)
# Series.map leaves NaN for any label missing from the mapping; fail loudly
# instead of silently training on NaN targets.
assert train_labels.notna().all(), "unexpected label in the training split"
y_train_df = train_labels
y_train = train_labels.to_numpy()
print("TRAIN positive examples {} FT in total examples {}".format(sum(y_train), len(y_train)))

# test WITH ALL UTT
assert s2_test_features.index.equals(s2_test_df.index), \
    "test features and labels are not row-aligned"
X_test_df = s2_test_features[cols]
X_test = X_test_df.to_numpy()

test_labels = s2_test_df[2].map(LABEL_TO_TARGET)
assert test_labels.notna().all(), "unexpected label in the test split"
y_test_df = test_labels
y_test = test_labels.to_numpy()
print("TEST positive examples {} FT in total examples {} ".format(sum(y_test), len(y_test)))

Index(['utt_len', 'num_tokens', 'complete_sent', 'question_mark', 'ner',
       'ner_b', 'ner_tm_0', 'ner_tm_1', 'ner_tm_b', 'noun', 'noun_b', 'adj',
       'adj_b', 'adj_comp', 'adj_comp_b', 'adv', 'adv_b', 'adv_comp',
       'adv_comp_b', 'pron', 'pron_b', 'pron_3rd', 'pron_3rd_b', 'cue_ph',
       'cue_ph_b', 'cue_kw', 'cue_kw_b', 'cue_ex', 'cue_ex_b', 'cue_comp',
       'cue_comp_b', 'question', 'question_b', 'question_ph', 'question_ph_b',
       'what', 'where', 'when', 'who', 'why', 'which', 'how', 'how_much',
       'how_many', 'how_long', 'what_is', 'what_is_2', 'what_is_3',
       'tell_me_question', 'n_chunks', 'question_ph_2', 'question_ph_2_b',
       'ques_mark_it', 'turn', 'cosine_first', 'cosine_prev', 'is_next_first',
       'is_next_prev', 'nc_cosine_first', 'nc_cosine_prev', 'is_next_of_SE',
       'dist_last_SE'],
      dtype='object')
TRAIN positive examples 857 FT in total examples 1096
TEST positive examples 69 FT in total examples 126 


In [22]:
# Wrap both splits as LightGBM Dataset objects; the evaluation set is created
# with the training set as its reference.
# NOTE(review): neither object is used by the cells visible here, which train
# through the scikit-learn style wrapper -- confirm before removing.
lgb_train = lgb.Dataset(data=X_train_df, label=y_train_df)
lgb_eval = lgb.Dataset(data=X_test_df, label=y_test_df, reference=lgb_train)

In [23]:
# Configure the gradient-boosted tree classifier. Nothing is trained in this
# cell; fitting happens when .fit() is called on the estimator.
print('Starting training...')
lgb_estimator = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    n_estimators=1500,
    learning_rate=0.01,
    num_leaves=64,
    min_child_samples=5,
    random_state=42,
)

Starting training...


In [24]:
# Fit on the SE-free training split; the value returned by fit() is what the
# cell displays.
lgb_estimator.fit(X=X_train, y=y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.01, max_depth=-1,
               min_child_samples=5, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1500, n_jobs=-1, num_leaves=64, objective='binary',
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [25]:
# Despite its name, this flag controls whether the trained model is written to
# disk; leave it False to skip saving.
print_model = False

if print_model:
    print('Saving model...')
    # Save the underlying booster of the fitted estimator to file.
    trained_booster = lgb_estimator.booster_
    trained_booster.save_model('../data/gbdt_models/Step2_lightGBM.txt')

# Grid search 

In [26]:
from sklearn.model_selection import GridSearchCV

# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', each indexable by candidate position. The rank entry
        must support element-wise ``== int`` (e.g. a numpy array).
    n_top : int, default 3
        Ranks 1..n_top are visited in order; every candidate tied at a rank
        is printed, so more than ``n_top`` entries may appear.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    for rank in range(1, n_top + 1):
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            chosen_params = results['params'][pos]
            print("Parameters: {0}".format(chosen_params))
            print("")

            
# Exhaustive grid: every combination of the values below is one candidate.
param_grid = {
    'num_leaves': [32, 64, 128],
    'n_estimators': [1250, 1500, 1750, 2000, 2500],
    'min_child_samples': [2, 5, 10],
}

# The search is switched off by default; set the flag to True to run it.
run_grid_search = False
if run_grid_search:
    # 10-fold cross-validated search starting from the configured estimator
    grid_search = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid, cv=10)
    start = time()
    grid_search.fit(X_train, y_train)

    n_candidates = len(grid_search.cv_results_['params'])
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, n_candidates))
    report(grid_search.cv_results_)

# Evaluation

In [27]:
# Predict on the SE-free evaluation features with the fitted estimator.
print('Starting predicting...')
y_pred = lgb_estimator.predict(X=X_test)

Starting predicting...


In [28]:
# Coerce every prediction to a plain Python int (round first, then int); the
# result replaces y_pred as a list.
y_pred = list(map(lambda pred: int(round(pred)), y_pred))

In [29]:
from sklearn.metrics import mean_squared_error

# eval: root of the mean squared error between the 0/1 labels and the 0/1
# predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('The rmse of prediction is:', mse ** 0.5)

The rmse of prediction is: 0.5194624816493197


In [30]:
from sklearn.metrics import accuracy_score
# Fraction of evaluation utterances whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred, normalize=True)
print("Accuracy : ", accuracy)

Accuracy :  0.7301587301587301


In [38]:
from sklearn.metrics import confusion_matrix 

# Confusion matrix of true labels against predicted labels for the evaluation split.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_mat)

[[36 21]
 [13 56]]


In [39]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix followed by the per-class precision / recall / F1 table.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[36 21]
 [13 56]]
              precision    recall  f1-score   support

           0       0.73      0.63      0.68        57
           1       0.73      0.81      0.77        69

    accuracy                           0.73       126
   macro avg       0.73      0.72      0.72       126
weighted avg       0.73      0.73      0.73       126



In [40]:
# Row labels of the evaluation utterances, in the same order as y_test / y_pred.
test_index = list(s2_test_df.index)

# Keep (true label, predicted label, row label) for every disagreement.
res = [
    triple
    for triple in zip(y_test, y_pred, test_index)
    if triple[0] != triple[1]
]
print(type(lgb_estimator))
print("Missclassification ", len(res), "/", len(y_pred))
# Every entry of res is an error, so summing the predicted labels counts the
# errors where the prediction was 1 while the true label was 0.
print("False positives ", sum(pred for (_, pred, _) in res))

<class 'lightgbm.sklearn.LGBMClassifier'>
Missclassification  34 / 126
False positives  21


In [41]:
# For each misclassified example, show its (true, predicted, row label) triple
# next to columns 0 and 1 of the matching utterance row.
print("Misclassification cases -> go back to utterances")
print("y_test, y_pred, test_index (up to 194)")
for item in res:
    row_label = item[2]
    print(item, s2_test_df.loc[row_label, 0], s2_test_df.loc[row_label, 1])

Misclassification cases -> go back to utterances
y_test, y_pred, test_index (up to 194)
(0, 1, 4) 31_5 Can it spread to the throat?
(1, 0, 13) 32_5 What's the biggest ever caught?
(1, 0, 22) 33_3 How was it received?
(1, 0, 23) 33_4 Did it win any awards?
(0, 1, 25) 33_6 Who was the author and when what it published?
(1, 0, 36) 34_7 What about environmental factors?
(1, 0, 37) 34_8 What empires survived?
(1, 0, 38) 34_9 What came after it?
(0, 1, 46) 37_8 What were the similarities and differences between the studies?
(0, 1, 55) 40_5 How has it been integrated into music education?
(0, 1, 60) 40_10 What are its roots and what influenced it?
(1, 0, 65) 49_5 What are its other competitors?
(1, 0, 69) 49_9 How has it impacted society?
(1, 0, 70) 49_10 How about dating and relationships?
(0, 1, 76) 50_6 Why did it create tension with the US?
(0, 1, 97) 56_8 What is the relationship to speciation?
(1, 0, 108) 59_3 What is the ACL?
(0, 1, 109) 59_4 What is an injury for it?
(0, 1, 121) 61_8 

In [42]:
from operator import itemgetter
# Pair each importance value with its feature name and list the pairs from most
# to least important; ties keep their original column order (stable sort).
feat_importance = sorted(
    zip(lgb_estimator.feature_importances_, cols),
    key=itemgetter(0),
    reverse=True,
)
feat_importance

[(13954, 'cosine_prev'),
 (12297, 'is_next_prev'),
 (11379, 'is_next_first'),
 (9255, 'cosine_first'),
 (8511, 'utt_len'),
 (5284, 'turn'),
 (3359, 'dist_last_SE'),
 (3075, 'nc_cosine_prev'),
 (2940, 'ner_tm_1'),
 (2870, 'num_tokens'),
 (2091, 'ner_tm_0'),
 (1973, 'nc_cosine_first'),
 (1688, 'noun'),
 (1669, 'n_chunks'),
 (1516, 'adj_b'),
 (1384, 'adv'),
 (1105, 'pron'),
 (1088, 'complete_sent'),
 (965, 'question_b'),
 (924, 'ner'),
 (887, 'question'),
 (628, 'pron_3rd'),
 (549, 'what_is'),
 (542, 'is_next_of_SE'),
 (373, 'how'),
 (364, 'tell_me_question'),
 (360, 'where'),
 (356, 'when'),
 (343, 'question_ph'),
 (323, 'which'),
 (305, 'who'),
 (281, 'cue_ex'),
 (272, 'what_is_3'),
 (204, 'question_ph_2'),
 (190, 'how_much'),
 (182, 'why'),
 (176, 'adj_comp'),
 (158, 'cue_ph'),
 (132, 'ques_mark_it'),
 (107, 'cue_comp'),
 (105, 'what_is_2'),
 (69, 'noun_b'),
 (67, 'question_mark'),
 (67, 'adv_comp'),
 (42, 'how_many'),
 (37, 'ner_tm_b'),
 (22, 'pron_b'),
 (15, 'pron_3rd_b'),
 (13, 'adv