In [None]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
import pickle

# Load data from Step 1

In [None]:
# Train/test frames pickled by Step 1; read in train-then-test order.
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/datasets/train_df.pkl",
        "../data/datasets/test_df.pkl",
    )
)

In [None]:
# Step 1 feature tables for both splits; read in train-then-test order.
train_features, test_features = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/gbdt_features/train_features_step1.pkl",
        "../data/gbdt_features/test_features_step1.pkl",
    )
)

In [None]:
# Sanity check: one (rows, columns) tuple per line, in load order.
print(
    *(frame.shape for frame in (train_df, test_df, train_features, test_features)),
    sep="\n",
)

# Generate UTT features - STEP 2

In [None]:
import sys
sys.path.append("..")
from conversationalir.uttclassification.conversation_features import utterance_cosine_similarity_first, utterance_cosine_similarity_previous, is_next_sentence_to_first_neural, is_next_sentence_to_previous_neural, compute_dist_last_SE_train, compute_is_next_of_SE_train, compute_is_next_of_SE_test, compute_dist_last_SE_test, noun_chunks_cosine_similarity_first, noun_chunks_cosine_similarity_previous 

In [None]:
def uttFeatures2(df, feature_df):
    """Append the turn number and four utterance-similarity columns.

    The frame is modified in place and the same object is returned.

    Args:
        df: Conversation data handed unchanged to each feature helper.
        feature_df: Feature table. Column ``0`` must hold string ids whose
            second ``"_"``-separated token is an integer; that token becomes
            the ``"turn"`` column.

    Returns:
        ``feature_df`` with the added columns ``"turn"``, ``"cosine_first"``,
        ``"cosine_prev"``, ``"is_next_first"`` and ``"is_next_prev"``.
    """
    feature_df["turn"] = feature_df[0].str.split("_").str[1].astype(int)

    # (output column, helper) pairs, evaluated in this order.
    # Each helper result is used as the lookup for Series.map on column 0
    # (presumably a dict keyed by the ids in that column - TODO confirm).
    column_builders = (
        ("cosine_first", utterance_cosine_similarity_first),
        ("cosine_prev", utterance_cosine_similarity_previous),
        ("is_next_first", is_next_sentence_to_first_neural),
        ("is_next_prev", is_next_sentence_to_previous_neural),
    )
    for column_name, build_lookup in column_builders:
        lookup = build_lookup(df)
        feature_df[column_name] = feature_df[0].map(lookup)

    return feature_df

In [None]:
# Add "turn" and the four utterance-similarity columns to the train features.
# uttFeatures2 writes into the frame it receives and returns that same frame.
train_features = uttFeatures2(train_df, train_features)

In [None]:
# Same feature set as for train, computed from the test conversations.
test_features = uttFeatures2(test_df, test_features)

In [None]:
def uttFeatures4(df, feature_df):
    """Append the two noun-chunk cosine-similarity columns.

    The frame is modified in place and the same object is returned.

    Args:
        df: Conversation data handed unchanged to each helper.
        feature_df: Feature table; column ``0`` holds the ids used to look up
            each helper's result. It is also passed to the helpers themselves.

    Returns:
        ``feature_df`` with ``"nc_cosine_first"`` and ``"nc_cosine_prev"``
        added.
    """
    # Order matters: the second helper is called after the first column has
    # already been written, so it receives a frame that contains it.
    column_builders = (
        ("nc_cosine_first", noun_chunks_cosine_similarity_first),
        ("nc_cosine_prev", noun_chunks_cosine_similarity_previous),
    )
    for column_name, build_lookup in column_builders:
        lookup = build_lookup(df, feature_df)
        feature_df[column_name] = feature_df[0].map(lookup)

    return feature_df

In [None]:
# Add the noun-chunk cosine columns ("nc_cosine_first", "nc_cosine_prev") to the train features.
train_features = uttFeatures4(train_df, train_features)

In [None]:
# Same noun-chunk cosine columns, computed from the test conversations.
test_features = uttFeatures4(test_df, test_features)

In [None]:
def uttFeatures3_train(train_df, feature_df):
    """Append the ``is_next_of_SE`` and ``dist_last_SE`` columns.

    Uses the ``*_train`` helpers, which receive only the conversation frame
    (no Step 1 predictions). The frame is modified in place and the same
    object is returned.

    Args:
        train_df: Conversation data handed unchanged to each helper.
        feature_df: Feature table; column ``0`` holds the ids used to look up
            each helper's result.

    Returns:
        ``feature_df`` with the two columns added.
    """
    column_builders = (
        ("is_next_of_SE", compute_is_next_of_SE_train),
        ("dist_last_SE", compute_dist_last_SE_train),
    )
    for column_name, build_lookup in column_builders:
        lookup = build_lookup(train_df)
        feature_df[column_name] = feature_df[0].map(lookup)

    return feature_df

In [None]:
# Add "is_next_of_SE" and "dist_last_SE" to the train features via the *_train helpers.
train_features = uttFeatures3_train(train_df, train_features)

In [None]:
# "Isolation" variant for the test set: skip the features that depend on Step 1
# predictions and reuse the *_train helpers, which take no y_pred argument.
test_features = uttFeatures3_train(test_df, test_features)

In [None]:
# Shapes after all Step 2 columns were added: train first, then test.
print(*(frame.shape for frame in (train_features, test_features)), sep="\n")

In [None]:
# Opt-in export: set print_files to True to write the "all features" pickles.
print_files = False
if print_files:
    for frame, destination in (
        (train_features, "../data/gbdt_features/train_features_step2_all_feat_isolation.pkl"),
        (test_features, "../data/gbdt_features/test_features_step2_all_feat_isolation.pkl"),
    ):
        frame.to_pickle(destination)

## Remove Step 1 features (use only conversation features for Step 2)

In [None]:
# Drop the columns at positions 2..54 (the Step 1 features, per the section heading).
# The labels to drop are read from train_features for BOTH frames, so the test
# frame loses the same-named columns regardless of its own column order.
train_feat_only_conv = train_features.drop(columns=train_features.columns[2:55])
test_feat_only_conv = test_features.drop(columns=train_features.columns[2:55])

In [None]:
# Shapes of the conversation-only tables: train first, then test.
print(*(frame.shape for frame in (train_feat_only_conv, test_feat_only_conv)), sep="\n")

In [None]:
# Opt-in export: set print_files to True to write the conversation-only pickles.
print_files = False
if print_files:
    for frame, destination in (
        (train_feat_only_conv, "../data/gbdt_features/train_features_step2_conv_feat_isolation.pkl"),
        (test_feat_only_conv, "../data/gbdt_features/test_features_step2_conv_feat_isolation.pkl"),
    ):
        frame.to_pickle(destination)

# Features that depend on Step 1 (test set only)

In [None]:
def uttFeatures3_test(df, feature_df, y_pred):
    """Append ``is_next_of_SE`` and ``dist_last_SE`` using Step 1 predictions.

    Intended for the TEST set. Author's note: Step 1 is assumed to provide
    only 174 judgements, which is why predictions are used here.

    The frame is modified in place and the same object is returned.

    Args:
        df: Conversation data handed to the ``*_test`` helpers. The helpers
            now receive this argument; previously the notebook-level global
            ``test_df`` was used and ``df`` was ignored.
        feature_df: Feature table; column ``0`` holds the ids used to look up
            each helper's result.
        y_pred: Step 1 predictions, passed through unchanged to both helpers.

    Returns:
        ``feature_df`` with the two columns added (or overwritten if they
        already exist).
    """
    is_next_of_SE = compute_is_next_of_SE_test(df, y_pred)
    feature_df["is_next_of_SE"] = feature_df[0].map(is_next_of_SE)

    dist_last_SE = compute_dist_last_SE_test(df, y_pred)
    feature_df["dist_last_SE"] = feature_df[0].map(dist_last_SE)

    return feature_df

## When Step 1 is LightGBM

In [None]:
# Reload the "all features, isolation" test table from disk.
# NOTE(review): this file is only written by the earlier export cell when
# print_files is True, so on a fresh checkout this read fails unless the
# export has been run once.
test_features= pd.read_pickle("../data/gbdt_features/test_features_step2_all_feat_isolation.pkl")

In [None]:
# Step 1 LightGBM predictions stored as a NumPy array
# (presumably one prediction per test utterance - TODO confirm).
y_pred_S1_lightGBM = np.load("../data/gbdt_models/step_1_y_pred_lightGBM.npy")

In [None]:
# Recompute "is_next_of_SE" / "dist_last_SE" from the Step 1 predictions.
# uttFeatures3_test assigns the columns on the frame it is given and returns
# that same frame, so test_features_cascade_LightGBM and test_features refer
# to the same object after this cell.
test_features_cascade_LightGBM = uttFeatures3_test(test_df, test_features, y_pred_S1_lightGBM)

In [None]:
# Persist the cascade feature table. Unlike the earlier exports, this write is
# not guarded by print_files and runs every time the cell is executed.
test_features_cascade_LightGBM.to_pickle("../data/gbdt_features/test_features_step2_all_feat_cascade_lightGBM.pkl")