In [1]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
import pickle

# Load data files

In [2]:
# CAST data: tab-separated files read without a header row,
# so the columns are addressed by integer position (0, 1, ...).
trainFile = "../data/datasets/training_set.tsv"
testFile = "../data/datasets/test_set.tsv"

train_df, test_df = (
    pd.read_csv(tsv_path, sep="\t", header=None)
    for tsv_path in (trainFile, testFile)
)

In [3]:
# Start each feature frame from column 0 only (the ids such as "1_1" shown in
# the previews); .copy() keeps later column additions off the raw frames.
train_feat_df = train_df.loc[:, [0]].copy()
test_feat_df = test_df.loc[:, [0]].copy()

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Run the spaCy pipeline over the text in column 1 of each split and keep the
# resulting parse in a "doc" column of the matching feature frame.
for _raw_df, _feat_df in ((train_df, train_feat_df), (test_df, test_feat_df)):
    _feat_df["doc"] = _raw_df[1].apply(nlp)

In [5]:
# Sanity check: first rows of the training feature frame, including the new "doc" column.
train_feat_df.head()

Unnamed: 0,0,doc
0,1_1,"(What, is, a, physician, 's, assistant, ?)"
1,1_2,"(What, are, the, educational, requirements, re..."
2,1_3,"(What, does, it, cost, ?)"
3,1_4,"(What, 's, the, average, starting, salary, in,..."
4,1_5,"(What, about, in, the, US, ?)"


In [6]:
# Sanity check: last rows of the training feature frame (shows the final index and ids).
train_feat_df.tail()

Unnamed: 0,0,doc
1559,395_1,"(Who, created, The, Orville, ?)"
1560,395_2,"(What, network, is, it, on, ?)"
1561,395_3,"(What, 's, the, runtime, of, an, episode, ?)"
1562,395_4,"(What, was, the, airdate, of, the, first, epis..."
1563,395_5,"(What, 's, the, name, of, the, actress, who, i..."


In [7]:
# Sanity check: first rows of the test feature frame, including the new "doc" column.
test_feat_df.head()

Unnamed: 0,0,doc
0,31_1,"(What, is, throat, cancer, ?)"
1,31_2,"(Is, it, treatable, ?)"
2,31_3,"(Tell, me, about, lung, cancer, .)"
3,31_4,"(What, are, its, symptoms, ?)"
4,31_5,"(Can, it, spread, to, the, throat, ?)"


# Generate utterance features - STEP 1

In [8]:
import tagme
# Set the authorization token for subsequent calls.
# The token is a credential, so it is read from the TAGME_GCUBE_TOKEN
# environment variable instead of being committed with the notebook.
# A missing variable raises KeyError here, before any TagMe call is attempted.
# NOTE(review): the token that used to be hardcoded in this cell is in the
# file history and should be treated as leaked (revoke / regenerate it).
tagme.GCUBE_TOKEN = os.environ["TAGME_GCUBE_TOKEN"]

In [10]:
import sys
sys.path.append("..")
from classification.utterance_features import *


def uttFeatures(df, feature_df):
    """Add the step-1 utterance feature columns to ``feature_df`` and return it.

    Args:
        df: frame whose column ``1`` holds the raw utterance text.
        feature_df: frame with a ``"doc"`` column (the spaCy parse of the same
            utterances, as built earlier in this notebook). It is modified in
            place: one column is added per feature.

    Returns:
        The same ``feature_df`` object that was passed in.

    The extractor functions and the keyword / phrase lists used below are not
    defined in this notebook; they are expected to come from the star import
    of ``classification.utterance_features``.
    """
    raw_text = df[1]

    # utterance features: length of the raw text and length of the parsed doc
    feature_df["utt_len"] = raw_text.str.len()
    parsed = feature_df["doc"]
    feature_df["num_tokens"] = parsed.str.len()

    def from_doc(func, **kwargs):
        # Feature computed from the parsed "doc" column.
        return parsed, func, kwargs

    def from_text(func, **kwargs):
        # Feature computed from the raw utterance text.
        return raw_text, func, kwargs

    # Column name -> (input series, extractor, extra keyword arguments).
    # Insertion order is the order in which the columns are added.
    features = {
        "complete_sent": from_doc(complete_sentence),
        "question_mark": from_doc(question_mark),

        # NER
        "ner": from_doc(ner),
        "ner_b": from_doc(ner_binary),

        # NER with TagMe
        "ner_tm_0": from_text(ner_tagme),
        "ner_tm_1": from_text(ner_tagme, threshold=0.1),
        "ner_tm_b": from_text(ner_tagme_binary),

        # nouns
        "noun": from_doc(noun),
        "noun_b": from_doc(noun_binary),

        # adjectives
        "adj": from_doc(adj),
        "adj_b": from_doc(adj_binary),
        "adj_comp": from_doc(adj_comp),
        "adj_comp_b": from_doc(adj_comp_binary),

        # adverbs
        "adv": from_doc(adv),
        "adv_b": from_doc(adv_binary),
        "adv_comp": from_doc(adv_comp),
        "adv_comp_b": from_doc(adv_comp_binary),

        # pronouns
        "pron": from_doc(pron),
        "pron_b": from_doc(pron_binary),
        "pron_3rd": from_doc(pron_3rd),
        "pron_3rd_b": from_doc(pron_3rd_binary),

        # cue phrases, such as "tell me about" "tell me more about" "give me"
        "cue_ph": from_text(cue_phrase, phrases_to_check=cue_phrases),
        "cue_ph_b": from_text(cue_phrase_binary, phrases_to_check=cue_phrases),

        # cue keywords, such as "describe" and example or comparison keywords
        "cue_kw": from_doc(cue_keyword, kw_to_check=cue_kw),
        "cue_kw_b": from_doc(cue_keyword_binary, kw_to_check=cue_kw),
        "cue_ex": from_doc(cue_keyword, kw_to_check=example_kw),
        "cue_ex_b": from_doc(cue_keyword_binary, kw_to_check=example_kw),
        "cue_comp": from_doc(cue_keyword, kw_to_check=comparison_kw),
        "cue_comp_b": from_doc(cue_keyword_binary, kw_to_check=comparison_kw),

        # questions (one word, e.g., "what", "when")
        "question": from_doc(question, kw_to_check=question_kw),
        "question_b": from_doc(question_binary, kw_to_check=question_kw),

        # questions (more than one word, e.g., "how many")
        "question_ph": from_text(question_phrase, phrases_to_check=question_phrases),
        "question_ph_b": from_text(question_phrase_binary, phrases_to_check=question_phrases),

        # one binary flag per single question keyword
        "what": from_doc(question_binary, kw_to_check=["what"]),
        "where": from_doc(question_binary, kw_to_check=["where"]),
        "when": from_doc(question_binary, kw_to_check=["when"]),
        "who": from_doc(question_binary, kw_to_check=["who"]),
        "why": from_doc(question_binary, kw_to_check=["why"]),
        "which": from_doc(question_binary, kw_to_check=["which"]),
        "how": from_doc(question_binary, kw_to_check=["how"]),

        # one binary flag per "how ..." phrase
        "how_much": from_text(question_phrase_binary, phrases_to_check=["how much"]),
        "how_many": from_text(question_phrase_binary, phrases_to_check=["how many"]),
        "how_long": from_text(question_phrase_binary, phrases_to_check=["how long"]),

        # new features
        "what_is": from_doc(what_is_question),
        "what_is_2": from_doc(what_is_question_2),
        "what_is_3": from_doc(what_is_question_3),
        "tell_me_question": from_doc(tell_me_question),
        "n_chunks": from_doc(num_noun_chunks),

        # questions (more than one word, e.g., "how about", "what about")
        "question_ph_2": from_text(question_phrase, phrases_to_check=question_phrases_2),
        "question_ph_2_b": from_text(question_phrase_binary, phrases_to_check=question_phrases_2),

        # ? and it
        "ques_mark_it": from_doc(question_mark_third_person),
    }

    for column, (source, extractor, extra) in features.items():
        feature_df[column] = source.apply(extractor, **extra)

    return feature_df

In [None]:
# uttFeatures adds its columns to the frame it is given and returns that same
# frame, so train_features / test_features are the *_feat_df objects themselves.
train_features, test_features = (
    uttFeatures(raw_split, feat_split)
    for raw_split, feat_split in ((train_df, train_feat_df), (test_df, test_feat_df))
)

In [None]:
# Despite its name, this flag controls whether the step-1 feature frames are
# written to disk as pickles; it is off by default.
print_files = False
if print_files:
    for _features, _out_path in (
        (train_features, "../data/gbdt_features/train_features_step1.pkl"),
        (test_features, "../data/gbdt_features/test_features_step1.pkl"),
    ):
        _features.to_pickle(_out_path)