In [20]:
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd, numpy as np
import pickle
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from clean import clean
from augment import augment
from trial import Trial

################################################################################
# Begin global state
################################################################################

facts = pd.read_csv("rst_transitions.tab", sep="\t", quoting=3)

# some categories of columns
lex_feats = ["Top2-Stack", "Top1Span", "First-Queue"]
categorical_features = ['Top12-StackXML', 'Stack-QueueSType', "Stack", "genre", "Stack-QueueSameSent",
                        "Top12-StackSameSent",
                        'Top12-StackSameSent', 'Stack-QueueXML', 'Top12-StackSType',
                        "Top12-StackDir", "Stack-QueueDir", "First-QueueEduFunc", "Top1SpanEduFunc"]
numeric_features = ['First-QueueDist-To-Begin', 'Top2-StackLength-EDU', 'Top1-StackLength-EDU'] #'Top1-StacknEDUs']
scale_features = ['Top2-StackDist-To-End', 'First-Queue-Len']
text_features = ['First-Queue', 'Top1Span', 'Top2-Stack']

def load_bc():
    with open('bc3200.pickle', 'rb') as f:
        bc3200 = pickle.load(f)   
    # treat something like '10101111' as a binary integer encoding
    s2n = lambda s: sum([int(s[-i]) * (1 << i) for i in range(len(s))])
    
    converted = [(k, s2n(v)) for k,v in bc3200.items()]
    # sort by bc encoding
    return dict(sorted(converted, key=lambda x:x[1]))
bc_dict = load_bc()

# clean and augment data
data = facts.copy(deep=True)
data = clean(data)
data = augment(data)
data = data.sample(frac=1, random_state=42)
def split(data):
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_idx, test_idx in splitter.split(data, data["label"]):
        train = data.loc[train_idx]
        test = data.loc[test_idx]
    return train, test
train, test = split(data)
print(len(train))
train.head()

22702


Unnamed: 0,First-Queue,First-QueueDist-To-Begin,First-QueueEduFunc,First-QueueFullFunc,First-QueueFullPos,Queue,SeqPredFirstSpan,SeqPredTop1Span,Stack,Stack-QueueDir,...,Top1SpanEduFunc,Top1SpanFullFunc,Top1SpanFullPos,Top2-Stack,Top2-StackDist-To-Begin,Top2-StackDist-To-End,Top2-StackLength-EDU,genre,label,First-Queue-Len
19473,# People used to go to banks with gold and rec...,103.0,root,# nsubj root mark xcomp case obl case obl cc c...,# NNS VBD TO VB TO NNS IN NN CC VB `` NN '' IN...,NonEmpty,elabo,attri,MoreElem,NONE_NONE,...,ccomp,# nsubj ccomp punct #,# PRP VBZ . #,# the gov't says #,101,68.0,1.0,reddit,attribution-SN,71
4942,# were not only marked by her experience #,94.0,root,# aux:pass advmod advmod root case nmod:poss o...,# VBD RB RB VBN IN PRP$ NN #,NonEmpty,same,elabo,MoreElem,NONE,...,nsubj:pass,# det nsubj:pass case nmod:poss amod nmod #,# DT NNS IN PRP$ JJ NN #,"# In 1893 , Higuchi , her mother and her siste...",79,33.0,13.0,bio,Shift,42
18876,"# CMV , #",60.0,root,# root punct #,"# VB , #",NonEmpty,elabo,elabo,MoreElem,NONE_NONE,...,root,# det nsubj cop det advmod root punct #,# DT NN VBZ DT JJS CD . #,# This is n’t difficult or a big deal . #,57,74.0,1.0,reddit,joint-NN,9
9894,# My mother 's more particular -- #,42.0,root,# nmod:poss nsubj cop advmod root punct #,# PRP$ NN VBZ RBR JJ : #,NonEmpty,elabo,elabo,MoreElem,NONE_NONE,...,parataxis,# punct nmod:poss nsubj aux parataxis obj punc...,"# `` PRP$ NN MD VB PRP , '' #","# "" Why ? "" #",38,67.0,1.0,fiction,Shift,35
6330,# Career #,27.0,root,# root #,# NN #,NonEmpty,elabo,elabo,MoreElem,NONE_NONE,...,dep,# punct dep punct #,# -LSB- CD -RSB- #,# Although he had originally planned to attend...,22,92.0,4.0,bio,evidence-NS,10


In [55]:
identity_transformer = FunctionTransformer(lambda x: x)
def bc_encode_toks(toks):
    toks = toks.split(" ")
    return [str(bc_dict[x]) if x in bc_dict else -1 for x in toks]
bc_transformer = FunctionTransformer(bc_encode_toks)

def make_text_transformer(
    data,
    brown=False
):
    # Store a vocabulary per feature
    vocabs = {}
    transformers = []
    
    for feat in lex_feats:
        cvec = CountVectorizer(
            lowercase=False,
            ngram_range=(1, 2),
            # vocabulary=whitelist,   # You can work with your own whitelist
            max_features=1000,  # Or work with the top 1000 most frequent items, or...
            token_pattern=u"(?u)\\b\\S+\\b",  # Use these settings if you want to keep punctuation
            analyzer="word"
        )
        cvec.fit(data[feat])
        vocabs[feat] = cvec.get_feature_names()
        transformers.append(('count_' + feat, cvec, feat))
        
        if brown:
            cvec = CountVectorizer(
                lowercase=False,
                ngram_range=(1,2),
                max_features=10000,
                token_pattern=u"(?u)\\b\\S+\\b",
                analyzer="word"
            )

            transformers.append(("brown_" + feat, cvec, feat))

    return ColumnTransformer(transformers)

def make_categorical_transformer(categorical_encoding='ordinal'):
    assert categorical_encoding in ['ordinal', 'one_hot']
    categorical_steps = []
    categorical_steps += [('onehot', OneHotEncoder(handle_unknown='ignore'))] if categorical_encoding == 'one_hot' else []
    categorical_steps += [('ordinal', OrdinalEncoder())] if categorical_encoding == 'ordinal' else []
    return Pipeline(steps=categorical_steps)   

def make_transformer(
    data, 
    numeric_transformer=None,
    scale_transformer=None,
    text_transformer=None,
    categorical_transformer=None,
    extra_transformers=[]
):
    
    if scale_transformer is None:
        scale_transformer = StandardScaler()
    if numeric_transformer is None:
        numeric_transformer = identity_transformer
    if text_transformer is None:
        text_transformer = make_text_transformer(data)
    if categorical_transformer is None:
        categorical_transformer = make_categorical_transformer()

    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('sca', scale_transformer, scale_features),
        ('text', text_transformer, text_features),
        ('cat', categorical_transformer, categorical_features)
    ]
    transformers += extra_transformers
    return ColumnTransformer(transformers)

In [8]:
################################################################################
# Trials
################################################################################

# A trial is an object that conceptually means "a model run with a featureset"
# You hand it a ColumnTransformer in its constructor, and in return, it will:
# - evaluate on test for you
# - store the model in trial.model
# - store the preds in trial.preds
# - store the transformer in trial.transformer
# and more!

class XGBTrial(Trial):
    def __init__(self, transformer, use_test=False, **kwargs):
        self.method = "decision_function"
        super().__init__(**kwargs)

        eval_rows = test if use_test else train

        X = transformer.fit_transform(train)
        y = train["label"]

        model = XGBClassifier(
            nthread=-1
        )
        model.fit(X, y)

        # predict
        X_eval = transformer.transform(eval_rows)
        preds = model.predict(X_eval)

        # hold on to refs in case we want them later
        self.X = X
        self.y = y
        self.model = model
        self.preds = preds
        self.transformer = transformer

        # populate score attributes
        self._perf(eval_rows["label"], preds)

In [29]:
def get_column_names_from_ColumnTransformer(column_transformer):
    col_name = []
    for transformer_in_columns in column_transformer.transformers_[
                                  :-1]:  # the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1], Pipeline):
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError:  # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names, np.ndarray):  # eg.
            col_name += names.tolist()
        elif isinstance(names, list):
            col_name += names
        elif isinstance(names, str):
            col_name.append(names)
    return col_name


# Default settings
oh_transformer = make_transformer(train)
ord_transformer = make_transformer(train)

print("Beginning XGB fitting...")
xgb_trial = XGBTrial(ord_transformer, use_test=True)
print(xgb_trial)
print("Fitting done. Predicting...")
#print(classification_report(test["label"], xgb_trial.preds))
#names = get_column_names_from_ColumnTransformer(ord_transformer)
#print(names[:50])

Beginning XGB fitting...




XGBTrial:
  accuracy: 0.6663143058491896
  micro_recall: 0.6663143058491896
  macro_recall: 0.2506690953703519
  micro_f1: 0.6663143058491896
  macro_f1: 0.2782958144415433

Fitting done. Predicting...
                 precision    recall  f1-score   support

          Shift       0.74      0.94      0.83      2861
  antithesis-NS       0.38      0.11      0.17        28
  antithesis-SN       0.33      0.04      0.07        27
 attribution-NS       0.79      0.67      0.72        33
 attribution-SN       0.68      0.67      0.68       147
  background-NS       0.00      0.00      0.00        38
  background-SN       0.00      0.00      0.00        73
       cause-NS       0.73      0.19      0.30        42
       cause-SN       0.00      0.00      0.00        33
circumstance-NS       0.58      0.45      0.51        78
circumstance-SN       0.67      0.19      0.29        64
  concession-NS       0.62      0.10      0.18        49
  concession-SN       0.47      0.18      0.26        44

  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [56]:
bc_transformer = make_transformer(
    train,
    text_transformer=make_text_transformer(train, brown=True)
)
xgb_with_bc = XGBTrial(bc_transformer, use_test=True)

  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [57]:
print(xgb_trial, name='standard')
print(xgb_with_bc, name='standard with brown cluster')

XGBTrial:
  accuracy: 0.6663143058491896
  micro_recall: 0.6663143058491896
  macro_recall: 0.2506690953703519
  micro_f1: 0.6663143058491896
  macro_f1: 0.2782958144415433

XGBTrial:
  accuracy: 0.6664904862579282
  micro_recall: 0.6664904862579282
  macro_recall: 0.2275469805498872
  micro_f1: 0.6664904862579282
  macro_f1: 0.2561203143017332

