In [1]:
import json, dill, re, pickle
import pandas as pd
import numpy as np
from scipy import sparse 
from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import SparsePCA, TruncatedSVD, IncrementalPCA, MiniBatchSparsePCA
from sklearn.feature_selection import RFECV
from sklearn.utils import shuffle
import seaborn as sns
from pprint import pprint

import mlflow, mlflow.sklearn
import warnings, time

# for importing from sibling directories
import sys
import os
sys.path.append(os.path.abspath('../02_output'))
sys.path.append(os.path.abspath('../03_data'))

### 0. Load the feature matrices (.npz), labels (.npy) and feature-name files (.json); data integrity checks

In [2]:
##### Environment variables #####
# var_dict maps dataset -> featureset folder name -> [name of X global, name of y global]
var_dict = {}
LANG = 'en'
RUN = 'UD1_Auto_Run2' 
# run names are: (i) PTB_Gold_Run1, (ii) PTB_Auto_Run1, (iii) PTB_Gold_Run2, (iv) PTB_Auto_Run2, (v) UD1_Gold_Run2
# (vi) UD1_Auto_Run2

##### Mapping featuresets to Runs 
# folder name to variable name mappings 
feat_var_mapping = {'PitlerNenkova_Conn': 'PNconn', 'PitlerNenkova_Syn': 'PNsyn', 
                    'PitlerNenkova_ConnSyn':'PNconnsyn', 'PitlerNenkova_SynSyn':'PNsynsyn','Lin_etal': 'Lin', 
                    'Li_etal16': 'Li16'}
if 'Run2' in RUN:
    print('yes: UD2')
    # Run2 runs do not load the syntactic PitlerNenkova featuresets
    for dropped_feat_class in ['PitlerNenkova_ConnSyn', 'PitlerNenkova_SynSyn', 'PitlerNenkova_Syn']:
        feat_var_mapping.pop(dropped_feat_class)

X_filepath = '../02_output/{}/{}/{}/{}_{}.npz' # LANG, RUN, feat_class, 'X', dataset
y_filepath = '../02_output/{}/{}/{}/{}_{}.npy' # LANG, RUN, feat_class, 'y', dataset
featname_filepath = '../02_output/{}/{}/{}/{}_featnames.json' # LANG, RUN, feat_class, dataset

if 'UD1_' in RUN: 
    print('yes: UD1')
    # UD1 runs keep their featuresets in folders carrying a '_UD1' suffix
    X_filepath = '../02_output/{}/{}/{}_UD1/{}_{}.npz' # LANG, RUN, feat_class, 'X', dataset
    y_filepath = '../02_output/{}/{}/{}_UD1/{}_{}.npy' # LANG, RUN, feat_class, 'y', dataset
    featname_filepath = '../02_output/{}/{}/{}_UD1/{}_featnames.json' # LANG, RUN, feat_class, dataset

##### Loading the datasets
# Every matrix / label vector is published as a module-level global named
# 'X_featureset_<short name>_<dataset>' / 'y_featureset_<short name>_<dataset>'; var_dict
# records those names so later cells can look the arrays up by featureset and dataset.
for dataset in ['train', 'dev', 'test']: 
    for feat_class in feat_var_mapping: 
        print(feat_class, dataset)
        var_name = 'featureset_'+feat_var_mapping[feat_class]+'_'+dataset
        globals()['X_'+var_name] \
        = sparse.load_npz(X_filepath.format(LANG, RUN, feat_class, 'X', dataset))
        globals()['y_'+var_name] \
        = np.load(y_filepath.format(LANG, RUN, feat_class, 'y', dataset))

        # setdefault creates the per-dataset dict on first use, without a bare except
        # that would also hide unrelated errors
        var_dict.setdefault(dataset, {})[feat_class] = ['X_'+var_name, 'y_'+var_name]

        # NOTE: the featnames global is keyed by featureset only, so each dataset overwrites
        # the previous one and the names loaded for 'test' are the ones that remain.
        with open(featname_filepath.format(LANG, RUN, feat_class,dataset), 'rb') as f:
            globals()['featnames_'+ feat_var_mapping[feat_class]] = json.load(f)

yes: UD2
yes: UD1
PitlerNenkova_Conn train
Lin_etal train
Li_etal16 train
PitlerNenkova_Conn dev
Lin_etal dev
Li_etal16 dev
PitlerNenkova_Conn test
Lin_etal test
Li_etal16 test


In [3]:
# Show which global variable names hold X and y for every (dataset, featureset) pair
pprint(var_dict)

{'dev': {'Li_etal16': ['X_featureset_Li16_dev', 'y_featureset_Li16_dev'],
         'Lin_etal': ['X_featureset_Lin_dev', 'y_featureset_Lin_dev'],
         'PitlerNenkova_Conn': ['X_featureset_PNconn_dev',
                                'y_featureset_PNconn_dev']},
 'test': {'Li_etal16': ['X_featureset_Li16_test', 'y_featureset_Li16_test'],
          'Lin_etal': ['X_featureset_Lin_test', 'y_featureset_Lin_test'],
          'PitlerNenkova_Conn': ['X_featureset_PNconn_test',
                                 'y_featureset_PNconn_test']},
 'train': {'Li_etal16': ['X_featureset_Li16_train', 'y_featureset_Li16_train'],
           'Lin_etal': ['X_featureset_Lin_train', 'y_featureset_Lin_train'],
           'PitlerNenkova_Conn': ['X_featureset_PNconn_train',
                                  'y_featureset_PNconn_train']}}


In [4]:
# Sanity check: for each featureset, the number of columns (features) must be the same
# in train, dev and test. Prints one "<featureset> <dataset> <shape>" line per matrix.
for split_name, split_entries in var_dict.items():
    print('\n')
    for featureset_name, global_names in split_entries.items():
        X_matrix = globals()[global_names[0]]  # index 0 holds the name of the X global
        print(featureset_name, split_name, X_matrix.shape)



PitlerNenkova_Conn train (51161, 101)
Lin_etal train (51161, 32107)
Li_etal16 train (51161, 32050)


PitlerNenkova_Conn dev (2218, 101)
Lin_etal dev (2218, 32107)
Li_etal16 dev (2218, 32050)


PitlerNenkova_Conn test (2988, 101)
Lin_etal test (2988, 32107)
Li_etal16 test (2988, 32050)


### 1. Define the experiment featuresets and the experimental setup

In [5]:
# Each experiment number maps to the list of featuresets that are stacked side by side
# to form that experiment's feature matrix.
if 'Run2' in RUN:
    # PitlerNenkova_Conn to check performance change from PTB to UD. Li_etal16 is sota
    experiments = {
        1: ['PitlerNenkova_Conn'],
        2: ['Li_etal16'],
    }
else:
    experiments = {
        1: ['PitlerNenkova_Conn'],
        2: ['PitlerNenkova_Syn'],
        3: ['Li_etal16'],
        4: ['PitlerNenkova_Conn', 'PitlerNenkova_Syn'],
        5: ['PitlerNenkova_Conn', 'PitlerNenkova_Syn', 'PitlerNenkova_ConnSyn'],
        6: ['PitlerNenkova_Conn', 'PitlerNenkova_Syn', 'PitlerNenkova_ConnSyn',
            'PitlerNenkova_SynSyn'],
        7: ['PitlerNenkova_Conn', 'PitlerNenkova_Syn', 'PitlerNenkova_ConnSyn',
            'PitlerNenkova_SynSyn', 'Lin_etal'],
    }

#### Retrieve the X, y data for the train, dev and test sets

In [6]:
def retrieve_Xytraindev(experiment_spec):
    """
    Assemble the train, dev and test data for one experiment.

    Input | experiment_spec: iterable of featureset folder names (keys of var_dict[<dataset>]).
    Output| X_train, y_train, X_dev, y_dev, X_test, y_test. Each X is the horizontal stack
            (sparse.hstack) of the listed featuresets' matrices; each y is the label array
            of the first listed featureset.
    """
    def _assemble(split):
        # var_dict[split][name] is [name of the X global, name of the y global]
        registry = var_dict[split]
        X_blocks = [globals()[registry[name][0]] for name in experiment_spec]
        X_stacked = sparse.hstack(X_blocks)
        y_arrays = [globals()[registry[name][1]] for name in experiment_spec]
        return X_stacked, y_arrays[0]

    X_train, y_train = _assemble('train')
    X_dev, y_dev = _assemble('dev')
    X_test, y_test = _assemble('test')

    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [7]:
# Peek at the training labels loaded for the PitlerNenkova_Conn featureset
y_featureset_PNconn_train

array([1., 1., 1., ..., 0., 0., 0.])

In [8]:
# Path template for the dill files holding the connective examples; _get_connstring fills
# the first slot with "pos"/"neg" and the second with the dataset name.
CONNSTRING_FP = '../../03_data/en/explicit_connectives/ExpConn_{}exp_{}.dill'

def _get_connstring(connstring_fp, traintestset):
    '''
    Helper function to load the connective strings. Being used for per-connective error
    analysis. 
    '''
    # open and load the positive and negative examples
    with open(connstring_fp.format("pos", traintestset), 'rb') as f:
        pos = dill.load(f)
    # they have been saved as Class objects, so we extract the RawText 
    # of each of these and lowercase them
    pos = [i.Connective['RawText'].lower() for i in pos]
    
    with open(connstring_fp.format('neg', traintestset), 'rb') as f:
        neg = dill.load(f)
    neg = [i.Connective['RawText'].lower() for i in neg]
    
    # put them all together. Order is important: positive examples come first 
    connstrings = []
    connstrings.extend(pos)
    connstrings.extend(neg)
    
    return connstrings 

#### Define some helper functions

In [9]:
def _get_evalmetrics(actual, pred):
    """
    Helper function to produce the binary classification metrics we require 
    
    """
    acc = accuracy_score(actual, pred)
    f1macro = f1_score(actual, pred, average='macro') 
    f1micro = f1_score(actual, pred, average='micro')
    f1weight = f1_score(actual, pred, average='weighted')
    
    return acc, f1macro, f1micro, f1weight 

def _init_dimreducer(X_train, classifier, n_components=None, reduce_dim=None, random_state=42):
    """
    Helper function to instantiate the dimensionality reduction method to be used. 
    Input | 
    Output| 
    """
    if reduce_dim == 'SparsePCA':
        print('Starting dimension reduction with: {}'.format(reduce_dim))
        dimreducer = MiniBatchSparsePCA(n_components=n_components,
                                        alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, 
                                        batch_size=3, verbose=False, shuffle=True, 
                                        n_jobs=-1, method='lars', random_state=random_state, 
                                        normalize_components=False)

    if reduce_dim == 'TruncatedSVD':
        print('Starting dimension reduction with: {}'.format(reduce_dim))
        dimreducer = TruncatedSVD(n_components=n_components, random_state=random_state)

    if reduce_dim == 'SRP' and X_train.shape[1]>10000:
        print('Starting dimension reduction with: {}'.format(reduce_dim))
        dimreducer = SparseRandomProjection(eps=0.1)

    if reduce_dim == 'RFECV' and X_train.shape[1]>10000:
        print('Starting dimension reduction with: {}'.format(reduce_dim))
        dimreducer = RFECV(classifier, step = 0.2, min_features_to_select=n_components,
                            cv = 3, n_jobs=-1, scoring='f1') 
                            # using f1 score instead of accuracy score 
        classifier = dimreducer
        
    return dimreducer

def _init_classifier(classifier_name, class_weight = None, random_state = None):
    """
    
    """
    if classifier_name == 'logreg':
        ##### LOGISTIC REGRESSION #####
        # setting lower Cs (stronger regularisation) since we have very sparse matrices.  
        # setting penalty to L1 as a form of feature selection, since we have so many features. 
        # l1 penalty usable only with liblinear or saga solver, but saga is faster 
        # fit_intercept = True since we have not normalised/standardised the featureset 
        classifier = LogisticRegressionCV(Cs = 10, fit_intercept=True, cv=10, dual=False, penalty='l2',
                scoring="accuracy", solver='saga', tol=0.0001, max_iter=100, class_weight=class_weight,
                n_jobs=-1, verbose=0, refit=True, intercept_scaling=1.0, multi_class='multinomial', random_state=random_state) 
    if classifier_name == 'passaggressive':
        ##### PASSIVE AGGRESSIVE CLASSIFIER #####
        # setting C = 1.0 - high regularisation since we have sparsity in the data
        # setting early_stopping=True and validation_fraction=0.2 to stop further training if not promising
        # unsure about the distibution of the data at this point, so not setting tol
        # C = 10, to match the max in logreg's Cs above. 
        classifier = PassiveAggressiveClassifier(C=10, fit_intercept=True, max_iter=None, tol=None,
                early_stopping=True, validation_fraction=0.2, n_iter_no_change=5, shuffle=True,
                verbose=0, loss='hinge', n_jobs=-1, random_state=random_state, warm_start=False,
                class_weight=class_weight, average=False, n_iter=None)
    
    if classifier_name == 'linearsvc':
        ##### LINEAR SVC ##### 
        # Using Linear SVC instead of SVM because it is faster 
        # and the task is a relatively 'simple' binary classification one
        classifier = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001,
        C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1,
        class_weight=class_weight, verbose=0, random_state=random_state, max_iter=1000,)

    return classifier

# other notes: 
# 1. not including naive bayes because we have interaction features in the dataset. - possible to remove these 
#    features from the experimental set-up in order to try nb


In [10]:
def _as_sparse_matrix(matrix):
    """Return matrix as a SciPy sparse matrix (dense input is wrapped) so save_npz accepts it."""
    return matrix if sparse.issparse(matrix) else sparse.csr_matrix(matrix)


def run_experiments(experiments, classifier, exp_id, shuffle_data = True, reduce_dim=None,
                    n_components=500, random_state=42, 
                    connstring_fp = '../../03_data/en/explicit_connectives/ExpConn_{}exp_{}.dill'):
    """
    Run the experiments defined, using an instance of a sklearn classifier. 

    For every experiment: build the train/dev/test data, optionally shuffle and
    dimension-reduce it, fit the classifier on train, predict on train/dev/test, save the
    train/test predictions, labels, matrices and connective strings under 'results/', log
    parameters and metrics to mlflow and print the metrics.

    Input | experiments: dict mapping an experiment number to a list of featureset names.
            classifier: estimator offering fit / predict / get_params; it is refitted
                        for every experiment.
            exp_id: name of the mlflow experiment to log to.
            shuffle_data: shuffle each dataset (with its connective strings) if truthy.
            reduce_dim: name of the dimension reduction method (see _init_dimreducer) or
                        None; only applied when there are more than n_components features.
            n_components: target dimensionality for the reducer.
            random_state: seed for shuffling and for the reducer.
            connstring_fp: path template of the connective string files.
    Output| None.
    """
    ### mlflow preliminaries 
    warnings.filterwarnings("ignore") #bypass model training alerts/messages, for mlflow.

    # set_experiment returns None on older mlflow releases and an Experiment object on
    # newer ones; start_run needs the id (or None for the active experiment), not the object.
    active_experiment = mlflow.set_experiment(exp_id)
    experiment_id = getattr(active_experiment, 'experiment_id', None)

    # all artefacts below are written to this folder
    os.makedirs('results', exist_ok=True)

    ### running experiments 
    for experiment_num in experiments:
        print('FEATURESET COMPONENTS: ', experiments[experiment_num])
        exp_components = ["".join(re.findall(r'[A-Z_0-9]+', i)) for i in experiments[experiment_num]]
        # the experiment name is the cap letters, numbers and _ in featureset name
        exp_name = "_".join(exp_components) # for run_name; in this experiment set-up, 
                                            # allows us to see the various feature components

        X_train, y_train, X_dev, y_dev, X_test, y_test = retrieve_Xytraindev(experiments[experiment_num])
        connstrings_train = _get_connstring(connstring_fp, 'train')
        connstrings_dev = _get_connstring(connstring_fp, 'dev')
        connstrings_test = _get_connstring(connstring_fp, 'test')

        # shuffle the data; X, y and the connective strings are shuffled together so that
        # they stay aligned row by row
        if shuffle_data:
            X_train, y_train, connstrings_train = shuffle(X_train, y_train, connstrings_train,
                                       random_state=random_state)
            X_dev, y_dev, connstrings_dev = shuffle(X_dev, y_dev, connstrings_dev, 
                                   random_state=random_state)
            X_test, y_test, connstrings_test = shuffle(X_test, y_test, connstrings_test, 
                                     random_state=random_state)
            print('data shuffled', X_train.shape, X_dev.shape)

        # dimension reduction; reset per experiment so that a reducer from an earlier
        # experiment is never logged for this one
        dimreducer = None
        _reduce_dim = None
        if X_train.shape[1] > n_components and reduce_dim is not None:
            dimreducer = _init_dimreducer(X_train, classifier, n_components=n_components, 
                             reduce_dim=reduce_dim, random_state=random_state)
            if dimreducer is None:
                # no reducer applies to this data: continue with the original features
                pass
            elif reduce_dim == "RFECV":
                # NOTE(review): the RFECV selector is created but never fitted or used here,
                # so the data is left unchanged -- confirm whether it was meant to replace
                # the classifier.
                _reduce_dim = reduce_dim
            elif reduce_dim == 'SparsePCA':
                _reduce_dim = reduce_dim
                X_train = dimreducer.fit_transform(X_train.toarray())
                X_dev = dimreducer.transform(X_dev.toarray())
                X_test = dimreducer.transform(X_test.toarray())
            else: 
                _reduce_dim = reduce_dim
                X_train = dimreducer.fit_transform(X_train)
                X_dev = dimreducer.transform(X_dev)
                # the test set must live in the same reduced space for predict() to work
                X_test = dimreducer.transform(X_test)

        print('Dimension reduction with {}, new data shape'.format(_reduce_dim), 
              X_train.shape, X_dev.shape)

        with mlflow.start_run(experiment_id=experiment_id, run_name=exp_name, nested=True):
            print('starting experiment')
            start_time = time.time()

            # fit the classifier 
            classifier.fit(X_train, y_train)
            print('classifier fitted')

            # predict on train, dev and test 
            y_train_pred = classifier.predict(X_train)
            y_dev_pred = classifier.predict(X_dev)
            y_test_pred = classifier.predict(X_test)

            # save the predictions, as well as the X, y and conn_strings (since these
            # have been shuffled here)
            result_prefix = 'results/' + RUN + '_' + exp_name
            np.save(result_prefix + '_y_train_pred', y_train_pred)
            np.save(result_prefix + '_y_test_pred', y_test_pred)
            np.save(result_prefix + '_y_train', y_train)
            np.save(result_prefix + '_y_test', y_test)
            # reduced matrices can be dense, which save_npz does not accept as they are
            sparse.save_npz(result_prefix + '_X_train', _as_sparse_matrix(X_train))
            sparse.save_npz(result_prefix + '_X_test', _as_sparse_matrix(X_test))
            with open(result_prefix + '_cstring_train.txt', 'w+') as f:
                f.writelines(connstring + '\n' for connstring in connstrings_train)
            with open(result_prefix + '_cstring_test.txt', 'w+') as f:
                f.writelines(connstring + '\n' for connstring in connstrings_test)

            acc_train, f1macro_train, f1micro_train, f1weight_train = _get_evalmetrics(y_train, y_train_pred)
            acc_dev, f1macro_dev, f1micro_dev, f1weight_dev = _get_evalmetrics(y_dev, y_dev_pred)
            acc_test, f1macro_test, f1micro_test, f1weight_test = _get_evalmetrics(y_test, y_test_pred)

            ### Log parameter, metrics, and classifier to MLflow
            # params: use sklearn's .get_params() to log all the parameters for the classifier 
            classifier_params = classifier.get_params()
            print([('classifier_'+param, value) for param, value in classifier_params.items()])
            for param, value in classifier_params.items():
                mlflow.log_param('classifier_'+param, value)
            mlflow.log_param('Classifier', classifier.__class__.__name__)
            mlflow.log_param('Dimreducer', _reduce_dim)
            # Names may only contain alphanumerics, underscores (_), dashes (-), periods (.), spaces ( ), and slashes (/).
            if dimreducer is not None:
                # best effort: a reducer parameter that cannot be logged must not abort the run
                try:
                    for param, value in dimreducer.get_params().items():
                        mlflow.log_param('dimreducer_'+param, value)
                except Exception as err:
                    print('could not log dimreducer params:', err)

            # metrics
            mlflow.log_metric("acc_train", acc_train)
            mlflow.log_metric("f1macro_train", f1macro_train)
            mlflow.log_metric("f1micro_train", f1micro_train)
            mlflow.log_metric("f1weight_train", f1weight_train)

            mlflow.log_metric("acc_dev", acc_dev)
            mlflow.log_metric("f1macro_dev", f1macro_dev)
            mlflow.log_metric("f1micro_dev", f1micro_dev)
            mlflow.log_metric("f1weight_dev", f1weight_dev)

            # test metrics get their own keys; they used to be logged under the *_dev
            # names and thereby overwrote the dev metrics
            mlflow.log_metric("acc_test", acc_test)
            mlflow.log_metric("f1macro_test", f1macro_test)
            mlflow.log_metric("f1micro_test", f1micro_test)
            mlflow.log_metric("f1weight_test", f1weight_test)

            # classifier 
            # mlflow.sklearn.log_model(classifier, "model")
            print('model logged to mlflow')

            # evaluate
            print('Performance metrics...\n')
            print('train accuracy', acc_train)
            print('dev accuracy', acc_dev)
            print('test accuracy', acc_test,'\n')

            print('train f1macro', f1macro_train)
            print('dev f1macro', f1macro_dev)
            print('test f1macro', f1macro_test,'\n')

            print('train f1micro', f1micro_train)
            print('dev f1micro', f1micro_dev)
            print('test f1micro', f1micro_test,'\n')

            print('train f1weighted', f1weight_train)
            print('dev f1weighted', f1weight_dev)
            print('test f1weighted', f1weight_test,'\n')

            # elapsed wall-clock time in seconds, as the message states
            print("The time taken was: {}s".format(time.time()-start_time))
            print('__________\n')


### 2. Running the experiments

#### UD featureset 

Class weights specified

In [11]:
##### Environment variables 
# class weights passed to the classifier (class 0 is weighted more heavily than class 1)
CLASS_WEIGHT={0: 0.7021400100615302, 1: 0.29785998993846985}
# one seed for the classifier as well as for the shuffling / dimension reduction
RANDOM_STATE = 42

# note: setting high Cs for L1 regularisation is akin to dimensionality reduction, 
# so we need to be mindful of settings for reduce_dim

for classifier_name in ['logreg']:
    # instantiate the classifer 
    classifier = _init_classifier(classifier_name, class_weight = CLASS_WEIGHT, 
                                  random_state = RANDOM_STATE)

    # run the experimental pipeline; RANDOM_STATE is forwarded explicitly so that the
    # pipeline does not silently fall back to run_experiments' own default seed
    run_experiments(experiments, classifier, exp_id = RUN+'_'+LANG, 
                    shuffle_data = True, reduce_dim=None,
                    n_components=5000, random_state=RANDOM_STATE,
                    connstring_fp = CONNSTRING_FP) 


FEATURESET COMPONENTS:  ['PitlerNenkova_Conn']
data shuffled (51161, 101) (2218, 101)
Dimension reduction with None, new data shape (51161, 101) (2218, 101)
starting experiment
classifier fitted
[('classifier_Cs', 10), ('classifier_class_weight', {0: 0.7021400100615302, 1: 0.29785998993846985}), ('classifier_cv', 10), ('classifier_dual', False), ('classifier_fit_intercept', True), ('classifier_intercept_scaling', 1.0), ('classifier_l1_ratios', None), ('classifier_max_iter', 100), ('classifier_multi_class', 'multinomial'), ('classifier_n_jobs', -1), ('classifier_penalty', 'l2'), ('classifier_random_state', 42), ('classifier_refit', True), ('classifier_scoring', 'accuracy'), ('classifier_solver', 'saga'), ('classifier_tol', 0.0001), ('classifier_verbose', 0)]
model logged to mlflow
Performance metrics...

train accuracy 0.845741873692852
dev accuracy 0.827321911632101
test accuracy 0.8417001338688086 

train f1macro 0.7868321936567195
dev f1macro 0.766063328852843
test f1macro 0.78902762

#### Saving the model for production use.

In [None]:
# Persist the feature names together with the classifier left over from the last
# experiment run above. The file name is derived from RUN so that it always matches the
# run the model was trained on.
# NOTE(review): featnames_Li16 assumes the last experiment used the Li_etal16 featureset
# -- confirm when the experiment list changes.
os.makedirs('models', exist_ok=True)
with open('models/model_{}.model'.format(RUN), 'wb') as f:
    pickle.dump([featnames_Li16, classifier], f) 