# Imports

In [1]:
import numpy as np
import pandas as pd
import sys
# (display name, module) pairs whose versions are printed below.
libraries = (('Numpy', np), ('Pandas', pd))

# Report the interpreter version first, followed by a blank line.
print("Python Version:", sys.version, '\n')
for label, module in libraries:
    print('{0} Version: {1}'.format(label, module.__version__))

import VWCommands as vwc

Python Version: 3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56) 
[GCC 7.2.0] 

Numpy Version: 1.16.4
Pandas Version: 0.23.0


# Function

In [2]:
def train_test_vw(train_address, num_labels,
                  rollin, rollout, epochs, lr,
                  affix, history, neighbor):
    '''
    Runs a single Vowpal Wabbit experiment and reports its mean scores.

    Every argument is forwarded unchanged, in the same order, to
    vwc.single_experiment (defined in the VWCommands module, not shown
    here). Per the original notes, that call trains a model, tests it,
    generates predictions and raw scores, and evaluates the predictions
    using SkLearn -- confirm against VWCommands.

    train_address - where to import the training data from.
    num_labels - number of classes in the dataset.
    rollin - Rollin Policy, 'learn' is recommended.
    rollout - Rollout Policy, 'mix' or maybe 'ref' are recommended.
    epochs - number of passes over the training data.
    lr - step size to convergence in stochastic gradient descent.
    affix - trains on the prefixes/suffixes of features.
    history - trains on previous features.
    neighbor - trains on neighboring predictions.

    OUT: prints the mean fscore, precision and recall, then returns them
    as a (fscore, precision, recall) tuple.
    '''
    fscore, precision, recall = vwc.single_experiment(
        train_address, num_labels, rollin, rollout,
        epochs, lr, affix, history, neighbor)

    # Average each metric exactly once so the printed value and the
    # returned value are guaranteed to be the same number.
    mean_fscore = fscore.mean()
    mean_precision = precision.mean()
    mean_recall = recall.mean()

    print('fscore: {}'.format(mean_fscore))
    print('precision: {}'.format(mean_precision))
    print('recall: {}'.format(mean_recall))

    return mean_fscore, mean_precision, mean_recall

# Approach 1: Train Test 5 Entities
A model trained on only 2,000 abstracts was insufficient to predict this many classes at once. Either the text patterns surrounding the entities would need to be more formulaic, or more data would be needed to learn the relationships among the classes.

In [3]:
# The returned means are discarded; only the scores printed by
# train_test_vw are of interest for this run.
_, _, _ = train_test_vw(
    train_address='data/vw_train.txt', num_labels=11,
    rollin='learn', rollout='mix', epochs=1, lr=0.5,
    affix=1, history=2, neighbor=1,
)

fscore: 0.5369741640914729
precision: 0.7125558633703833
recall: 0.46822367466210574


# Approach 2: Train Test 1 Entity
This method proved more successful. A separate model was custom-fit to each entity using the ExperimentSweep module.

In [4]:
# Accumulators for the per-entity mean scores; each starts as its own
# independent empty list.
all_fscore, all_precision, all_recall = [], [], []

## 2.1 Protein

In [5]:
# Run the protein experiment and record its mean scores.
fscore, precision, recall = train_test_vw(
    train_address='data/protein_train.txt', num_labels=3,
    rollin='learn', rollout='ref', epochs=6, lr=0.1,
    affix=6, history=4, neighbor=1,
)
all_fscore.append(fscore)
all_precision.append(precision)
all_recall.append(recall)

fscore: 0.7567613845288713
precision: 0.8063454822590375
recall: 0.7265000650321212


## 2.2 Cell line

In [6]:
# Run the cell line experiment and record its mean scores.
# NOTE(review): a *_test.txt file is passed as train_address here, while
# the protein cell passes protein_train.txt -- confirm the intended file.
fscore, precision, recall = train_test_vw(
    train_address='data/cellline_test.txt', num_labels=3,
    rollin='learn', rollout='mix', epochs=3, lr=0.5,
    affix=6, history=4, neighbor=1,
)
all_fscore.append(fscore)
all_precision.append(precision)
all_recall.append(recall)

fscore: 0.7725322617119947
precision: 0.7941770366697729
recall: 0.7646742265907694


## 2.3 Cell type

In [7]:
# Run the cell type experiment and record its mean scores.
# NOTE(review): a *_test.txt file is passed as train_address here, while
# the protein cell passes protein_train.txt -- confirm the intended file.
fscore, precision, recall = train_test_vw(
    train_address='data/celltype_test.txt', num_labels=3,
    rollin='learn', rollout='mix', epochs=4, lr=0.25,
    affix=6, history=4, neighbor=1,
)
all_fscore.append(fscore)
all_precision.append(precision)
all_recall.append(recall)

fscore: 0.8277644303170709
precision: 0.9258764958577999
recall: 0.7656997730309865


## 2.4 RNA

In [8]:
# Run the RNA experiment and record its mean scores.
# NOTE(review): a *_test.txt file is passed as train_address here, while
# the protein cell passes protein_train.txt -- confirm the intended file.
fscore, precision, recall = train_test_vw(
    train_address='data/RNA_test.txt', num_labels=3,
    rollin='learn', rollout='mix', epochs=6, lr=0.5,
    affix=6, history=4, neighbor=1,
)
all_fscore.append(fscore)
all_precision.append(precision)
all_recall.append(recall)

fscore: 0.7823698643343039
precision: 0.9510122422731455
recall: 0.6927084789617094


## 2.5 DNA

In [9]:
# Run the DNA experiment and record its mean scores.
# NOTE(review): a *_test.txt file is passed as train_address here, while
# the protein cell passes protein_train.txt -- confirm the intended file.
fscore, precision, recall = train_test_vw(
    train_address='data/DNA_test.txt', num_labels=3,
    rollin='learn', rollout='mix', epochs=4, lr=0.5,
    affix=6, history=2, neighbor=1,
)
all_fscore.append(fscore)
all_precision.append(precision)
all_recall.append(recall)

fscore: 0.8086021654735162
precision: 0.9182737678205113
recall: 0.7387073097293116


# Collective performance of Approach 2

In [10]:
# Average the per-entity means collected above and print one line per
# metric, in the order fscore, precision, recall.
summary = (
    ('fscore: {}', all_fscore),
    ('precision: {}', all_precision),
    ('recall: {}', all_recall),
)
for template, scores in summary:
    print(template.format(np.mean(scores)))

fscore: 0.7896060212731515
precision: 0.8791370049760534
recall: 0.7376579706689796
