### Baseline Models

We use bag-of-words models as baselines for comparison with the RNTN model. The two baselines considered here are *Naive Bayes* and *Support Vector Machine* classifiers.

We evaluate the models on both root-level accuracy and full-tree (all-node) accuracy.

#### Extracting Phrases from the Treebank

The Sentiment Treebank dataset is in the form of parsed trees. Here we generate all sub-phrases and their associated sentiment labels so that full-tree accuracy can be evaluated.

In [None]:
# Imports
import os
import sys
import numpy as np

In [None]:
# Set path to model code
# os.pardir is the parent-directory entry (".."), so the lookup below is
# relative to the directory the notebook is executed from.
# NOTE(review): presumably the `src` package lives one level above the
# notebook directory -- confirm if the notebook is run from elsewhere.
PROJ_ROOT = os.pardir
# The path must be extended before the `src.*` imports that follow.
sys.path.append(PROJ_ROOT)
from src.features.tree import Tree
from src.models.data_manager import DataManager

In [None]:
# Function to get sub-phrases for a single tree
def get_phrases(node):
    """Collect every phrase in the subtree rooted at *node* with its label.

    The subtree is walked in pre-order (node, then left subtree, then right
    subtree).  A leaf contributes ``node.word``; an internal node contributes
    ``node.text()``.  Every node contributes ``node.label``.

    The walk is iterative and accumulates into Python lists, converting to
    arrays once at the end.  Building the result with ``np.concatenate`` at
    each level of a recursion would re-copy all descendant data at every
    ancestor.

    Args:
        node: Tree node exposing ``isLeaf``, ``label``, ``word`` (leaves) and
            ``left``, ``right``, ``text()`` (internal nodes).

    Returns:
        A tuple ``(phrases, labels)`` of two 1-D NumPy arrays of equal length,
        aligned element by element.
    """
    phrases = []
    labels = []
    pending = [node]
    while pending:
        current = pending.pop()
        labels.append(current.label)
        if current.isLeaf:
            phrases.append(current.word)
        else:
            phrases.append(current.text())
            # Push right first so the left subtree is popped (visited) first.
            pending.append(current.right)
            pending.append(current.left)
    return (np.asarray(phrases), np.asarray(labels))

In [None]:
# Get parsed trees
# Build a single DataManager and read all three splits from it instead of
# constructing a new one (and repeating its setup work) for every split.
trees_path = '../src/data/interim/trainDevTestTrees_PTB/trees/'
data_manager = DataManager(trees_path)
x_train = data_manager.x_train
x_dev = data_manager.x_dev
x_test = data_manager.x_test

In [None]:
# Get sub-phrases for every tree
# Gather the per-tree arrays first and concatenate once at the end.
# Growing X and y with np.concatenate inside the loop re-copies everything
# collected so far on every iteration, and seeding with an empty list (which
# NumPy treats as an empty float64 array) promotes integer labels to floats.
train_pairs = [get_phrases(tree.root) for tree in x_train]
if train_pairs:
    X = np.concatenate([phrases for phrases, _ in train_pairs])
    y = np.concatenate([labels for _, labels in train_pairs])
else:
    # No trees: leave X and y as empty lists, as the original loop did.
    X = []
    y = []

In [None]:
# Get sub-phrases for every cross validation set tree
# Gather the per-tree arrays first and concatenate once at the end.
# Concatenating inside the loop re-copies the accumulated data on every
# iteration, and an empty-list seed (an empty float64 array to NumPy)
# promotes integer labels to floats.
dev_pairs = [get_phrases(tree.root) for tree in x_dev]
if dev_pairs:
    X_data_dev = np.concatenate([phrases for phrases, _ in dev_pairs])
    y_data_dev = np.concatenate([labels for _, labels in dev_pairs])
else:
    # No trees: leave both as empty lists, as the original loop did.
    X_data_dev = []
    y_data_dev = []

In [None]:
# Build vocabulary using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Labels are stacked train-first, dev-second -- the same order used for the
# phrases passed to the vectorizer below.
y_data = np.concatenate([y, y_data_dev])

# Learn the vocabulary from the train + dev phrases and build the
# document-term matrix in one pass.
vectorizer = CountVectorizer()
# some versions of sklearn return COO format, hence the conversion to CSC
X_data = vectorizer.fit_transform(np.concatenate([X, X_data_dev])).tocsc()

In [None]:
# Use Predefined split as train, dev data is already separate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import PredefinedSplit, GridSearchCV

# Prepare data for training
# test_fold convention: -1 keeps a sample out of every validation fold
# (always training data); 0 assigns it to validation fold 0.
# Train phrases come first, followed by the dev phrases.
validation_set_indexes = [-1] * len(X)
validation_set_indexes.extend([0] * len(X_data_dev))
cv = PredefinedSplit(test_fold=validation_set_indexes)

#### Naive Bayes Model

In [None]:
# Simple naive bayes classifier
from sklearn.metrics import make_scorer, log_loss, accuracy_score

# Use MultinomialNB classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

# Candidate values of the additive smoothing parameter to search over
params = {
    'alpha': [.1, 1, 5, 10, 50],
}

# Grid search scored by accuracy on the predefined train/dev split
model = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=make_scorer(accuracy_score),
    cv=cv,
)

In [None]:
# Train model
# Pass the sparse document-term matrix directly: MultinomialNB and
# GridSearchCV both accept scipy sparse input, whereas .toarray() would
# allocate a dense (n_phrases x vocabulary_size) array.
model.fit(X_data, y_data)

In [None]:
# Get sub-phrases for every test set tree
# Gather the per-tree arrays first and concatenate once at the end.
# Concatenating inside the loop re-copies the accumulated data on every
# iteration, and an empty-list seed (an empty float64 array to NumPy)
# promotes integer labels to floats.
test_pairs = [get_phrases(tree.root) for tree in x_test]
if test_pairs:
    X_data_test = np.concatenate([phrases for phrases, _ in test_pairs])
    y_data_test = np.concatenate([labels for _, labels in test_pairs])
else:
    # No trees: leave both as empty lists, as the original loop did.
    X_data_test = []
    y_data_test = []

In [None]:
# Vectorize
# Use transform(), not fit_transform(): the test phrases must be mapped onto
# the vocabulary already learned by `vectorizer`.  Re-fitting here would
# learn a new vocabulary from the test data, so the columns would no longer
# correspond to the features the model was trained on.
X_data_test_vec = vectorizer.transform(X_data_test)

In [None]:
# Score model
# Print the accuracy on the test dataset
# The sparse test matrix is scored as-is; the classifier accepts sparse
# input, so there is no need to densify it with .toarray().
#training_accuracy = model.score(X_data.reshape(-1,1), y_data)
test_accuracy = model.score(X_data_test_vec, y_data_test.astype(int))

# NOTE(review): "{:2f}" sets a minimum field width of 2, not a precision;
# use "{:.2f}" if two decimal places were intended.
#print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))