In [1]:
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler

import cvxpy

## Output in Weka format

In [2]:
import os
def predict_for_test(test, predict, probability, path):
    """Write predictions to ``path`` in a Weka-style text report.

    Args:
        test: sequence of true labels; the value 1 is reported as
            "1:positive", anything else as "2:negative".
        predict: sequence of predicted labels, encoded the same way.
        probability: per-example pair indexed as ``probability[i][0]`` /
            ``probability[i][1]``; the entry matching the predicted class
            is written as the prediction confidence.
        path: destination file, overwritten if it already exists.
    """
    def _weka_label(value):
        # Weka numbers the positive class 1 and the negative class 2.
        return "1:positive" if value == 1 else "2:negative"

    with open(path, 'w') as f:
        f.write("=== Predictions on test data ===\n")
        f.write(" inst#     actual  predicted error prediction\n")
        for idx in range(len(test)):
            actual = test[idx]
            guessed = predict[idx]
            # The error column is blank on a hit and carries a "+" on a miss.
            if actual == guessed:
                error_flag = " " * 5
            else:
                error_flag = " " * 2 + "+" + " " * 2
            confidence = probability[idx][1] if guessed == 1 else probability[idx][0]
            fields = [str(idx + 1), _weka_label(actual), _weka_label(guessed),
                      error_flag, str(confidence)]
            f.write(" ".join(fields) + "\n")
    

## Load data

In [3]:
import arff

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#import data
def importData(path):
    """Load an ARFF file and split it into a feature matrix and 0/1 labels.

    Every column except the last is parsed as a number (whole-valued
    numbers are stored as ``int``); the last column is the class, where
    the string "positive" maps to 1 and anything else to 0.

    Args:
        path: path of the ARFF file to read.

    Returns:
        ``(features, labels)`` as two numpy arrays with one entry per row.
    """
    # Use a context manager so the handle is closed once parsing is done;
    # the original passed a bare open() to arff.load and never closed it.
    with open(path, 'rb') as arff_file:
        dataset = arff.load(arff_file)
        data = np.array(dataset['data'])

    # Extract features and labels row by row.
    features = []
    labels = []
    for d in data:
        f = []
        for i in range(len(d) - 1):
            num = float(d[i])
            if int(num) == num:
                num = int(num)
            f.append(num)
        features.append(f)

        if d[-1] == "positive":
            labels.append(1)
        else:
            labels.append(0)
    return np.asarray(features), np.asarray(labels)

---
## Word embedding vectors

In [4]:
# Build the vocabulary: the first token of each GloVe line is the word, and
# the line's position in the file becomes that word's index.
ndim = 300
glove_path = "../data/glove_embeddings/glove.6B.{}d.txt".format(ndim)
with open(glove_path, "rb") as lines:
    w2idx = dict((line.split()[0].decode("utf-8"), i)
                 for i, line in enumerate(lines))

In [5]:
# Embedding matrix: row i holds the vector found on line i of the GloVe file.
# `float` replaces the `np.float` alias (deprecated in NumPy 1.20 and later
# removed), and the explicit list keeps the conversion working where `map`
# returns an iterator instead of a list.
vectors = np.empty((len(w2idx), ndim), dtype=float)
with open(glove_path, "rb") as lines:
    for i, line in enumerate(lines):
        vectors[i] = np.asarray([float(value) for value in line.split()[1:]])

In [6]:
import string
from nltk.corpus import stopwords

# Tokens dropped before averaging word vectors: single punctuation
# characters, the two ellipsis spellings, and NLTK's English stopwords.
words_to_exclude = frozenset(string.punctuation).union(
    ["..", "..."],
    stopwords.words("english"),
)

---

## Rank propagation

---
Get vector indicating which example belongs to which question group


In [7]:
def get_QA_group_count(infile):
    """Count the candidate answers in the next ``<QApairs>`` group.

    Consumes lines from ``infile`` up to and including the closing
    ``</QApairs`` line of one group.

    Args:
        infile: open text file positioned at the start of a group.

    Returns:
        The number of ``<positive``/``<negative`` tags in the group
        (matched case-insensitively), or ``None`` when the first line read
        is empty or whitespace-only, which the callers treat as end of file.

    Raises:
        Exception: if the first line does not start with ``<QApairs``, or
            if the file ends before the closing ``</QApairs`` tag.
    """
    line = infile.readline().strip()
    if line == "":
        return None

    if not line.startswith("<QApairs"):
        raise Exception("Invalid data format: {}<-----".format(line))

    sentence_count = 0
    while not line.strip().startswith("</QApairs"):
        line = infile.readline()
        # readline() returns "" only at end of file. Without this check a
        # truncated group would spin forever, because "" never matches the
        # closing tag.
        if line == "":
            raise Exception("Invalid data format: reached end of file before </QApairs>")
        tag = line.strip().lower()
        if tag.startswith("<positive") or tag.startswith("<negative"):
            sentence_count += 1

    return sentence_count

def get_QA_group_indicators(filepath):
    """Return, for every candidate answer in the file, its question number.

    Questions are numbered from 0 in file order; groups that contain no
    candidate answers are skipped and do not consume a number.

    Args:
        filepath: path of the QA-pairs file.

    Returns:
        1-D numpy array with one question number per candidate answer.
    """
    group_ids = []
    with open(filepath) as infile:
        next_id = 0
        # get_QA_group_count returns None at end of file, which ends the loop.
        for n_sentences in iter(lambda: get_QA_group_count(infile), None):
            if n_sentences > 0:
                group_ids.extend([next_id] * n_sentences)
                next_id += 1

    return np.asarray(group_ids)

In [8]:
# Get a question and its candidate answers
def get_QA_group(infile):
    """Read the next ``<QApairs>`` group: its question and candidate answers.

    The line that follows a ``<question``, ``<positive`` or ``<negative``
    tag (matched case-insensitively) is taken as the sentence; tabs are
    replaced by spaces and surrounding whitespace is stripped.

    Args:
        infile: open text file positioned at the start of a group.

    Returns:
        ``{"question": [str, ...], "answers": [(label, sentence), ...]}``
        with ``label`` being "positive" or "negative", or ``None`` when the
        first line read is empty or whitespace-only (treated as end of file).

    Raises:
        Exception: if the first line does not start with ``<QApairs``, or
            if the file ends before the closing ``</QApairs`` tag.
    """
    def _read_line():
        raw = infile.readline()
        # readline() returns "" only at end of file; the original kept
        # looping forever on a truncated group.
        if raw == "":
            raise Exception("Invalid data format: reached end of file before </QApairs>")
        return raw.replace('\t', ' ')

    question = []
    answers = []
    line = infile.readline().strip()
    if line == "":
        return None

    if not line.startswith("<QApairs"):
        raise Exception("Invalid data format: {}<-----".format(line))

    while not line.strip().startswith("</QApairs"):
        line = _read_line()
        tag = line.strip().lower()
        if tag.startswith("<question"):
            line = _read_line()
            question.append(line.strip())
        elif tag.startswith("<positive"):
            line = _read_line()
            answers.append(("positive", line.strip()))
        elif tag.startswith("<negative"):
            line = _read_line()
            answers.append(("negative", line.strip()))

    return {"question": question, "answers": answers}

In [9]:
from nltk.tokenize import WhitespaceTokenizer

def extract_vector(sentence, exclude, w2idx, wordvectors):
    """Average the embeddings of the words in ``sentence``.

    The sentence is split on whitespace and lower-cased; tokens found in
    ``exclude`` are dropped, and the remaining tokens that have an entry in
    ``w2idx`` are averaged.

    Args:
        sentence: raw sentence text.
        exclude: collection of lower-cased tokens to ignore.
        w2idx: mapping from word to its row index in ``wordvectors``.
        wordvectors: 2-D array of word embeddings, one row per word.

    Returns:
        1-D array with ``wordvectors.shape[1]`` entries.

    Raises:
        AssertionError: if no token survives filtering, or if none of the
            surviving tokens has an embedding (the average would be all
            zeros and a later cosine similarity would divide by zero).
    """
    # Tokenize, lower-case, and drop stopwords / punctuation.
    tokens = WhitespaceTokenizer().tokenize(sentence)
    lowered = [t.lower() for t in tokens]
    words = [w for w in lowered if w not in exclude]

    # If no word survives we could return a zero vector and define the
    # cosine similarity as 0; for now this is treated as an error.
    assert len(words) > 0, "No usable tokens in sentence: {}".format(sentence)

    try:
        known = [wordvectors[w2idx[w]] for w in words if w in w2idx]
        # Fall back to one zero vector so np.mean still has an input; the
        # checks below then report the missing tokens.
        avg_vec = np.mean(known or [np.zeros(wordvectors.shape[1])], axis=0)
        if not np.any(avg_vec):
            print("Tokens cannot be found: {}".format(words))
        assert np.any(avg_vec)
        return avg_vec
    except UnicodeDecodeError:
        # Report the sentence being processed. The original printed `line`,
        # which is not defined in this function and at best picked up an
        # unrelated notebook-level variable.
        print(sentence.strip())
        raise

In [10]:
def compute_pairwise_distance_matrix(X, k, p=2):
    """Compute distances between each point in X and its k nearest neighbors.

    Args:
        X: 2-D array with one point per row.
        k: number of neighbors per point, not counting the point itself.
            Values larger than ``len(X) - 1`` are clamped.
        p: Minkowski norm used for the neighbor search (2 = Euclidean,
            1 = Manhattan).

    Returns:
        ``(n, n)`` float array ``A`` where ``A[i, j]`` is the distance from
        point ``i`` to ``j`` when ``j`` is among the nearest neighbors of
        ``i`` and 0 otherwise. For ``p == 2`` the stored values are squared
        distances (for a Gaussian kernel); for other ``p`` they are the raw
        distances (``p == 1`` suits a Laplacian kernel). The matrix is not
        symmetrized.
    """
    from scipy.spatial import KDTree

    n_points = X.shape[0]
    A = np.zeros((n_points, n_points), dtype=float)
    if n_points == 0:
        return A

    # k + 1 because each point is returned as its own nearest neighbor.
    # Clamp so the tree is never asked for more neighbors than exist.
    n_query = min(k + 1, n_points)

    # Pass k and p by keyword: the third positional parameter of
    # KDTree.query is `eps` (approximation tolerance), not `p`. The original
    # positional call ran an approximate Euclidean search regardless of p.
    distances, idxs = KDTree(X).query(X, k=n_query, p=p)

    # With a single neighbor the results come back 1-D; normalise to 2-D.
    distances = np.asarray(distances).reshape(n_points, -1)
    idxs = np.asarray(idxs).reshape(n_points, -1)
    rows = np.arange(n_points)[:, np.newaxis]
    A[rows, idxs] = distances

    # Store squared Euclidean distance for L2; keep raw distance otherwise.
    if p == 2:
        A = A ** 2

    return A

Compute the normalized graph Laplacian $L = D^{-1/2}(D - W)D^{-1/2}$, built from the k-nearest-neighbour weight matrix $W$, for each question and its set of candidate answers

In [11]:
import scipy as sp

# Compute the scores/features for a dataset
def get_weight_matrix(input_file, n_neighbors=5, sigma=1.0, eps=0.0001, p=2):
    """Yield the normalized graph Laplacian of every question's answer set.

    For each ``<QApairs>`` group the candidate answers are embedded with
    ``extract_vector``, linked to their nearest neighbors, weighted with a
    kernel of bandwidth ``sigma`` and turned into
    ``L = D^{-1/2} (D - W) D^{-1/2}``.

    Args:
        input_file: path of the QA-pairs file.
        n_neighbors: maximum number of neighbors per answer.
        sigma: kernel bandwidth.
        eps: currently unused; kept so existing callers keep working (the
            regularisation step that used it is commented out below).
        p: Minkowski norm forwarded to ``compute_pairwise_distance_matrix``.

    Yields:
        ``(None, None)`` for a group without answers, ``(1, None)`` for a
        group with a single answer, otherwise ``(n, L)`` where ``L`` is the
        ``(n, n)`` normalized Laplacian of the group's ``n`` answers.
    """
    with open(input_file) as infile:
        while True:
            group = get_QA_group(infile)

            # Check for EOF
            if group is None:
                break

            # The question vector itself is not needed for the Laplacian
            # (the old cosine scores were computed and then discarded), but
            # the call is kept so that a question without any embeddable
            # word still fails here, as it did before.
            extract_vector(group["question"][0], words_to_exclude, w2idx, vectors)

            # TODO: Pass words_to_exclude / w2idx / vectors in as args
            answer_vectors = [extract_vector(sentence, words_to_exclude, w2idx, vectors)
                              for (_, sentence) in group["answers"]]

            # Minus 1 because an answer is not its own neighbor.
            k = min(n_neighbors, len(answer_vectors) - 1)
            # Not enough answers to do rank propagation. Callers keep the
            # original scores for these groups.
            if k < 0:
                yield None, None
            elif k == 0:
                yield 1, None
            else:
                answer_vectors = np.vstack(answer_vectors)
                W = compute_pairwise_distance_matrix(answer_vectors, k, p)
                W = np.maximum(W, W.T)  # Ensure W symmetric.
                W[W > 0] = np.exp(- W[W > 0] / (2 * sigma**2))  # Apply gaussian kernel
                D = np.diag(np.sum(W, axis=1))  # Row sum of W
                L = D - W
#                 L = L + eps * np.eye(len(answer_vectors))  # Improve the condition of the graph laplacian
                Dinvsqrt = np.sqrt(np.linalg.pinv(D))
                # Need to ensure that Dinvsqrt does not have NAN due to division by 0
                assert(not np.any(np.isnan(Dinvsqrt)))
                L = Dinvsqrt.dot(L).dot(Dinvsqrt)  # Normalized graph laplacian

#                 assert(is_pos_def(Dinvsqrt))
#                 assert(is_pos_def(L))

                yield L.shape[0], L

---
Load the question–answer similarity values from file (these should probably be computed here instead).

**NOTE**: "Similarity" is the wrong term to use here. The values in the file are actually cosine **distances**.
To convert them to similarities, subtract each value from 1.


In [12]:
def load_similarity_features(filepath):
    """Read ``label,score`` lines into a feature column and a label vector.

    Args:
        filepath: text file with one ``<label>,<score>`` pair per line,
            where ``<label>`` is "positive" or "negative".

    Returns:
        ``(features, labels)``: ``features`` is an ``(n, 1)`` float array of
        the scores and ``labels`` an array with 1 for "positive" and 0 for
        "negative".
    """
    label_codes = {"positive": 1, "negative": 0}
    values = []
    targets = []
    with open(filepath) as infile:
        for record in infile:
            tag, raw_score = record.strip().split(',')
            value = float(raw_score)
            code = label_codes[tag]
            values.append(value)
            targets.append(code)

    feature_column = np.asarray(values).reshape(-1, 1)
    return feature_column, np.asarray(targets)

In [13]:
#import training data and test data
# ARFF feature files for the three splits.
train_datapath = "../myclassify/qa.train.arff"
dev_datapath = "../myclassify/qa.dev.arff"
test_datapath = "../myclassify/qa.test.arff"

# Load the splits in train, dev, test order.
(X_train, y_train), (X_dev, y_dev), (X_test, y_test) = (
    importData(arff_path)
    for arff_path in (train_datapath, dev_datapath, test_datapath)
)

In [14]:
# XML files holding the question / candidate-answer groups of each split.
train_file = "../data/answerSelectionExperiments/data/train-less-than-40.xml"
dev_file = "../data/answerSelectionExperiments/data/dev-less-than-40.xml"
test_file = "../data/answerSelectionExperiments/data/test-less-than-40.xml"

# Question number of every candidate answer; `qn_group_indicators` is the
# one for the test split.
dev_qn_group_indicators, qn_group_indicators = (
    get_QA_group_indicators(xml_path) for xml_path in (dev_file, test_file)
)

In [15]:
# Embedding-based question/answer scores, one column per split.
X_sim_train, y_sim_train = load_similarity_features(
    "../data/features/glove_embedding_sentence_similarities_train_300.txt")
X_sim_dev, y_sim_dev = load_similarity_features(
    "../data/features/glove_embedding_sentence_similarities_dev_300.txt")
X_sim_test, y_sim_test = load_similarity_features(
    "../data/features/glove_embedding_sentence_similarities_test_300.txt")

# Append the score as an extra feature column.
X_combined_train = np.hstack((X_train, X_sim_train))
X_combined_dev = np.hstack((X_dev, X_sim_dev))
X_combined_test = np.hstack((X_test, X_sim_test))

# Variant 1: scale every column, with statistics taken from the training split.
scaler = RobustScaler().fit(X_combined_train)
X_comb_scaled_train = scaler.transform(X_combined_train)
X_comb_scaled_dev = scaler.transform(X_combined_dev)
X_comb_scaled_test = scaler.transform(X_combined_test)

# Variant 2: scale only the score column and leave the ARFF features as-is.
# Note that X_sim_* are overwritten with their scaled versions here.
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler().fit(X_sim_train)
X_sim_train = scaler.transform(X_sim_train)
X_sim_dev = scaler.transform(X_sim_dev)
X_sim_test = scaler.transform(X_sim_test)
X_comb_scaledsim_train = np.hstack((X_train, X_sim_train))
X_comb_scaledsim_dev = np.hstack((X_dev, X_sim_dev))
X_comb_scaledsim_test = np.hstack((X_test, X_sim_test))

---
## Propagate rank score


In [16]:
from cvxpy import Variable, Minimize, norm, quad_form, Problem

In [17]:
def is_pos_def(x):
    """Debug helper: report whether every eigenvalue of ``x`` is positive."""
    eigenvalues = np.linalg.eigvals(x)
    return np.all(eigenvalues > 0)

In [18]:
# Version 2 that uses the ans_type_match and question answer similarity weights
def propagate_scores(r, L, ans_type_match, ans_sim_weights, alpha=1.0, gamma=1.0, loss_type=1):
    """Solve the convex rank-propagation problem and return the new scores.

    Minimises ``norm(r - y, loss_type) + alpha * quad_form(y, L)`` subject
    to ``0 <= y <= 1``. When at least one answer has a matching type, the
    term ``gamma * sum_i ans_sim_weights[i] * |1 - y[i]|`` over the matching
    answers is added.

    Args:
        r: 1-D numpy array of initial scores for one question's answers.
        L: graph Laplacian of those answers.
        ans_type_match: per-answer indicator; 1 marks a type match.
        ans_sim_weights: per-answer weight used in the type-match term.
        alpha: weight of the smoothness term.
        gamma: weight of the type-match term.
        loss_type: norm order used for the fit term.

    Returns:
        A flat list with one refined score per answer.

    Raises:
        AssertionError: if the input lengths disagree or the solver does
            not report status "optimal".
    """
    n = r.size
    y = Variable(n)
    assert(len(ans_type_match) == n and len(ans_sim_weights) == n)

    # If no type match we just ignore the type term
    if not np.any(ans_type_match):
        objective = Minimize( norm(r - y, loss_type) + alpha * quad_form(y, L) )
    else:
        type_term = sum( ans_sim_weights[i] * cvxpy.abs(1 - y[i])
                        for i, match in enumerate(ans_type_match) if match == 1 )
        objective = Minimize( norm(r - y, loss_type) + alpha * quad_form(y, L) + gamma * type_term)

    constraints = [0 <= y, y <= 1]
    prob = Problem(objective, constraints)

    # prob.solve() returns the optimal objective value, which is not needed.
    prob.solve(verbose=False)
    assert(prob.status == "optimal")

    # Flatten through a plain ndarray so the result is a list of scores
    # whether y.value is an (n, 1) matrix or a 1-D array. The original
    # `.flatten().tolist()[0]` returned a single float in the 1-D case.
    return np.asarray(y.value).ravel().tolist()

In [19]:
def get_qn_answer_match_indicators(filepath):
    """Read one integer per line from ``filepath`` into a numpy array."""
    flags = []
    with open(filepath) as infile:
        for row in infile:
            flags.append(int(row.strip()))
    return np.asarray(flags)
    
def get_qn_answer_sim_weights(filepath):
    """Read the second comma-separated field of every line as a float array."""
    weights = []
    with open(filepath) as infile:
        for row in infile:
            fields = row.split(',')
            weights.append(float(fields[1]))
    return np.asarray(weights)

In [20]:
def rank_propagation(data_filepath, qn_match_filepath, qn_simweights_filepath, r, 
                     alpha=1.0, sigma=1.0, n_neighbors=5, gamma=1.0, 
                     loss_type=1, 
                     pair_similarity_type=2):
    """Refine the scores ``r`` question by question with rank propagation.

    Args:
        data_filepath: QA-pairs file the scores belong to.
        qn_match_filepath: file with one answer-type-match indicator per answer.
        qn_simweights_filepath: file with one question/answer distance per
            answer; each is converted to a similarity as ``1 - distance``.
        r: numpy array of initial scores, one per candidate answer, in
            file order.
        alpha, gamma, loss_type: forwarded to ``propagate_scores``.
        sigma, n_neighbors: forwarded to ``get_weight_matrix``.
        pair_similarity_type: forwarded to ``get_weight_matrix`` as ``p``.

    Returns:
        numpy array of refined scores in the same order as ``r``.
    """
    # Groups with at most this many candidates keep their original scores.
    MIN_NUM_CANDIDATES = 1

    qn_group_indicators = get_QA_group_indicators(data_filepath)
    qn_ans_type_match = get_qn_answer_match_indicators(qn_match_filepath)
    # The stored weights are distances; flip them into similarities.
    qn_ans_sim_weights = 1 - get_qn_answer_sim_weights(qn_simweights_filepath)

    refined = []
    qn_number = 0  # Running question number (not the ID found in the XML)
    laplacians = get_weight_matrix(data_filepath,
                                   n_neighbors, sigma,
                                   p=pair_similarity_type)
    for count, L in laplacians:
        # A question without candidate answers has no scores and no number.
        if count is None:
            continue

        in_group = qn_group_indicators == qn_number
        if count <= MIN_NUM_CANDIDATES:
            # Too few points to propagate: copy the scores through.
            group_r = r[in_group]
            assert(group_r.size == count)
            refined += group_r.tolist()
        else:
            ans_type_match = qn_ans_type_match[in_group]
            ans_sim_weights = qn_ans_sim_weights[in_group]

            group_r = r[in_group]
            assert(group_r.size == L.shape[0])
            refined += propagate_scores(group_r,
                                        L,
                                        ans_type_match, ans_sim_weights,
                                        alpha, gamma, loss_type)

        qn_number += 1

    return np.asarray(refined)

## Train basic classifier to give input ranks for Rank Propagation

In [21]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

Experiments with using `RandomTreesEmbedding` before passing to LR. It is also possible to try a kernel embedding for explicit nonlinear feature mapping before applying LR.

In [22]:
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# Random-trees embedding feeding a logistic regression.
rt = RandomTreesEmbedding(n_estimators=400, max_depth=3, random_state=3713)
rt_lm = LogisticRegression(C=0.01)

pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('randomtreesembedding', RandomTreesEmbedding(max_depth=3, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=400, n_jobs=1, random_state=3713,
           sparse_output=True, verbo...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [23]:
# Test-set accuracy of the embedding + LR pipeline.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.82267633487145686

In [24]:
# Rebuild a two-column [P(negative), P(positive)] matrix from the positive
# class probability and dump the predictions in Weka format.
scores = pipeline.predict_proba(X_test)[:, 1]
positive_column = scores.reshape(-1, 1)
P = np.hstack((1 - positive_column, positive_column))
predict_for_test(y_test, y_pred, P, "rt_embed_LR.txt")

The above code that uses Random Forest to perform embedding before training LR does not work well

---

In [25]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# nb = GaussianNB()
# Sweep the BernoulliNB smoothing parameter. Each setting is fitted on the
# training split and its dev / test predictions are written in Weka format.
paras = [0.01, 0.1, 1, 2, 10, 50, 100]
for para in paras:
    nb = BernoulliNB(alpha=para).fit(X_train, y_train)

    # Dev split
    y_dev_pred, y_dev_prob = nb.predict(X_dev), nb.predict_proba(X_dev)
    dev_path = "../myclassify/test_res/NB-dev/NB-" + str(para) + ".txt"
    predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

    # Test split
    y_pred, y_prob = nb.predict(X_test), nb.predict_proba(X_test)
    test_path = "../myclassify/test_res/NB-test/NB-" + str(para) + ".txt"
    predict_for_test(y_test, y_pred, y_prob, test_path)

    # Test-set summary for this setting
    print("alpha: " + str(para))
    print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
    print("f1: {}".format(f1_score(y_test, y_pred)))
    print(confusion_matrix(y_test, y_pred))
    
#best MAP/MRR for dev when para = 2

alpha: 0.01
accuracy: 0.732366512854
f1: 0.316498316498
[[1017  216]
 [ 190   94]]
alpha: 0.1
accuracy: 0.733684904417
f1: 0.317567567568
[[1019  214]
 [ 190   94]]
alpha: 1
accuracy: 0.735003295979
f1: 0.311643835616
[[1024  209]
 [ 193   91]]
alpha: 2
accuracy: 0.73566249176
f1: 0.305025996534
[[1028  205]
 [ 196   88]]
alpha: 10
accuracy: 0.755438365194
f1: 0.285163776493
[[1072  161]
 [ 210   74]]
alpha: 50
accuracy: 0.800263678312
f1: 0.155988857939
[[1186   47]
 [ 256   28]]
alpha: 100
accuracy: 0.812788398154
f1: 0.0838709677419
[[1220   13]
 [ 271   13]]


In [26]:
from sklearn.ensemble import GradientBoostingClassifier

# Probably don't try this.
# GBM
# Grid over boosting rounds (paras1) and tree depth (paras2). Each setting is
# fitted on the training split and its dev / test predictions are written in
# Weka format.
paras1 = [50, 100, 400, 800]
paras2 = [2, 3, 6, 12, 24]
for para1 in paras1:
    for para2 in paras2:
        gbm = GradientBoostingClassifier(n_estimators=para1,
                                         max_depth=para2,
                                         random_state=47156).fit(X_train, y_train)

        # Dev split
        y_dev_pred, y_dev_prob = gbm.predict(X_dev), gbm.predict_proba(X_dev)
        dev_path = "../myclassify/test_res/GB-dev/GB-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

        # Test split
        y_pred, y_prob = gbm.predict(X_test), gbm.predict_proba(X_test)
        test_path = "../myclassify/test_res/GB-test/GB-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_test, y_pred, y_prob, test_path)

        # Test-set summary for this setting
        print("n_estimators: " + str(para1) + "; max_depth: " + str(para2))
        print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("f1: {}".format(f1_score(y_test, y_pred)))
        print(confusion_matrix(y_test, y_pred))

#best MAP/MRR for dev when para1 = 100, para2 = 2

n_estimators: 50; max_depth: 2
accuracy: 0.827949901121
f1: 0.25641025641
[[1211   22]
 [ 239   45]]
n_estimators: 50; max_depth: 3
accuracy: 0.827290705339
f1: 0.310526315789
[[1196   37]
 [ 225   59]]
n_estimators: 50; max_depth: 6
accuracy: 0.819380355966
f1: 0.308080808081
[[1182   51]
 [ 223   61]]
n_estimators: 50; max_depth: 12
accuracy: 0.817402768622
f1: 0.298734177215
[[1181   52]
 [ 225   59]]
n_estimators: 50; max_depth: 24
accuracy: 0.785102175346
f1: 0.320833333333
[[1114  119]
 [ 207   77]]
n_estimators: 100; max_depth: 2
accuracy: 0.831245880026
f1: 0.311827956989
[[1203   30]
 [ 226   58]]
n_estimators: 100; max_depth: 3
accuracy: 0.835860250494
f1: 0.385185185185
[[1190   43]
 [ 206   78]]
n_estimators: 100; max_depth: 6
accuracy: 0.823994726434
f1: 0.327455919395
[[1185   48]
 [ 219   65]]
n_estimators: 100; max_depth: 12
accuracy: 0.825972313777
f1: 0.333333333333
[[1187   46]
 [ 218   66]]
n_estimators: 100; max_depth: 24
accuracy: 0.786420566908
f1: 0.319327731092

In [27]:
import itertools
# RF
# Random-forest grid: every combination of the five hyper-parameter lists is
# fitted on the training split and its dev / test predictions are written in
# Weka format.
estimators = [100, 150, 200, 300, 400]
criterions = ["gini", "entropy"]
depths = [8, 11, 16, 20]
splits = [2, 4, 6, 8]
weights = [None, "balanced", "balanced_subsample"]
para_list = [estimators, criterions, depths, splits, weights]
paras = list(itertools.product(*para_list))

for para in paras:
    rf = RandomForestClassifier(n_estimators=para[0], 
                                    criterion=para[1], 
                                    max_depth=para[2],
                                    min_samples_split = para[3],
                                    class_weight=para[4],
                                    random_state=73514)
    rf.fit(X_train, y_train)
    # The joined hyper-parameters name the output files and the log entry.
    para_string = "-".join([str(p) for p in para])
    #dev set
    y_dev_pred = rf.predict(X_dev)
    y_dev_prob = rf.predict_proba(X_dev)
    dev_path = "../myclassify/test_res/RF-dev/RF-" + para_string + ".txt"
    predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

    #test set
    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)
    test_path = "../myclassify/test_res/RF-test/RF-" + para_string + ".txt"
    predict_for_test(y_test, y_pred, y_prob, test_path)

    #test result
    # Function-call form, like every other print in this notebook; the bare
    # `print para_string` statement is a syntax error on Python 3.
    print(para_string)
    print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
    print("f1: {}".format(f1_score(y_test, y_pred)))
    print(confusion_matrix(y_test, y_pred))
    # predict_for_test(y_test, y_pred, rf.predict_proba(X_test), "rf.txt")
    
#best MAP/MRR for dev when 150-entropy-16-8-balanced

100-gini-8-2-None
accuracy: 0.81608437706
f1: 0.0542372881356
[[1230    3]
 [ 276    8]]
100-gini-8-2-balanced
accuracy: 0.803559657218
f1: 0.493197278912
[[1074  159]
 [ 139  145]]
100-gini-8-2-balanced_subsample
accuracy: 0.794330916282
f1: 0.474747474747
[[1064  169]
 [ 143  141]]
100-gini-8-4-None
accuracy: 0.817402768622
f1: 0.0735785953177
[[1229    4]
 [ 273   11]]
100-gini-8-4-balanced
accuracy: 0.790375741595
f1: 0.480392156863
[[1052  181]
 [ 137  147]]
100-gini-8-4-balanced_subsample
accuracy: 0.786420566908
f1: 0.477419354839
[[1045  188]
 [ 136  148]]
100-gini-8-6-None
accuracy: 0.818721160185
f1: 0.061433447099
[[1233    0]
 [ 275    9]]
100-gini-8-6-balanced
accuracy: 0.790375741595
f1: 0.475247524752
[[1055  178]
 [ 140  144]]
100-gini-8-6-balanced_subsample
accuracy: 0.791694133158
f1: 0.471571906355
[[1060  173]
 [ 143  141]]
100-gini-8-8-None
accuracy: 0.816743572841
f1: 0.0413793103448
[[1233    0]
 [ 278    6]]
100-gini-8-8-balanced
accuracy: 0.800263678312
f1: 0.4

In [29]:
# LR
# lr = LogisticRegression(C=3, class_weight="balanced")
# lr = LogisticRegression(C=0.01)
# Logistic-regression grid over penalty type and inverse regularisation
# strength C. Each setting is fitted on the training split and its dev / test
# predictions are written in Weka format.
penalties = ["l1", "l2"]
cs = [0.01, 0.1, 1, 10, 100, 1000]

for para1 in penalties:
    for para2 in cs:
        # solver="liblinear" spells out the solver these runs used by default
        # and which supports both the l1 and l2 penalties; max_iter is passed
        # as an int because it is an iteration count (1e8 is a float).
        lr = LogisticRegression(penalty=para1, C=para2,
                                solver="liblinear", max_iter=int(1e8))
        lr.fit(X_train, y_train)

        #dev set
        y_dev_pred = lr.predict(X_dev)
        y_dev_prob = lr.predict_proba(X_dev)
        dev_path = "../myclassify/test_res/LG-dev/LG-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

        #test set
        y_pred = lr.predict(X_test)
        y_prob = lr.predict_proba(X_test)
        test_path = "../myclassify/test_res/LG-test/LG-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_test, y_pred, y_prob, test_path)

        #test result
        print("penalty: " + str(para1) + "; C: " + str(para2))
        print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("f1: {}".format(f1_score(y_test, y_pred)))
        print(confusion_matrix(y_test, y_pred))

#best MAP/MRR for dev when l1-10

penalty: l1; C: 0.01
accuracy: 0.829927488464
f1: 0.208588957055
[[1225    8]
 [ 250   34]]
penalty: l1; C: 0.1
accuracy: 0.832564271589
f1: 0.270114942529
[[1216   17]
 [ 237   47]]
penalty: l1; C: 1
accuracy: 0.828609096902
f1: 0.244186046512
[[1215   18]
 [ 242   42]]
penalty: l1; C: 10
accuracy: 0.828609096902
f1: 0.244186046512
[[1215   18]
 [ 242   42]]
penalty: l1; C: 100
accuracy: 0.829268292683
f1: 0.244897959184
[[1216   17]
 [ 242   42]]
penalty: l1; C: 1000
accuracy: 0.829268292683
f1: 0.244897959184
[[1216   17]
 [ 242   42]]
penalty: l2; C: 0.01
accuracy: 0.837837837838
f1: 0.28488372093
[[1222   11]
 [ 235   49]]
penalty: l2; C: 0.1
accuracy: 0.830586684245
f1: 0.263610315186
[[1214   19]
 [ 238   46]]
penalty: l2; C: 1
accuracy: 0.827949901121
f1: 0.24347826087
[[1214   19]
 [ 242   42]]
penalty: l2; C: 10
accuracy: 0.827949901121
f1: 0.239067055394
[[1215   18]
 [ 243   41]]
penalty: l2; C: 100
accuracy: 0.827949901121
f1: 0.239067055394
[[1215   18]
 [ 243   41]]
pena

In [31]:
# Reload the raw question/answer scores so this cell can be re-run on its own.
X_sim_train, y_sim_train = load_similarity_features(
    "../data/features/glove_embedding_sentence_similarities_train_300.txt")
X_sim_dev, y_sim_dev = load_similarity_features(
    "../data/features/glove_embedding_sentence_similarities_dev_300.txt")
X_sim_test, y_sim_test = load_similarity_features(
    "../data/features/glove_embedding_sentence_similarities_test_300.txt")

# Scale only the score column (statistics from the training split), then
# append it to the unscaled ARFF features.
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler().fit(X_sim_train)
X_sim_train = scaler.transform(X_sim_train)
X_sim_dev = scaler.transform(X_sim_dev)
X_sim_test = scaler.transform(X_sim_test)
X_comb_scaledsim_train = np.hstack((X_train, X_sim_train))
X_comb_scaledsim_dev = np.hstack((X_dev, X_sim_dev))
X_comb_scaledsim_test = np.hstack((X_test, X_sim_test))

In [33]:
# Logistic-regression grid on the features extended with the scaled score
# column. Each setting is fitted on the training split and its dev / test
# predictions are written in Weka format.
penalties = ["l1", "l2"]
cs = [0.01, 0.1, 1, 10, 100, 1000]

for para1 in penalties:
    for para2 in cs:
        # solver="liblinear" spells out the solver these runs used by default
        # and which supports both the l1 and l2 penalties; max_iter is passed
        # as an int because it is an iteration count (1e8 is a float).
        lr = LogisticRegression(penalty=para1, C=para2,
                                solver="liblinear", max_iter=int(1e8))
        lr.fit(X_comb_scaledsim_train, y_train)

        #dev set
        y_dev_pred = lr.predict(X_comb_scaledsim_dev)
        y_dev_prob = lr.predict_proba(X_comb_scaledsim_dev)
        dev_path = "../myclassify/test_res/LGSIM-dev/LGSIM-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

        #test set
        y_pred = lr.predict(X_comb_scaledsim_test)
        y_prob = lr.predict_proba(X_comb_scaledsim_test)
        test_path = "../myclassify/test_res/LGSIM-test/LGSIM-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_test, y_pred, y_prob, test_path)

        #test result
        print("penalty: " + str(para1) + "; C: " + str(para2))
        print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("f1: {}".format(f1_score(y_test, y_pred)))
        print(confusion_matrix(y_test, y_pred))

#best for dev: l2-0.1

penalty: l1; C: 0.01
accuracy: 0.832564271589
f1: 0.234939759036
[[1224    9]
 [ 245   39]]
penalty: l1; C: 0.1
accuracy: 0.841133816744
f1: 0.357333333333
[[1209   24]
 [ 217   67]]
penalty: l1; C: 1
accuracy: 0.840474620962
f1: 0.356382978723
[[1208   25]
 [ 217   67]]
penalty: l1; C: 10
accuracy: 0.841793012525
f1: 0.378238341969
[[1204   29]
 [ 211   73]]
penalty: l1; C: 100
accuracy: 0.841133816744
f1: 0.370757180157
[[1205   28]
 [ 213   71]]
penalty: l1; C: 1000
accuracy: 0.841133816744
f1: 0.370757180157
[[1205   28]
 [ 213   71]]
penalty: l2; C: 0.01
accuracy: 0.843770599868
f1: 0.339832869081
[[1219   14]
 [ 223   61]]
penalty: l2; C: 0.1
accuracy: 0.841133816744
f1: 0.370757180157
[[1205   28]
 [ 213   71]]
penalty: l2; C: 1
accuracy: 0.839815425181
f1: 0.362204724409
[[1205   28]
 [ 215   69]]
penalty: l2; C: 10
accuracy: 0.839815425181
f1: 0.365535248042
[[1204   29]
 [ 214   70]]
penalty: l2; C: 100
accuracy: 0.839815425181
f1: 0.365535248042
[[1204   29]
 [ 214   70]]
pe

In [34]:
import itertools
# RF
# Random-forest grid on the features extended with the scaled score column:
# every combination of the five hyper-parameter lists is fitted on the
# training split and its dev / test predictions are written in Weka format.
estimators = [100, 150, 200, 300, 400]
criterions = ["gini", "entropy"]
depths = [8, 11, 16, 20]
splits = [2, 4, 6, 8]
weights = [None, "balanced", "balanced_subsample"]
para_list = [estimators, criterions, depths, splits, weights]
paras = list(itertools.product(*para_list))

for para in paras:
    rf = RandomForestClassifier(n_estimators=para[0], 
                                    criterion=para[1], 
                                    max_depth=para[2],
                                    min_samples_split = para[3],
                                    class_weight=para[4],
                                    random_state=73514)
    rf.fit(X_comb_scaledsim_train, y_train)
    # The joined hyper-parameters name the output files and the log entry.
    para_string = "-".join([str(p) for p in para])
    #dev set
    y_dev_pred = rf.predict(X_comb_scaledsim_dev)
    y_dev_prob = rf.predict_proba(X_comb_scaledsim_dev)
    dev_path = "../myclassify/test_res/RFSIM-dev/RFSIM-" + para_string + ".txt"
    predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

    #test set
    y_pred = rf.predict(X_comb_scaledsim_test)
    y_prob = rf.predict_proba(X_comb_scaledsim_test)
    test_path = "../myclassify/test_res/RFSIM-test/RFSIM-" + para_string + ".txt"
    predict_for_test(y_test, y_pred, y_prob, test_path)

    #test result
    # Function-call form, like every other print in this notebook; the bare
    # `print para_string` statement is a syntax error on Python 3.
    print(para_string)
    print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
    print("f1: {}".format(f1_score(y_test, y_pred)))
    print(confusion_matrix(y_test, y_pred))
    # predict_for_test(y_test, y_pred, rf.predict_proba(X_test), "rf.txt")
    
#best MAP/MRR for dev when 300-entropy-16-8-balanced_subsample

100-gini-8-2-None
accuracy: 0.82201713909
f1: 0.105960264901
[[1231    2]
 [ 268   16]]
100-gini-8-2-balanced
accuracy: 0.814106789717
f1: 0.494623655914
[[1097  136]
 [ 146  138]]
100-gini-8-2-balanced_subsample
accuracy: 0.817402768622
f1: 0.509734513274
[[1096  137]
 [ 140  144]]
100-gini-8-4-None
accuracy: 0.82201713909
f1: 0.105960264901
[[1231    2]
 [ 268   16]]
100-gini-8-4-balanced
accuracy: 0.812788398154
f1: 0.506944444444
[[1087  146]
 [ 138  146]]
100-gini-8-4-balanced_subsample
accuracy: 0.818721160185
f1: 0.516695957821
[[1095  138]
 [ 137  147]]
100-gini-8-6-None
accuracy: 0.821357943309
f1: 0.117263843648
[[1228    5]
 [ 266   18]]
100-gini-8-6-balanced
accuracy: 0.820039551747
f1: 0.521891418564
[[1095  138]
 [ 135  149]]
100-gini-8-6-balanced_subsample
accuracy: 0.826631509558
f1: 0.536155202822
[[1102  131]
 [ 132  152]]
100-gini-8-8-None
accuracy: 0.821357943309
f1: 0.0996677740864
[[1231    2]
 [ 269   15]]
100-gini-8-8-balanced
accuracy: 0.820698747528
f1: 0.5211

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

# Probably don't try this.
# GBM
# Gradient-boosting grid on the features extended with the scaled score
# column: boosting rounds (paras1) by tree depth (paras2). Each setting is
# fitted on the training split and its dev / test predictions are written in
# Weka format.
paras1 = [50, 100, 400, 800]
paras2 = [2, 3, 6, 12, 24]
for para1 in paras1:
    for para2 in paras2:
        gbm = GradientBoostingClassifier(n_estimators=para1,
                                         max_depth=para2,
                                         random_state=47156).fit(X_comb_scaledsim_train, y_train)

        # Dev split
        y_dev_pred, y_dev_prob = (gbm.predict(X_comb_scaledsim_dev),
                                  gbm.predict_proba(X_comb_scaledsim_dev))
        dev_path = "../myclassify/test_res/GBSIM-dev/GBSIM-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

        # Test split
        y_pred, y_prob = (gbm.predict(X_comb_scaledsim_test),
                          gbm.predict_proba(X_comb_scaledsim_test))
        test_path = "../myclassify/test_res/GBSIM-test/GBSIM-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_test, y_pred, y_prob, test_path)

        # Test-set summary for this setting
        print("n_estimators: " + str(para1) + "; max_depth: " + str(para2))
        print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("f1: {}".format(f1_score(y_test, y_pred)))
        print(confusion_matrix(y_test, y_pred))

#best MAP/MRR for dev when para1 = 50, para2 = 2

n_estimators: 50; max_depth: 2
accuracy: 0.8391562294
f1: 0.371134020619
[[1201   32]
 [ 212   72]]
n_estimators: 50; max_depth: 3
accuracy: 0.838497033619
f1: 0.406779661017
[[1188   45]
 [ 200   84]]
n_estimators: 50; max_depth: 6
accuracy: 0.803559657218
f1: 0.365957446809
[[1133  100]
 [ 198   86]]
n_estimators: 50; max_depth: 12
accuracy: 0.802900461437
f1: 0.380952380952
[[1126  107]
 [ 192   92]]
n_estimators: 50; max_depth: 24
accuracy: 0.78707976269
f1: 0.396261682243
[[1088  145]
 [ 178  106]]
n_estimators: 100; max_depth: 2
accuracy: 0.8391562294
f1: 0.383838383838
[[1197   36]
 [ 208   76]]
n_estimators: 100; max_depth: 3
accuracy: 0.840474620962
f1: 0.418269230769
[[1188   45]
 [ 197   87]]
n_estimators: 100; max_depth: 6
accuracy: 0.81015161503
f1: 0.392405063291
[[1136   97]
 [ 191   93]]
n_estimators: 100; max_depth: 12
accuracy: 0.813447593935
f1: 0.388768898488
[[1144   89]
 [ 194   90]]
n_estimators: 100; max_depth: 24
accuracy: 0.792353328939
f1: 0.411214953271
[[10

In [36]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# nb = GaussianNB()
# Sweep the additive-smoothing strength (alpha) of a Bernoulli naive Bayes
# model. Each model is fit on X_comb_scaledsim_train, its dev and test
# predictions are written out via predict_for_test, and test-set metrics are
# printed. After the loop, `nb` holds the model for the last alpha in `paras`.
# NOTE(review): per the scikit-learn docs BernoulliNB thresholds its inputs at
# `binarize` (0.0 by default) -- presumably intended for these scaled features;
# confirm.
paras = [0.01, 0.1, 1, 2, 10, 50, 100]
for para in paras:
    nb = BernoulliNB(alpha=para).fit(X_comb_scaledsim_train, y_train)

    # The dev and test result files share the same "<alpha>.txt" tail.
    file_tail = str(para) + ".txt"

    #dev set
    y_dev_pred = nb.predict(X_comb_scaledsim_dev)
    y_dev_prob = nb.predict_proba(X_comb_scaledsim_dev)
    dev_path = "../myclassify/test_res/NBSIM-dev/NBSIM-" + file_tail
    predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

    #test set
    y_pred = nb.predict(X_comb_scaledsim_test)
    y_prob = nb.predict_proba(X_comb_scaledsim_test)
    test_path = "../myclassify/test_res/NBSIM-test/NBSIM-" + file_tail
    predict_for_test(y_test, y_pred, y_prob, test_path)

    #test result
    test_accuracy = accuracy_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)
    test_confusion = confusion_matrix(y_test, y_pred)
    print("alpha: " + str(para))
    print("accuracy: {}".format(test_accuracy))
    print("f1: {}".format(test_f1))
    print(test_confusion)

#best MAP/MRR for dev when para = 1

alpha: 0.01
accuracy: 0.751483190508
f1: 0.374792703151
[[1027  206]
 [ 171  113]]
alpha: 0.1
accuracy: 0.751483190508
f1: 0.374792703151
[[1027  206]
 [ 171  113]]
alpha: 1
accuracy: 0.754779169413
f1: 0.375838926174
[[1033  200]
 [ 172  112]]
alpha: 2
accuracy: 0.750823994726
f1: 0.361486486486
[[1032  201]
 [ 177  107]]
alpha: 10
accuracy: 0.776532630191
f1: 0.346820809249
[[1088  145]
 [ 194   90]]
alpha: 50
accuracy: 0.806855636124
f1: 0.169971671388
[[1194   39]
 [ 254   30]]
alpha: 100
accuracy: 0.811470006592
f1: 0.0833333333333
[[1218   15]
 [ 271   13]]


## Test Rank Propagation
---

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# For testing
#
# Scores the test set with a trained classifier, passes the positive-class
# probabilities through rank_propagation, and compares the thresholded
# (>= 0.5) adjusted scores against the classifier's own predictions.

# Set this to one of the trained classifiers above (after training)
clf = nb

# Feature matrix to evaluate on. It has to be the test-set counterpart of the
# matrix `clf` was fit on; the best model is used with X_comb_scaledsim_test.
# NOTE(review): the NB sweep earlier in this notebook fits `nb` on
# X_comb_scaledsim_train, so with clf = nb this presumably needs to be
# X_comb_scaledsim_test -- confirm which classifier is intended here.
X_eval = X_test

# This file contains entries indicating whether answer has an entity matching type required by answer prediction
# Line corresponding to entry has 1 if there is a match, else 0
qn_match_filepath = "../data/QuestionType/test_answer_type_match.txt"

# This file contains the cosine *distance* between question and answer
qn_simweights_filepath = "../data/features/glove_embedding_sentence_similarities_test_300.txt"

# Run predict_proba once and slice out the class-1 column. The column is copied
# so that raw_scores does not share memory with raw_full_scores, which is
# written out unmodified in the Weka export cell.
raw_full_scores = clf.predict_proba(X_eval)
raw_scores = raw_full_scores[:, 1].copy()
y_pred = clf.predict(X_eval)

# I think we have to keep the number of neighbors in the graph small because there are only a few
# positive examples. We don't want to link them to too many negative ones.
# This works well with 300 dim glove vector
# scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=1, sigma=1.0, n_neighbors=3, gamma=1.5)

# L2 loss for | r - y | gives 0.6377 and 0.755 
# scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=0.5, sigma=1, n_neighbors=3, gamma=1.5,
#                           loss_type=1,
#                           pair_similarity_type=2)

# MRR = 0.7474 vs 0.7399, MAP = 0.6462, 0.6272 for pair_similarity_type=2, sigma=1.0
# Generated with LR trained with C = 100, max_iter = 10000
# map            all 0.6448
# recip_rank     all 0.7537
# scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=0.5, sigma=2.0, n_neighbors=3, gamma=1.5,
#                           loss_type=1, 
#                           pair_similarity_type=1)

# This with a LR trained with C = 2 gives MRR = 0.7674 MAP = 0.6518
# For this to work, we have to use X_comb_scaledsim_test
# scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=2.0, sigma=1.0, n_neighbors=5, gamma=1.0,
#                           loss_type=1, 
#                           pair_similarity_type=2)

scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
                          raw_scores, alpha=1.0, sigma=1.0, n_neighbors=3, gamma=1.5,
                          loss_type=1, 
                          pair_similarity_type=2)

# Threshold the propagated scores once and reuse the result everywhere below.
y_pred_adjusted = (scores >= 0.5)
n_errors_adjusted = np.sum(y_test != y_pred_adjusted)

print("Adjusted accuracy: {}".format(accuracy_score(y_test, y_pred_adjusted)))
print("Original accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Adjusted f1: {}".format(f1_score(y_test, y_pred_adjusted)))
print("Original f1: {}".format(f1_score(y_test, y_pred)))
print(np.sum(np.abs(raw_scores - scores)) / len(scores))  # Mean absolute change in score
print(np.max(np.abs(raw_scores - scores)))  # Largest absolute change in score
print(n_errors_adjusted)  # Misclassified examples after adjustment
print(n_errors_adjusted / float(len(scores)))  # ... as a fraction of the test set
# Compare the labels directly instead of subtracting an int array and a bool array.
print("Number of disagreement: {}".format(np.sum(y_pred != y_pred_adjusted)))
print("Adjusted")
print(classification_report(y_test, y_pred_adjusted, digits=4))
print("Original")
print(classification_report(y_test, y_pred, digits=4))

In [None]:
# Converts to weka format
# Build the two-column probability matrix predict_for_test indexes into:
# column 0 is (1 - score), column 1 is the propagated score itself.
negative_column = (1 - scores).reshape(-1, 1)
positive_column = scores.reshape(-1, 1)
P = np.concatenate((negative_column, positive_column), axis=1)

# One file for the rank-propagation-adjusted predictions, one for the raw classifier.
predict_for_test(y_test, y_pred_adjusted, P, "nb_type_term_adjusted.txt")
predict_for_test(y_test, y_pred, raw_full_scores, "nb_no_adjust.txt")

LR trained with C = 1, max_iter = 10000:
scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
                          raw_scores, alpha=0.5, sigma=1, n_neighbors=3, gamma=1.5,
                          loss_type=1,
                          pair_similarity_type=2)

Adjusted
             precision    recall  f1-score   support

          0       0.85      0.98      0.91      1233
          1       0.74      0.26      0.38       284

avg / total       0.83      0.84      0.81      1517

Original
             precision    recall  f1-score   support

          0       0.83      0.98      0.90      1233
          1       0.69      0.15      0.24       284

avg / total       0.81      0.83      0.78      1517

MRR = 0.7474 vs 0.7399, MAP = 0.6462 vs 0.6272
LR trained with C = 100, max_iter = 10000

scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
                          raw_scores, alpha=0.5, sigma=1, n_neighbors=3, gamma=1.5,
                          loss_type=1, 
                          pair_similarity_type=2)
                          
Adjusted
             precision    recall  f1-score   support

          0       0.85      0.98      0.91      1233
          1       0.73      0.24      0.36       284

avg / total       0.83      0.84      0.81      1517

Original
             precision    recall  f1-score   support

          0       0.83      0.98      0.90      1233
          1       0.68      0.14      0.24       284

avg / total       0.81      0.83      0.78      1517
                          

---

In [None]:
# For training
# Column 1 of predict_proba (the class-1 probability) on the training features.
train_raw_scores = clf.predict_proba(X_train)[:, 1]
# NOTE(review): the test-set call in this notebook passes
# (file, qn_match_filepath, qn_simweights_filepath, raw_scores, ...) positionally,
# whereas here the scores are the second positional argument and no match /
# similarity files are supplied -- presumably written against an older
# rank_propagation signature. Confirm before running.
scores = rank_propagation(train_file, train_raw_scores, alpha=2, sigma=2, n_neighbors=11)

# Accuracy of the propagated scores thresholded at 0.5.
print(accuracy_score(y_train, scores >= 0.5))
print(np.sum(np.abs(train_raw_scores - scores)) / len(scores))  # Mean absolute difference between raw and propagated scores

In [None]:
# Training-set accuracy of clf's own predictions (no rank propagation).
accuracy_score(y_train, clf.predict(X_train))