In [1]:
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler

import cvxpy

## Output in Weka format

In [2]:
import os
def predict_for_test(test, predict, probability, path):
    with open(path, 'w') as f:
        f.write("=== Predictions on test data ===\n")
        f.write(" inst#     actual  predicted error prediction\n")
        for i in range(len(test)):
            string = [str(i + 1)]
            if test[i] == 1:
                string.append("1:positive")
            else:
                string.append("2:negative")
            if predict[i] == 1:
                string.append("1:positive")
            else:
                string.append("2:negative")
            if test[i] == predict[i]:
                string.append(" " * 5)
            else:
                string.append(" " * 2 + "+" + " " * 2)
            if predict[i] == 1:
                string.append(str(probability[i][1]))
            else:
                string.append(str(probability[i][0]))
            string = " ".join(string) + "\n"
            f.write(string)   
    

## Load data

In [3]:
import arff

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#import data
def importData(path):
#     dataset = arff.load(open(path, 'rb'))
    dataset = arff.load(open(path))    
    data = np.array(dataset['data'])
    #print data[:10]

    #extract features and labels
    features = []
    labels = []
    for d in data:
        f = []
        for i in range(len(d) - 1):
            num = float(d[i])
            if int(num) == num:
                num = int(num)
            f.append(num)
        features.append(f)

        if d[-1] == "positive":
            labels.append(1)
        else:
            labels.append(0)
    return np.asarray(features), np.asarray(labels)

---
## Word embedding vectors

In [84]:
# Map each word to an index
ndim = 300
glove_path = "../data/glove_embeddings/glove.6B.{}d.txt".format(ndim)
with open(glove_path) as lines:
    w2idx = {line.split()[0].decode("utf-8"): i for i, line in enumerate(lines)}

AttributeError: 'str' object has no attribute 'decode'

In [4]:
# Map each word to an index
ndim = 300
glove_path = "../data/glove_embeddings/glove.6B.{}d.txt".format(ndim)
with open(glove_path) as lines:
    w2idx = {line.split()[0]: i for i, line in enumerate(lines)}

In [5]:
vectors = np.empty((len(w2idx), ndim), dtype=np.float)
with open(glove_path) as lines:
    for i, line in enumerate(lines):
        vectors[i] = np.asarray(list(map(float, line.split()[1:])))

In [6]:
import string
from nltk.corpus import stopwords

words_to_exclude = frozenset(string.punctuation) | frozenset(["..", "..."])
words_to_exclude |= frozenset(stopwords.words("english"))

---

## Rank propagation

---
Get vector indicating which example belongs to which question group


In [117]:
def get_QA_group_count(infile):
    """Returns number of answers for next question in file."""
    
    line = infile.readline().strip()
    if line == "":
        return None
    
    if not line.startswith("<QApairs"):
        raise Exception("Invalid data format: {}<-----".format(line))
    
    sentence_count = 0
    while not line.strip().startswith("</QApairs"):
        line = infile.readline().replace('\t', ' ')                    
        if line.strip().lower().startswith("<positive") or line.strip().lower().startswith("<negative"):
            sentence_count += 1
    
    return sentence_count

def get_QA_group_indicators(filepath):    
    with open(filepath) as infile:
        indicators = []
        qn_number = 0
        while infile:
            count = get_QA_group_count(infile)

            # Check for EOF
            if count == None:
                break

            if count <= 0:
                print("Question {} has no answer".format(qn_number))
            
            if count > 0:                
                indicators += ([qn_number] * count)
                qn_number += 1
        
    return np.asarray(indicators)

# Get a question and its candidate answers
def get_QA_group(infile):
    question = []
    answers = []
    line = infile.readline().strip()
    if line == "":
        return None
    
    if not line.startswith("<QApairs"):
        raise Exception("Invalid data format: {}<-----".format(line))
        
    while not line.strip().startswith("</QApairs"):
        line = infile.readline().replace('\t', ' ')
        if line.strip().lower().startswith("<question"):
            line = infile.readline().replace('\t', ' ')
            question.append(line.strip())
        elif line.strip().lower().startswith("<positive"):
            line = infile.readline().replace('\t', ' ')
            answers.append(("positive", line.strip()))
        elif line.strip().lower().startswith("<negative"):
            line = infile.readline().replace('\t', ' ')
            answers.append(("negative", line.strip()))
    
    return {"question": question, "answers": answers}                

from nltk.tokenize import WhitespaceTokenizer

def extract_vector(sentence, exclude, w2idx, wordvectors):
    """Compute the vector for a sentence by averaging the words in the sentence that has word embeddings"""
    # Tokenize sentence
    splitter = WhitespaceTokenizer()
    tokens = splitter.tokenize(sentence)    
    # Remove stopwords and punctuation
    words = [t.lower() for t in tokens if t.lower() not in exclude ]
    
    # If we cannot find any words, we can consider returning a vector of 0
    # and set the resulting cosine similarity to 0 otherwise will result in nan
    # because cosine similarity will divide by 0.
    assert(len(words) > 0)
            
    # Average words in sentence that are in word matrix
    try:
        avg_vec = np.mean([wordvectors[w2idx[w]] for w in words if w in w2idx ] 
                                                 or [np.zeros(wordvectors.shape[1])], 
                           axis=0)
        if not np.any(avg_vec):
            print("Tokens cannot be found: {}".format(words))
        assert(np.any(avg_vec))
        return avg_vec
    except UnicodeDecodeError:
        print(line.strip())
        raise

def compute_pairwise_distance_matrix(X, k, p=2):
    """Compute pairwise distances between each point in X
    and its k-nearest neighbors."""

    from scipy.spatial import KDTree
    kdtree = KDTree(X)
    A = np.zeros((X.shape[0], X.shape[0]), dtype=np.float)
    for i, x in enumerate(X):
        distances, idxs = kdtree.query(x, k+1, p)  # k+1 as one pt is the pt itself.
        for d, j in zip(distances, idxs):
            A[i, j] = d

    # p = 2 corresponds to gaussian kernel. p = 1 corresponds to Laplacian kernel.
    if p == 2:  # Store squared euclidean for L2 distance otherwise if p = 1 just store absolute dist.
        A = A ** 2

            
    return A

Compute weight matrix (i.e., the Graph Laplacian) $L$ for each set of question and its candidate answers

In [8]:
import scipy as sp

# Compute the scores/features for a dataset
def get_weight_matrix(input_file, n_neighbors=5, sigma=1.0, eps=0.0001, p=2):
    """Compute weight matrix for question and answer sentences 
    """
    with open(input_file) as infile:
        num_questions = 0
        while infile:
            group = get_QA_group(infile)

            # Check for EOF
            if group is None:
                break
                
            # Extract question vector
            question = group["question"]
            qvec = extract_vector(question[0], words_to_exclude, w2idx, vectors)

            scores = []
            answer_vectors = []
            for (label, sentence) in group["answers"]:
                # Compute similarity with question vector
                vec = extract_vector(sentence, words_to_exclude, w2idx, vectors) # TODO: Pass these in as args
                answer_vectors.append(vec)
                cosine_distance = sp.spatial.distance.cosine(qvec, vec)
                scores.append((label, cosine_distance))

            # Compute pairwise distances between the answer vectors for K nearest neighbor
            k = min(n_neighbors, len(answer_vectors) - 1) # Minus 1 because have to exclude itself
            # Not enough to do rank propagation. Just keep original scores.
            if k < 0:
                yield None, None
            elif k == 0:
                yield 1, None
            else:                
                answer_vectors = np.vstack(answer_vectors)
                W = compute_pairwise_distance_matrix(answer_vectors, k, p)
                W = np.maximum(W, W.T)  # Ensure W symmetric.
                W[W > 0] = np.exp(- W[W > 0] / (2 * sigma**2))  # Apply gaussian kernel
                D = np.diag(np.sum(W, axis=1))  # Row sum of W
                L = D - W
#                 L = L + eps * np.eye(len(answer_vectors))  # Improve the condition of the graph laplacian                
                Dinvsqrt = np.sqrt(np.linalg.pinv(D))                
                # Need to ensure that Dinvsqrt does not have NAN due to division by 0
                assert(not np.any(np.isnan(Dinvsqrt)))                
                L = Dinvsqrt.dot(L).dot(Dinvsqrt)  # Normalized graph laplacian
                
#                 assert(is_pos_def(Dinvsqrt))
#                 assert(is_pos_def(L))
                
                yield L.shape[0], L
            
            num_questions += 1

---
Load question answer similarity values from file (probably should compute it here).

**NOTE**: Similarity is the wrong term to use here. The values in the file are actually cosine **distances**.
To convert it to similarity, we need to subtract it from 1.


In [9]:
def load_similarity_features(filepath):
    features = []
    labels = []
    map_label = {"positive": 1, "negative": 0}
    with open(filepath) as infile:
        for line in infile:
            label, score = line.strip().split(',')
            score = float(score)
            label = map_label[label]
            features.append(score)
            labels.append(label)
            
    return np.asarray(features).reshape(-1, 1), np.asarray(labels)

In [10]:
#import training data and test data
train_datapath = "../myclassify/qa.train.arff"
test_datapath = "../myclassify/qa.test.arff"
dev_datapath = "../myclassify/qa.dev.arff"

X_train, y_train = importData(train_datapath)
X_test, y_test = importData(test_datapath)
X_dev, y_dev = importData(dev_datapath)

In [74]:
train_file = "../data/answerSelectionExperiments/data/train-less-than-40.xml"
dev_file = "../data/answerSelectionExperiments/data/dev-less-than-40.xml"
test_file = "../data/answerSelectionExperiments/data/test-less-than-40.xml"
dev_qn_group_indicators = get_QA_group_indicators(dev_file)
qn_group_indicators = get_QA_group_indicators(test_file)

AssertionError: Question has no answer

In [103]:
# hidden orig features
X_sim_train, y_sim_train = load_similarity_features("../data/features/glove_embedding_sentence_similarities_train_300.txt")
X_sim_dev, y_sim_dev = load_similarity_features("../data/features/glove_embedding_sentence_similarities_dev_300.txt")
X_sim_test, y_sim_test = load_similarity_features("../data/features/glove_embedding_sentence_similarities_test_300.txt")

X_combined_train = np.hstack((X_train, X_sim_train))
X_combined_dev = np.hstack((X_dev, X_sim_dev))
X_combined_test = np.hstack((X_test, X_sim_test))

# Scale combined data
scaler = RobustScaler()
scaler.fit(X_combined_train)
X_comb_scaled_train = scaler.transform(X_combined_train)
X_comb_scaled_dev = scaler.transform(X_combined_dev)
X_comb_scaled_test = scaler.transform(X_combined_test)

# Only normalize the similarity scores
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(X_sim_train)
X_sim_train = scaler.transform(X_sim_train)
X_sim_dev = scaler.transform(X_sim_dev)
X_sim_test = scaler.transform(X_sim_test)
X_comb_scaledsim_train = np.hstack((X_train, X_sim_train))
X_comb_scaledsim_dev = np.hstack((X_dev, X_sim_dev))
X_comb_scaledsim_test = np.hstack((X_test, X_sim_test))

In [12]:
X_sim_train, y_sim_train = load_similarity_features("../data/features/glove_embedding_sentence_similarities_train_300.txt")
X_sim_dev, y_sim_dev = load_similarity_features("../data/features/glove_embedding_sentence_similarities_dev_300.txt")
X_sim_test, y_sim_test = load_similarity_features("../data/features/glove_embedding_sentence_similarities_test_300.txt")

X_combined_train = np.hstack((X_train, X_sim_train))
X_combined_dev = np.hstack((X_dev, X_sim_dev))
X_combined_test = np.hstack((X_test, X_sim_test))

# Scale combined data
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

scaler.fit(X_combined_train)
X_comb_scaled_train = scaler.transform(X_combined_train)
X_comb_scaled_dev = scaler.transform(X_combined_dev)
X_comb_scaled_test = scaler.transform(X_combined_test)

# Only normalize the similarity scores
scaler.fit(X_sim_train)
X_sim_train = scaler.transform(X_sim_train)
X_sim_dev = scaler.transform(X_sim_dev)
X_sim_test = scaler.transform(X_sim_test)
X_comb_scaledsim_train = np.hstack((X_train, X_sim_train))
X_comb_scaledsim_dev = np.hstack((X_dev, X_sim_dev))
X_comb_scaledsim_test = np.hstack((X_test, X_sim_test))

---
## Propagate rank score


In [13]:
from cvxpy import Variable, Minimize, norm, quad_form, Problem

def is_pos_def(x):
    """Check if a matrix is positive definite. For debugging purposes."""
    return np.all(np.linalg.eigvals(x) > 0)

# Version 2 that uses the ans_type_match and question answer similarity weights
def propagate_scores(r, L, ans_type_match, ans_sim_weights, alpha=1.0, gamma=1.0, loss_type=1, beta=1.0):
    """Solve convex optimization problem to get new scores"""
        
    n = r.size
    y = Variable(n)    
    assert(len(ans_type_match) == n and len(ans_sim_weights) == n)
    
    # If no type match we just ignore the type term
    if not np.any(ans_type_match):
        objective = Minimize( beta * norm(r - y, loss_type) + alpha * quad_form(y, L) )
    else:
        print("Answer indexes with type match: {}".format([i for (i, match) in enumerate(ans_type_match)
                                                           if match == 1]))
        type_term = sum( ans_sim_weights[i] * cvxpy.abs(1 - y[i]) 
                        for i, match in enumerate(ans_type_match) if match == 1 )
        objective = Minimize( beta * norm(r - y, loss_type) + alpha * quad_form(y, L) + gamma * type_term)
            
    constraints = [0 <= y, y <= 1]
    prob = Problem(objective, constraints)    

    # The optimal objective is returned by prob.solve().
    result = prob.solve(verbose=False)      
    assert(prob.status == "optimal")
    
    return y.value.flatten().tolist()[0]

def get_qn_answer_match_indicators(filepath):
    with open(filepath) as infile:
        return np.asarray([int(x.strip()) for x in infile])
    
def get_qn_answer_sim_weights(filepath):
    with open(filepath) as infile:
        return np.asarray([float(x.split(',')[1]) for x in infile])

def rank_propagation(data_filepath, qn_match_filepath, qn_simweights_filepath, r, 
                     alpha=1.0, sigma=1.0, n_neighbors=5, gamma=1.0, 
                     loss_type=1, 
                     pair_similarity_type=2, beta=1):
    total_count = 0

    # Get qn group indicator
    qn_group_indicators = get_QA_group_indicators(data_filepath)
    
    # Get qn answer type match
    qn_ans_type_match = get_qn_answer_match_indicators(qn_match_filepath)
    
    # Get qn answer similarity weights
    # The weights are actually distances so we subtract them from 1 to convert distance to similarity
    qn_ans_sim_weights = 1 - get_qn_answer_sim_weights(qn_simweights_filepath)
    
    qn_number = 0  # Current question number (NOTE: This is not ID in XML)

    scores = []  # To store the final refined scores
    # L is actually the graph Laplacian matrix
    for (count, L) in get_weight_matrix(data_filepath, 
                                        n_neighbors, sigma, 
                                        p=pair_similarity_type):
        # Skip question without candidate answers
        if count is None:
            continue

        print("Processing question {}".format(qn_number))
            
        # Not enough points to propagate. Just use original value.
        MIN_NUM_CANDIDATES = 1
        if count <= MIN_NUM_CANDIDATES:
            assert(r[qn_group_indicators == qn_number].size == count)
            scores += r[qn_group_indicators == qn_number].tolist()
        else:
            # Get indicator vector for which answer has matching type
            ans_type_match = qn_ans_type_match[qn_group_indicators == qn_number]
            
            # Get question / answer similarity weights
            ans_sim_weights = qn_ans_sim_weights[qn_group_indicators == qn_number]                        
            
            # Propagate and append new scores
            assert(r[qn_group_indicators == qn_number].size == L.shape[0])
            new_scores = propagate_scores(r[qn_group_indicators == qn_number],
                                          L, 
                                          ans_type_match, ans_sim_weights,
                                          alpha, gamma, loss_type,
                                          beta=beta # Ignore data term
                                         )                
            scores += new_scores

        qn_number += 1
        total_count += count

    return np.asarray(scores)

## Train basic classifier to give input ranks for Rank Propagation

In [14]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, average_precision_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

In [115]:
lr = LogisticRegression(C=0.75, penalty="l2", class_weight="balanced")

Xtr = X_combined_train
Xdev = X_combined_dev
Xtest = X_combined_test

lr.fit(Xtr, y_train)
y_pred = lr.predict(Xdev)

# lr.fit(X_comb_scaledsim_train, y_train)
# y_pred = lr.predict(X_comb_scaledsim_dev)

# lr.fit(X_train, y_train)
# y_pred = lr.predict(X_dev)

f1_score(y_dev, y_pred)

0.49594813614262567

In [128]:
clf = lr

# This file contains entries indicating whether answer has a entity matching type required by answer prediction
# Line corresponding to entry has 1 if there is a match, else 0
qn_match_filepath = "../data/QuestionType/dev_answer_type_match.txt"

# This file contains the cosine *distance* between question and answer
qn_simweights_filepath = "../data/features/glove_embedding_sentence_similarities_dev_300.txt"

raw_scores = clf.predict_proba(Xdev)[:, 1]
raw_full_scores = clf.predict_proba(Xdev)
y_pred = clf.predict(Xdev)

scores = rank_propagation(dev_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=1, sigma=1.0, n_neighbors=3, gamma=1.,
#                           raw_scores, alpha=2, sigma=1, n_neighbors=3, gamma=1.5,
#                           raw_scores, alpha=0.5, sigma=0.5, n_neighbors=3, gamma=1.5,  # Used for generating examples
#                           raw_scores, alpha=0.07, sigma=0.5, n_neighbors=3, gamma=1.75,
                          raw_scores, alpha=0.1, sigma=0.5, n_neighbors=3, gamma=1.75,                          
                          loss_type=1, 
                          pair_similarity_type=2,
                          beta=1)



# f1_score(y_dev, (scores >= 0.5))
average_precision_score(y_dev, raw_scores), average_precision_score(y_dev, scores)

Question 36 has no answer
Processing question 0
Processing question 1
Processing question 2
Answer indexes with type match: [11, 16, 31, 39, 45, 54, 56, 58]
Processing question 3
Answer indexes with type match: [0, 1, 2, 3, 5, 7, 9, 10]
Processing question 4
Answer indexes with type match: [0, 3]
Processing question 5
Processing question 6
Processing question 7
Processing question 8
Processing question 9
Processing question 10
Processing question 11
Processing question 12
Answer indexes with type match: [0, 1]
Processing question 13
Answer indexes with type match: [0, 1, 2, 3, 4, 5]
Processing question 14
Processing question 15
Answer indexes with type match: [0, 2, 3, 4, 5, 6, 7, 8, 13, 14, 17, 23, 28, 31]
Processing question 16
Processing question 17
Answer indexes with type match: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 21, 23, 25, 33, 34, 37, 40, 45, 49, 50, 52, 60, 63, 67, 68, 69, 71, 72, 73, 79, 80, 82, 83, 85, 86]
Processing question 18
Processing question

(0.4970355010047266, 0.56346281780192553)

In [139]:
clf = lr

# This file contains entries indicating whether answer has a entity matching type required by answer prediction
# Line corresponding to entry has 1 if there is a match, else 0
qn_match_filepath = "../data/QuestionType/test_answer_type_match.txt"

# This file contains the cosine *distance* between question and answer
qn_simweights_filepath = "../data/features/glove_embedding_sentence_similarities_test_300.txt"

raw_scores = clf.predict_proba(Xtest)[:, 1]
raw_full_scores = clf.predict_proba(Xtest)
y_pred = clf.predict(Xtest)

scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
                          raw_scores, alpha=0.05, sigma=1.5, n_neighbors=5, gamma=1.75,
                          loss_type=1, 
                          pair_similarity_type=2)

# f1_score(y_dev, (scores >= 0.5))
average_precision_score(y_test, raw_scores), average_precision_score(y_test, scores)

Processing question 0
Processing question 1
Processing question 2
Processing question 3
Processing question 4
Processing question 5
Processing question 6
Processing question 7
Answer indexes with type match: [0, 1, 2, 3, 4, 5, 6, 7, 11, 18, 23, 33, 35, 36, 43, 44, 46, 48, 50, 52, 54, 61, 63, 66, 79, 83, 87]
Processing question 8
Processing question 9
Processing question 10
Processing question 11
Processing question 12
Answer indexes with type match: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 26, 33, 41]
Processing question 13
Processing question 14
Processing question 15
Processing question 16
Processing question 17
Answer indexes with type match: [0, 1, 4, 5, 6]
Processing question 18
Processing question 19
Answer indexes with type match: [1, 5]
Processing question 20
Processing question 21
Processing question 22
Answer indexes with type match: [0, 1, 2, 3]
Processing question 23
Processing question 24
Processing question 25
Processing question 26
Answer indexes with type 

(0.46721282276590392, 0.47723352403600355)

In [180]:
import math

def average_precision_by_group(y_true, y_pred, qn_group_indicators):
    ap = []    
    group_ids = np.unique(qn_group_indicators)
    for i in group_ids:
        mask = (qn_group_indicators == i)        
        score = average_precision_score(y_true[mask], y_pred[mask])
        ap.append(score)

    return ap

def mean_average_precision(y_true, y_pred, qn_group_indicators):
    ap = average_precision_by_group(y_true, y_pred, qn_group_indicators)
    ap = [x for x in ap if not math.isnan(x)]
    return sum(ap) / float(len(ap))

def reciprocal_rank_by_group(y_true, y_pred, qn_group_indicators):
    rank = []    
    group_ids = np.unique(qn_group_indicators)
    for i in group_ids:
        mask = (qn_group_indicators == i)
        x = y_true[mask]
        y = y_pred[mask]
        idx = np.argsort(-y)
        x = x[idx]
        for r, label in enumerate(x, 1):
            if label == 1:
                rank.append((1.0 / r, i))
                break
    
    assert all(not math.isnan(r[0]) for r in rank)
    return rank

def mean_reciprocal_rank(y_true, y_pred, qn_group_indicators):
    rr = reciprocal_rank_by_group(y_true, y_pred, qn_group_indicators)
    rr = [r[0] for r in rr]
    return sum(rr) / float(len(rr))

In [149]:
orig_aps = average_precision_by_group(y_dev, raw_scores, dev_qn_group_indicators)

  recall = tps / tps[-1]


In [150]:
new_aps = average_precision_by_group(y_dev, scores, dev_qn_group_indicators)

  recall = tps / tps[-1]


In [184]:
map_gain = []
for i, (x, y) in enumerate(zip(orig_aps, new_aps)):
    if math.isnan(x) or math.isnan(y):
        continue
    map_gain.append((y - x, i))

map_gain = sorted(map_gain, reverse=True)

orig_rr = reciprocal_rank_by_group(y_dev, raw_scores, dev_qn_group_indicators)
new_rr = reciprocal_rank_by_group(y_dev, scores, dev_qn_group_indicators)

rr_gain = []
for (x, y) in zip(orig_rr, new_rr):
    assert x[1] == y[1] # Ensure same group
    rr_gain.append((y[0] - x[0], x[1]))

In [189]:
sum(x[0] for x in map_gain) / len(map_gain)

0.015028354526382567

In [174]:
map_gain

[(0.5, 65),
 (0.24353535353535349, 44),
 (0.16666666666666674, 3),
 (0.10909090909090902, 52),
 (0.083075131984711392, 15),
 (0.07575757575757569, 70),
 (0.050000000000000044, 13),
 (0.042385810127745671, 17),
 (0.0, 80),
 (0.0, 78),
 (0.0, 77),
 (0.0, 76),
 (0.0, 75),
 (0.0, 74),
 (0.0, 73),
 (0.0, 72),
 (0.0, 71),
 (0.0, 69),
 (0.0, 68),
 (0.0, 67),
 (0.0, 66),
 (0.0, 64),
 (0.0, 63),
 (0.0, 62),
 (0.0, 61),
 (0.0, 60),
 (0.0, 59),
 (0.0, 58),
 (0.0, 57),
 (0.0, 56),
 (0.0, 55),
 (0.0, 54),
 (0.0, 53),
 (0.0, 51),
 (0.0, 50),
 (0.0, 49),
 (0.0, 48),
 (0.0, 47),
 (0.0, 46),
 (0.0, 45),
 (0.0, 43),
 (0.0, 42),
 (0.0, 41),
 (0.0, 40),
 (0.0, 39),
 (0.0, 38),
 (0.0, 37),
 (0.0, 36),
 (0.0, 34),
 (0.0, 33),
 (0.0, 32),
 (0.0, 31),
 (0.0, 30),
 (0.0, 29),
 (0.0, 28),
 (0.0, 27),
 (0.0, 26),
 (0.0, 25),
 (0.0, 24),
 (0.0, 22),
 (0.0, 21),
 (0.0, 20),
 (0.0, 19),
 (0.0, 18),
 (0.0, 16),
 (0.0, 14),
 (0.0, 12),
 (0.0, 11),
 (0.0, 10),
 (0.0, 9),
 (0.0, 8),
 (0.0, 7),
 (0.0, 6),
 (0.0, 5),
 (0

In [144]:
mean_average_precision(y_dev, scores, dev_qn_group_indicators)

  recall = tps / tps[-1]


0.82146859492546476

In [145]:
mean_average_precision(y_dev, raw_scores, dev_qn_group_indicators)

  recall = tps / tps[-1]


0.80644024039908224

In [181]:
mean_reciprocal_rank(y_dev, scores, dev_qn_group_indicators)

0.8740287490287492

In [182]:
mean_reciprocal_rank(y_dev, raw_scores, dev_qn_group_indicators)

0.8761655011655012

In [186]:
sorted(rr_gain, reverse=True)

[(0.5, 65),
 (0.0, 80),
 (0.0, 78),
 (0.0, 77),
 (0.0, 76),
 (0.0, 75),
 (0.0, 74),
 (0.0, 73),
 (0.0, 72),
 (0.0, 71),
 (0.0, 69),
 (0.0, 68),
 (0.0, 67),
 (0.0, 66),
 (0.0, 64),
 (0.0, 63),
 (0.0, 62),
 (0.0, 61),
 (0.0, 60),
 (0.0, 59),
 (0.0, 58),
 (0.0, 57),
 (0.0, 56),
 (0.0, 55),
 (0.0, 54),
 (0.0, 53),
 (0.0, 52),
 (0.0, 51),
 (0.0, 50),
 (0.0, 49),
 (0.0, 48),
 (0.0, 47),
 (0.0, 46),
 (0.0, 45),
 (0.0, 44),
 (0.0, 43),
 (0.0, 42),
 (0.0, 41),
 (0.0, 40),
 (0.0, 39),
 (0.0, 38),
 (0.0, 37),
 (0.0, 36),
 (0.0, 34),
 (0.0, 33),
 (0.0, 32),
 (0.0, 31),
 (0.0, 30),
 (0.0, 29),
 (0.0, 28),
 (0.0, 27),
 (0.0, 26),
 (0.0, 25),
 (0.0, 24),
 (0.0, 23),
 (0.0, 22),
 (0.0, 21),
 (0.0, 20),
 (0.0, 19),
 (0.0, 18),
 (0.0, 17),
 (0.0, 16),
 (0.0, 15),
 (0.0, 14),
 (0.0, 13),
 (0.0, 12),
 (0.0, 11),
 (0.0, 10),
 (0.0, 9),
 (0.0, 8),
 (0.0, 7),
 (0.0, 6),
 (0.0, 5),
 (0.0, 4),
 (0.0, 3),
 (0.0, 1),
 (-0.16666666666666669, 70),
 (-0.5, 2)]

In [175]:
def sort_by_scores(items, scores):
    items_with_scores = [(s, a) for (s, a) in zip(scores, items)]
    items_with_scores = sorted(items_with_scores, reverse=True)
    return items_with_scores
        
def show_rank(filepath, orig_scores, new_scores, qn_group_indicators, qn_number):
    mask = qn_group_indicators == qn_number
    orig_scores = orig_scores[mask]
    new_scores = new_scores[mask]
    
    with open(filepath) as infile:        
        count = 0
        while count < qn_number + 1:
            qa_group = get_QA_group(infile)
            
            # Skip and don't count questions with no answers
            if not qa_group["answers"]:
                continue
            
            count += 1
            
        if count != qn_number + 1:
            print("Error: File does not have required question. qn_number: {} total qn in file: {}".format(qn_number, count))
            return
    
    question = qa_group["question"]
    answers = qa_group["answers"]
    
    assert len(orig_scores) == len(new_scores)
    assert len(orig_scores) == len(answers)
    
    orig_ranking = sort_by_scores(answers, orig_scores)
    new_ranking = sort_by_scores(answers, new_scores)
    
    symbol = {"negative": "-", "positive": "+"}
    
    print("Q {}: {}".format(qn_number, question))
    print("Original ranking")
    print("\n".join("{:.5f}".format(i[0]) + " " + symbol[i[1][0]] + " " + \
                    i[1][1] for i in orig_ranking))
    print('=' * 20)
    print("New ranking")
    print("\n".join("{:.5f}".format(i[0]) + " " + symbol[i[1][0]] + " " + \
                    i[1][1] for i in new_ranking))
    
    

In [187]:
show_rank(dev_file, raw_scores, scores, dev_qn_group_indicators, 65)

Q 65: ["What was Ice T 's original name ?"]
Original ranking
0.39639 - CoronerRecords gets its name because he thinks traditional record labels will perish in the new digital millennium .
0.33338 + In his new capacity , Ice , a.k.a . Tracy Morrow , last week sat on a panel on `` The Future of Music , '' sponsored by Red Herring magazine .
New ranking
1.00000 + In his new capacity , Ice , a.k.a . Tracy Morrow , last week sat on a panel on `` The Future of Music , '' sponsored by Red Herring magazine .
0.39639 - CoronerRecords gets its name because he thinks traditional record labels will perish in the new digital millennium .


---

In [190]:
# LR
# lr = LogisticRegression(C=3, class_weight="balanced")
# lr = LogisticRegression(C=0.01)
penalties = ["l1", "l2"]
cs = [0.01, 0.1, 1, 10, 100, 1000]

for para1 in penalties:
    for para2 in cs:
        lr = LogisticRegression(penalty = para1, C=para2, max_iter=1e8)
        lr.fit(X_train, y_train)
    
        #dev set
        y_dev_pred = lr.predict(X_dev)
        y_dev_prob = lr.predict_proba(X_dev)
        dev_path = "../myclassify/test_res/LG-dev/LG-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

        #test set
        y_pred = lr.predict(X_test)
        y_prob = lr.predict_proba(X_test)
        test_path = "../myclassify/test_res/LG-test/LG-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_test, y_pred, y_prob, test_path)
        
        #test result
        print("penalty: " + str(para1) + "; C: " + str(para2))
        print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("f1: {}".format(f1_score(y_test, y_pred)))
        print(confusion_matrix(y_test, y_pred))

#best MAP/MRR for dev when l1-10

penalty: l1; C: 0.01
accuracy: 0.8299274884640738
f1: 0.20858895705521474
[[1225    8]
 [ 250   34]]
penalty: l1; C: 0.1
accuracy: 0.8325642715886619
f1: 0.27011494252873564
[[1216   17]
 [ 237   47]]
penalty: l1; C: 1
accuracy: 0.8286090969017799
f1: 0.24418604651162787
[[1215   18]
 [ 242   42]]
penalty: l1; C: 10
accuracy: 0.8286090969017799
f1: 0.24418604651162787
[[1215   18]
 [ 242   42]]
penalty: l1; C: 100
accuracy: 0.8292682926829268
f1: 0.24489795918367346
[[1216   17]
 [ 242   42]]
penalty: l1; C: 1000
accuracy: 0.8292682926829268
f1: 0.24489795918367346
[[1216   17]
 [ 242   42]]
penalty: l2; C: 0.01
accuracy: 0.8378378378378378
f1: 0.28488372093023256
[[1222   11]
 [ 235   49]]
penalty: l2; C: 0.1
accuracy: 0.8305866842452209
f1: 0.2636103151862464
[[1214   19]
 [ 238   46]]
penalty: l2; C: 1
accuracy: 0.8279499011206328
f1: 0.2434782608695652
[[1214   19]
 [ 242   42]]
penalty: l2; C: 10
accuracy: 0.8279499011206328
f1: 0.239067055393586
[[1215   18]
 [ 243   41]]
penalty

In [31]:
X_sim_train, y_sim_train = load_similarity_features("../data/features/glove_embedding_sentence_similarities_train_300.txt")
X_sim_dev, y_sim_dev = load_similarity_features("../data/features/glove_embedding_sentence_similarities_dev_300.txt")
X_sim_test, y_sim_test = load_similarity_features("../data/features/glove_embedding_sentence_similarities_test_300.txt")

# Only normalize the similarity scores
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(X_sim_train)
X_sim_train = scaler.transform(X_sim_train)
X_sim_dev = scaler.transform(X_sim_dev)
X_sim_test = scaler.transform(X_sim_test)
X_comb_scaledsim_train = np.hstack((X_train, X_sim_train))
X_comb_scaledsim_dev = np.hstack((X_dev, X_sim_dev))
X_comb_scaledsim_test = np.hstack((X_test, X_sim_test))

In [211]:
penalties = ["l1", "l2"]
cs = [0.01, 0.1, 0.5, 1, 10, 100, 1000]

best = (- math.inf, None)
for para1 in penalties:
    for para2 in cs:
        lr = LogisticRegression(penalty = para1, C=para2, max_iter=1e8)
#                                 class_weight="balanced")
        lr.fit(X_comb_scaledsim_train, y_train)
    
        #dev set                
        y_dev_pred = lr.predict(X_comb_scaledsim_dev)
        y_dev_prob = lr.predict_proba(X_comb_scaledsim_dev)        
        dev_path = "../myclassify/test_res/LGSIM-dev/LGSIM-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_dev, y_dev_pred, y_dev_prob, dev_path)

        #test set
        y_pred = lr.predict(X_comb_scaledsim_test)
        y_prob = lr.predict_proba(X_comb_scaledsim_test)
        test_path = "../myclassify/test_res/LGSIM-test/LGSIM-" + str(para1) + "-" + str(para2) + ".txt"
        predict_for_test(y_test, y_pred, y_prob, test_path)
        
        #test result
        print('=' * 20)
        print("penalty: " + str(para1) + "; C: " + str(para2))
        print("accuracy: {:.6f}".format(accuracy_score(y_test, y_pred)))
        
        dev_f1 = f1_score(y_dev, y_dev_pred)
        dev_map = mean_average_precision(y_dev, y_dev_prob[:, 1], dev_qn_group_indicators)
        
        print("f1 dev: {:.6f}".format(dev_f1))
        print("f1 test: {:.6f}".format(f1_score(y_test, y_pred)))
                
        print("map dev: {:.6f}".format(dev_map)) 
        print("map test: {:.6f}".format(mean_average_precision(y_test, y_prob[:, 1], qn_group_indicators)))

        best = max(best, (dev_map, (para1, para2)))

print("Best params. map dev: {:.6f} | params: {}".format(best[0], best[1]))
              
              
#best for dev: l2-0.1

penalty: l1; C: 0.01
accuracy: 0.832564
f1 dev: 0.092437
f1 test: 0.234940
map dev: 0.765050
map test: 0.719504
penalty: l1; C: 0.1
accuracy: 0.841134
f1 dev: 0.106996
f1 test: 0.357333
map dev: 0.819513
map test: 0.783981


  recall = tps / tps[-1]


penalty: l1; C: 0.5
accuracy: 0.841134
f1 dev: 0.130081
f1 test: 0.364116
map dev: 0.814908
map test: 0.785233
penalty: l1; C: 1
accuracy: 0.840475
f1 dev: 0.130612
f1 test: 0.356383
map dev: 0.812697
map test: 0.781199
penalty: l1; C: 10
accuracy: 0.841793
f1 dev: 0.129032
f1 test: 0.378238
map dev: 0.810909
map test: 0.778330
penalty: l1; C: 100
accuracy: 0.841134
f1 dev: 0.136546
f1 test: 0.370757
map dev: 0.810585
map test: 0.778709
penalty: l1; C: 1000
accuracy: 0.841134
f1 dev: 0.136546
f1 test: 0.370757
map dev: 0.810585
map test: 0.778709
penalty: l2; C: 0.01
accuracy: 0.843771
f1 dev: 0.106557
f1 test: 0.339833
map dev: 0.812245
map test: 0.754368
penalty: l2; C: 0.1
accuracy: 0.841134
f1 dev: 0.129555
f1 test: 0.370757
map dev: 0.820734
map test: 0.770671
penalty: l2; C: 0.5
accuracy: 0.839815
f1 dev: 0.130612
f1 test: 0.362205
map dev: 0.815179
map test: 0.775397
penalty: l2; C: 1
accuracy: 0.839815
f1 dev: 0.130612
f1 test: 0.362205
map dev: 0.812909
map test: 0.777666
pena

## Test Rank Propagation
---

In [210]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [209]:
#train classifiers using best parameters obtained above, Jacana features only

classifiers = {}

nb = BernoulliNB(alpha=2)
nb.fit(X_train, y_train)
classifiers["NB"] = nb

gbm = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=47156)
gbm.fit(X_train, y_train)
classifiers["GB"] = gbm

rf = RandomForestClassifier(n_estimators=150, 
                                criterion="entropy", 
                                max_depth=16,
                                min_samples_split = 8,
                                class_weight="balanced",
                                random_state=73514)
rf.fit(X_train, y_train)
classifiers["RF"] = rf

lr = LogisticRegression(penalty = "l1", C=10, max_iter=1e8)
lr.fit(X_train, y_train)
classifiers["LG"] = lr

#train classifiers using best parameters obtained above, Jacana features with normalize the similarity scores

nbsim = BernoulliNB(alpha=1)
nbsim.fit(X_comb_scaledsim_train, y_train)
classifiers["NBSIM"] = nbsim

gbmsim = GradientBoostingClassifier(n_estimators=50, max_depth=2, random_state=47156)
gbmsim.fit(X_comb_scaledsim_train, y_train)
classifiers["GBSIM"] = gbmsim

rfsim = RandomForestClassifier(n_estimators=300, 
                                criterion="entropy", 
                                max_depth=16,
                                min_samples_split = 8,
                                class_weight="balanced_subsample",
                                random_state=73514)
rfsim.fit(X_comb_scaledsim_train, y_train)
classifiers["RFSIM"] = rfsim

lrsim = LogisticRegression(penalty = "l2", C=0.1, max_iter=1e8)
lrsim.fit(X_comb_scaledsim_train, y_train)

classifiers["LGSIM"] = lrsim


NameError: name 'BernoulliNB' is not defined

In [36]:
# For testing

# This file contains entries indicating whether answer has a entity matching type required by answer prediction
# Line corresponding to entry has 1 if there is a match, else 0
qn_match_filepath = "../data/QuestionType/test_answer_type_match.txt"

#dev
dev_qn_match_filepath = "../data/QuestionType/dev_answer_type_match.txt"

# This file contains the cosine *distance* between question and answer
qn_simweights_filepath = "../data/features/glove_embedding_sentence_similarities_test_300.txt"

#dev
dev_qn_simweights_filepath = "../data/features/glove_embedding_sentence_similarities_dev_300.txt"

import itertools    
for key in classifiers:
    # Set this to one of the trained classifiers above (after training)
    clf = classifiers[key]
    
    if "SIM" not in key:
        dev_raw_scores = clf.predict_proba(X_dev)[:, 1]
        dev_raw_full_scores = clf.predict_proba(X_dev)
        y_dev_pred = clf.predict(X_dev)

        raw_scores = clf.predict_proba(X_test)[:, 1]
        raw_full_scores = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)
        
    elif "SIM" in key:
        dev_raw_scores = clf.predict_proba(X_comb_scaledsim_dev)[:, 1]
        dev_raw_full_scores = clf.predict_proba(X_comb_scaledsim_dev)
        y_dev_pred = clf.predict(X_comb_scaledsim_dev)

        raw_scores = clf.predict_proba(X_comb_scaledsim_test)[:, 1]
        raw_full_scores = clf.predict_proba(X_comb_scaledsim_test)
        y_pred = clf.predict(X_comb_scaledsim_test)

# These are used with best model
# raw_scores = clf.predict_proba(X_comb_scaledsim_test)[:, 1]
# raw_full_scores = clf.predict_proba(X_comb_scaledsim_test)
# y_pred = clf.predict(X_comb_scaledsim_test)

# I think we have to keep the number of neighbors in the graph small because there are only a few
# positive examples. We don't want to link them to too many negative ones.
# This works well with 300 dim glove vector
# scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=1, sigma=1.0, n_neighbors=3, gamma=1.5)

# L2 loss for | r - y | gives 0.6377 and 0.755 
# scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=0.5, sigma=1, n_neighbors=3, gamma=1.5,
#                           loss_type=1,
#                           pair_similarity_type=2)

# MRR = 0.7474 vs 0.7399, MAP = 0.6462, 0.6272 for pair_similarity_type=2, sigma=1.0
# Generated with LR trained with C = 100, max_iter = 10000
# map            all 0.6448
# recip_rank     all 0.7537
# scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=0.5, sigma=2.0, n_neighbors=3, gamma=1.5,
#                           loss_type=1, 
#                           pair_similarity_type=1)

# This with a LR trained with C = 2 gives MRR = 0.7674 MAP = 0.6518
# For this to work, we have to use X_comb_scaledsim_test
# scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
#                           raw_scores, alpha=2.0, sigma=1.0, n_neighbors=5, gamma=1.0,
#                           loss_type=1, 
#                           pair_similarity_type=2)

    alphas = [0.05, 0.1, 0.5, 1.0, 1.5, 2.0]
    sigmas = [0.5, 0.75, 1.0, 1.5, 2.0]
    neighbors = [3, 5]
    gammas = [0, 0.8, 1.0, 1.25, 1.5, 1.75, 2.0]
    loss_type = [1, 2]
    para_list = [alphas, sigmas, neighbors, gammas, loss_type]
    paras = list(itertools.product(*para_list))
    
    for para in paras:
        para_string = "-".join([str(p) for p in para])
        
        print "************************************************************"
        print "classifier: ", key
        print "rank propagation parameters: ", para_string
        print "************************************************************"

        print "==========================dev================================"
        
        dev_scores = rank_propagation(dev_file, dev_qn_match_filepath, dev_qn_simweights_filepath, 
                                  dev_raw_scores, alpha=para[0], sigma=para[1], n_neighbors=para[2], gamma=para[3],
                                  loss_type=para[4], 
                                  pair_similarity_type=2)

        y_dev_pred_adjusted = (dev_scores >= 0.5)

        print("Adjusted accuracy: {}".format(accuracy_score(y_dev, dev_scores >= 0.5)))
        print("Original accuracy: {}".format(accuracy_score(y_dev, y_dev_pred)))
        print("Adjusted f1: {}".format(f1_score(y_dev, dev_scores >= 0.5)))
        print("Original f1: {}".format(f1_score(y_dev, y_dev_pred)))
        print(np.sum(np.abs(dev_raw_scores - dev_scores)) / len(dev_scores))  # Average difference between actual
        print(np.max(np.abs(dev_raw_scores - dev_scores)))
        print(np.sum(y_test != (dev_scores >= 0.5)))
        print(np.sum(y_test != (dev_scores >= 0.5)) / float(len(dev_scores)))
        print("Number of disagreement: {}".format(np.sum(np.abs(y_dev_pred - y_dev_pred_adjusted))))
        print("Adjusted")
        print(classification_report(y_dev, y_dev_pred_adjusted, digits=4))
        print("Original")
        print(classification_report(y_dev, y_dev_pred, digits=4))
    
        # Convert dev to weka format
        P_dev = np.hstack(((1 - dev_scores).reshape(-1, 1), dev_scores.reshape(-1, 1)))
        dev_path = "../myclassify/test_res/RP" + key + "-dev/" + key + "-" + para_string + ".txt"
        predict_for_test(y_dev, y_dev_pred_adjusted, P_dev, dev_path)
        #predict_for_test(y_test, y_dev_pred, dev_raw_full_scores, "nb_no_adjust.txt")

        print "==========================test================================"
        
        scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
                                  raw_scores, alpha=para[0], sigma=para[1], n_neighbors=para[2], gamma=para[3],
                                  loss_type=para[4], 
                                  pair_similarity_type=2)

        y_pred_adjusted = (scores >= 0.5)
    

        print("Adjusted accuracy: {}".format(accuracy_score(y_test, scores >= 0.5)))
        print("Original accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("Adjusted f1: {}".format(f1_score(y_test, scores >= 0.5)))
        print("Original f1: {}".format(f1_score(y_test, y_pred)))
        print(np.sum(np.abs(raw_scores - scores)) / len(scores))  # Average difference between actual
        print(np.max(np.abs(raw_scores - scores)))
        print(np.sum(y_test != (scores >= 0.5)))
        print(np.sum(y_test != (scores >= 0.5)) / float(len(scores)))
        print("Number of disagreement: {}".format(np.sum(np.abs(y_pred - y_pred_adjusted))))
        print("Adjusted")
        print(classification_report(y_test, y_pred_adjusted, digits=4))
        print("Original")
        print(classification_report(y_test, y_pred, digits=4))
    
    
        # Convert test to weka format
        P = np.hstack(((1 - scores).reshape(-1, 1), scores.reshape(-1, 1)))
        test_path = "../myclassify/test_res/RP" + key + "-test/" + key + "-" + para_string + ".txt"
        predict_for_test(y_test, y_pred_adjusted, P, test_path)
        #predict_for_test(y_test, y_pred, raw_full_scores, "nb_no_adjust.txt")

************************************************************
classifier:  LGSIM
rank propagation parameters:  0.5-0.5-3-0-1
************************************************************
Adjusted accuracy: 0.812717770035
Original accuracy: 0.812717770035
Adjusted f1: 0.12955465587
Original f1: 0.12955465587
1.22330744298e-12
8.27102553114e-11
1
0.000871080139373
Number of disagreement: 0
Adjusted
             precision    recall  f1-score   support

          0     0.8166    0.9903    0.8951       926
          1     0.6400    0.0721    0.1296       222

avg / total     0.7824    0.8127    0.7470      1148

Original
             precision    recall  f1-score   support

          0     0.8166    0.9903    0.8951       926
          1     0.6400    0.0721    0.1296       222

avg / total     0.7824    0.8127    0.7470      1148





Adjusted accuracy: 0.841133816744
Original accuracy: 0.841133816744
Adjusted f1: 0.370757180157
Original f1: 0.370757180157
1.03539089839e-12
1.2013123829e-10
241
0.158866183256
Number of disagreement: 0
Adjusted
             precision    recall  f1-score   support

          0     0.8498    0.9773    0.9091      1233
          1     0.7172    0.2500    0.3708       284

avg / total     0.8250    0.8411    0.8083      1517

Original
             precision    recall  f1-score   support

          0     0.8498    0.9773    0.9091      1233
          1     0.7172    0.2500    0.3708       284

avg / total     0.8250    0.8411    0.8083      1517

************************************************************
classifier:  LGSIM
rank propagation parameters:  0.5-0.5-3-0-2
************************************************************
Adjusted accuracy: 0.812717770035
Original accuracy: 0.812717770035
Adjusted f1: 0.12955465587
Original f1: 0.12955465587
0.0014137731958
0.106032731491
1
0.000871

LR with C = 1, max_iter = 10000
scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
                          raw_scores, alpha=0.5, sigma=1, n_neighbors=3, gamma=1.5,
                          loss_type=1,
                          pair_similarity_type=2)

Adjusted
             precision    recall  f1-score   support

          0       0.85      0.98      0.91      1233
          1       0.74      0.26      0.38       284

avg / total       0.83      0.84      0.81      1517

Original
             precision    recall  f1-score   support

          0       0.83      0.98      0.90      1233
          1       0.69      0.15      0.24       284

avg / total       0.81      0.83      0.78      1517

MRR = 0.7474 vs 0.7399, MAP = 0.6462, 0.6272
LR trained with C = 100, max_iter = 10000

scores = rank_propagation(test_file, qn_match_filepath, qn_simweights_filepath, 
                          raw_scores, alpha=0.5, sigma=1, n_neighbors=3, gamma=1.5,
                          loss_type=1, 
                          pair_similarity_type=2)
                          
Adjusted
             precision    recall  f1-score   support

          0       0.85      0.98      0.91      1233
          1       0.73      0.24      0.36       284

avg / total       0.83      0.84      0.81      1517

Original
             precision    recall  f1-score   support

          0       0.83      0.98      0.90      1233
          1       0.68      0.14      0.24       284

avg / total       0.81      0.83      0.78      1517
                          

---

In [None]:
# For training
train_raw_scores = clf.predict_proba(X_train)[:, 1]
scores = rank_propagation(train_file, train_raw_scores, alpha=2, sigma=2, n_neighbors=11)

print(accuracy_score(y_train, scores >= 0.5))
print(np.sum(np.abs(train_raw_scores - scores)) / len(scores))  # Average difference between actual

In [None]:
accuracy_score(y_train, clf.predict(X_train))