In [1]:
import os
import numpy
from collections import Counter
from collections import namedtuple
import math
import traceback
import pandas

# Directory holding the n-gram count files loaded by this notebook.
DATA_DIR = "/root/data/allen-ai-challenge/final_ngrams"
# Per-store score CSVs live in the "scores" folder directly under DATA_DIR.
OUT_DIR = DATA_DIR + "/scores"

# A (src, dest) pair of n-gram strings; despite the name it has only two fields.
Triple = namedtuple("Triple", "src dest")

In [2]:
class TripletStorage(object):
    """In-memory co-occurrence counts for (src, dest) n-gram pairs.

    Attributes:
        notype: Counter keyed by ``(src, dest)`` holding the summed pair count.
        tokens: Counter keyed by a single n-gram string; every pair's count is
            added to both its ``src`` entry and its ``dest`` entry.
        file: the path given to the constructor (``None`` for an empty store).
    """

    def __init__(self, file=None):
        """Create a store, optionally populated from a tab-separated file.

        A usable line has exactly three tab-separated fields: ``src``,
        ``dest`` and an integer count. Lines with any other field count are
        ignored. Lines that fail to parse are reported with a traceback and
        skipped, so a single bad record does not abort a long load.

        Args:
            file: path of the count file, or ``None`` to start empty.
        """
        self.notype = Counter()
        self.tokens = Counter()
        self.file = file

        if file is None:
            return

        debug_cnt = 0
        # The context manager closes the handle even if the load is interrupted.
        with open(file) as infile:
            for line in infile:
                fields = line.split('\t')
                if len(fields) != 3:
                    continue

                try:
                    src, dest, cnt = fields
                    # int() tolerates the trailing newline left on the count.
                    self.fill_one(Triple(src=src, dest=dest), int(cnt))
                except Exception:
                    # Best-effort load: report and skip the bad record. Catching
                    # Exception (not a bare except) keeps Ctrl-C working.
                    traceback.print_exc()
                    continue

                # Progress marker; only successfully stored records are counted.
                if debug_cnt % 1000000 == 0:
                    print("Read %d triplets" % debug_cnt)
                debug_cnt += 1

    def fill_one(self, triple, cnt):
        """Add ``cnt`` occurrences of ``triple`` to the pair and token counters.

        Args:
            triple: a ``Triple`` instance.
            cnt: amount to add.

        Raises:
            ValueError: if ``triple`` is not a ``Triple``.
        """
        if not isinstance(triple, Triple):
            raise ValueError

        self.notype[(triple.src, triple.dest)] += cnt
        self.tokens[triple.src] += cnt
        self.tokens[triple.dest] += cnt
        
def score_norel(storage, src, dest, normalization="none"):
    """Score how strongly two n-grams co-occur, ignoring their order.

    The raw score is the larger of the stored counts for ``(src, dest)`` and
    ``(dest, src)``. It can optionally be divided by the natural log of a
    token mass taken from ``storage.tokens``.

    Args:
        storage: object exposing ``notype`` (pair -> count) and ``tokens``
            (n-gram -> count) mappings that yield 0 for unknown keys.
        src: first n-gram string.
        dest: second n-gram string.
        normalization: ``"none"`` (raw count), ``"left"`` (log of the ``src``
            token count), ``"right"`` (log of the ``dest`` token count) or
            ``"both"`` (log of the product of the two token counts).

    Returns:
        0 when the pair was never seen, the raw count for ``"none"``,
        otherwise a float.

    Raises:
        ValueError: for an unknown ``normalization`` on a pair that was seen.
    """
    score = max(storage.notype[(src, dest)], storage.notype[(dest, src)])

    if score == 0:
        return 0

    if normalization == "none":
        return score
    elif normalization == "left":
        mass = storage.tokens[src]
    elif normalization == "right":
        mass = storage.tokens[dest]
    elif normalization == "both":
        mass = storage.tokens[src] * storage.tokens[dest]
    else:
        raise ValueError("unknown normalization: %r" % (normalization,))

    # log(1) == 0, so an n-gram seen exactly once used to raise
    # ZeroDivisionError. Fall back to the unnormalized score in that case.
    if mass == 1:
        return float(score)

    return float(score) / math.log(mass)

In [None]:
# Parse the "ngrams_2_2" count file under DATA_DIR into a TripletStorage.
# The constructor prints a progress line every 1,000,000 stored records.
ngrams_2_2 = TripletStorage(os.path.join(DATA_DIR, "ngrams_2_2"))

In [None]:
# Parse the "ngrams_1_2" count file under DATA_DIR into a TripletStorage.
ngrams_1_2 = TripletStorage(os.path.join(DATA_DIR, "ngrams_1_2"))

In [None]:
# Parse the "ngrams_1_3.good" count file under DATA_DIR into a TripletStorage.
# NOTE(review): unlike the other count files this one carries a ".good"
# suffix - presumably a filtered copy; confirm which file is intended.
ngrams_1_3 = TripletStorage(os.path.join(DATA_DIR, "ngrams_1_3.good"))

In [None]:
# Parse the "ngrams_2_3" count file under DATA_DIR into a TripletStorage.
ngrams_2_3 = TripletStorage(os.path.join(DATA_DIR, "ngrams_2_3"))

In [7]:
# Registry of the loaded count stores, keyed by the
# (first_ngram_size, second_ngram_size) pair that one_score_nonlp looks up.
storage_storage = {
    (1, 2): ngrams_1_2,
    (1, 3): ngrams_1_3,
    (2, 2): ngrams_2_2,
    (2, 3): ngrams_2_3,
}

In [4]:
def one_score_nonlp(q, ans, first_ngram_size=2, second_ngram_size=2, topN=5, normalization="both"):
    """Score an answer against a question using n-gram co-occurrence counts.

    Both texts are split on whitespace. Every question n-gram of
    ``first_ngram_size`` tokens is paired with every answer n-gram of
    ``second_ngram_size`` tokens, and, when the two sizes differ, also the
    other way round (question n-grams of ``second_ngram_size`` against answer
    n-grams of ``first_ngram_size``). Each pair is scored with ``score_norel``
    against ``storage_storage[(first_ngram_size, second_ngram_size)]``.

    When the two sizes are equal the swapped pass would produce exactly the
    same pairs again, counting every score twice and halving the number of
    distinct pairs covered by ``topN``; it is therefore skipped.

    Args:
        q: question text.
        ans: answer text.
        first_ngram_size: n-gram length, first component of the store key.
        second_ngram_size: n-gram length, second component of the store key.
        topN: number of highest pair scores to average.
        normalization: forwarded to ``score_norel``.

    Returns:
        0 when no n-gram pair can be formed, otherwise the mean (float) of the
        ``topN`` highest pair scores, or of all scores if there are fewer.
    """
    def _ngrams(tokens, size):
        # Every contiguous window of `size` tokens, joined back into a string.
        return [" ".join(tokens[i:i + size]) for i in range(0, len(tokens) - size + 1)]

    tokens_q = q.split()
    tokens_ans = ans.split()

    # (question n-gram size, answer n-gram size) combinations to evaluate.
    size_pairs = [(first_ngram_size, second_ngram_size)]
    if first_ngram_size != second_ngram_size:
        size_pairs.append((second_ngram_size, first_ngram_size))

    pairs = []
    for q_size, ans_size in size_pairs:
        ans_ngrams = _ngrams(tokens_ans, ans_size)
        for q_ngram in _ngrams(tokens_q, q_size):
            for ans_ngram in ans_ngrams:
                pairs.append((q_ngram, ans_ngram))

    if not pairs:
        return 0

    # Looked up only once there is something to score, so texts too short to
    # form any pair still return 0 even for an unregistered size pair.
    storage = storage_storage[(first_ngram_size, second_ngram_size)]
    scores = sorted(
        (score_norel(storage, q_ngram, ans_ngram, normalization=normalization)
         for q_ngram, ans_ngram in pairs),
        reverse=True)

    return float(sum(scores[:topN])) / min(topN, len(scores))

In [5]:
def make_df(is_train, first_ngram_size, second_ngram_size):
    """Score every question's four answers and write the scores to a CSV.

    Reads ``training_set_cleaned.tsv`` (fields: id, question, correct answer,
    A, B, C, D) or ``validation_set_cleaned.tsv`` (fields: id, question, A, B,
    C, D) from ``/root/data/allen-ai-challenge/``. Each answer is scored with
    ``one_score_nonlp`` using ``topN=20`` and ``normalization="both"``.

    Questions whose four scores are all equal are counted as skipped and are
    NOT written to the output. For the remaining questions the highest-scoring
    answer is compared with the correct answer, and a running accuracy is
    printed every 100 scored questions. The validation set has no answer key,
    so its printed precision is always 0.

    The result is written to
    ``OUT_DIR/<training|validation>_ngrams_<first>_<second>.csv`` with one row
    per scored question: id followed by the A, B, C and D scores.

    Args:
        is_train: True for the training set, False for the validation set.
        first_ngram_size: forwarded to ``one_score_nonlp``.
        second_ngram_size: forwarded to ``one_score_nonlp``.
    """
    def _as_text(value):
        # Python 2 file reads yield byte strings that must be decoded; text
        # strings (e.g. under Python 3) pass through unchanged.
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    if is_train:
        inpath = "training_set_cleaned.tsv"
        prefix = "training"
    else:
        inpath = "validation_set_cleaned.tsv"
        prefix = "validation"

    total_cnt = 0
    correct_cnt = 0
    equal_cnt = 0
    all_scores = []

    # The context manager closes the input even if scoring raises midway.
    with open(os.path.join("/root/data/allen-ai-challenge/", inpath)) as infile:
        for line in infile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            fields = line.rstrip("\r\n").split('\t')
            if is_train:
                _id, q, correct_ans, a, b, c, d = fields
            else:
                _id, q, a, b, c, d = fields
                correct_ans = None

            question = _as_text(q)
            scores = [one_score_nonlp(question, _as_text(x),
                                      first_ngram_size=first_ngram_size,
                                      second_ngram_size=second_ngram_size,
                                      topN=20, normalization="both") for x in [a, b, c, d]]

            if len(numpy.unique(scores)) == 1:
                # No answer stands out; argmax would be arbitrary, so skip it.
                equal_cnt += 1
                continue

            picked = ["A", "B", "C", "D"][numpy.argmax(scores)]
            if picked == correct_ans:
                correct_cnt += 1

            total_cnt += 1
            all_scores.append([_id] + scores)

            if total_cnt % 100 == 0:
                print("====%f====" % (float(correct_cnt) / total_cnt))

    # Guard the division: every question may have been skipped.
    precision = float(correct_cnt) / total_cnt if total_cnt else 0.0
    print("precision:%f" % precision)
    print("total: %d, skipped: %d" % (total_cnt, equal_cnt))

    df = pandas.DataFrame.from_records(all_scores)
    df.to_csv(os.path.join(OUT_DIR, "%s_ngrams_%d_%d.csv" % (prefix, first_ngram_size, second_ngram_size)))

In [8]:
# Score the training set, then the validation set, with the (2, 3) n-gram
# store; each call writes one CSV under OUT_DIR. The validation file has no
# answer key (correct_ans is None), so its printed precision is always 0.
make_df(True, 2, 3)
make_df(False, 2, 3)

====0.540000====
====0.510000====
====0.486667====
====0.457500====
====0.442000====
precision:0.452763
total: 561, skipped: 1939
====0.000000====
====0.000000====
====0.000000====
precision:0.000000
total: 385, skipped: 7747


In [None]:
# Same scoring run for the (2, 2) n-gram store: training first, then validation.
make_df(True, 2, 2)
make_df(False, 2, 2)

In [None]:
# Same scoring run for the (1, 2) n-gram store: training first, then validation.
make_df(True, 1, 2)
make_df(False, 1, 2)

In [None]:
# Same scoring run for the (1, 3) n-gram store: training first, then validation.
make_df(True, 1, 3)
make_df(False, 1, 3)

In [None]:
def merge_dfs(*dfs):
    """Combine several score tables into one table of ";"-joined score strings.

    Every input frame must have columns ``A``, ``B``, ``C`` and ``D`` and be
    indexed by question id, with unique index values. The result is indexed by
    the union of all ids; each cell holds the values of that column from every
    input frame, in argument order, converted with ``str`` and joined by
    ``";"``. An id missing from a frame contributes 0 for that frame, so every
    row carries exactly ``len(dfs)`` fields per column. (The previous pairwise
    merge gave rows that were absent from the earlier frames a single ``"0"``
    placeholder, producing rows with too few fields once three or more frames
    were merged.)

    Args:
        *dfs: one or more pandas DataFrames.

    Returns:
        The single input frame unchanged when only one is given, otherwise a
        new DataFrame with string columns ``A``..``D``.

    Raises:
        IndexError: if called without any frame.
    """
    if not dfs:
        raise IndexError("merge_dfs() needs at least one DataFrame")
    if len(dfs) == 1:
        return dfs[0]

    answer_cols = ["A", "B", "C", "D"]

    # Union of all question ids, so every frame can be aligned to the same rows.
    all_ids = dfs[0].index
    for df in dfs[1:]:
        all_ids = all_ids.union(df.index)

    # reindex() inserts NaN for ids a frame does not contain; fillna() returns
    # a new frame (the inputs are never modified), so keep its result.
    aligned = [df.reindex(all_ids)[answer_cols].fillna(0) for df in dfs]

    merged = pandas.DataFrame(index=all_ids)
    for col in answer_cols:
        joined = aligned[0][col].astype(str)
        for part in aligned[1:]:
            joined = joined + ";" + part[col].astype(str)
        merged[col] = joined

    return merged

In [None]:
# Load the per-store score CSVs written by make_df for every size pair
# (i, j) with 1 <= i <= j <= 3. Not every pair has been scored, so a missing
# file is expected: it is reported and that pair is skipped.
training_dfs = []
validation_dfs = []

for i in range(1, 4):
    for j in range(i, 4):
        try:
            # index_col=0 consumes the unnamed row-number column written by
            # to_csv, leaving the id and the four answer scores as columns.
            tr_df = pandas.read_csv(os.path.join(DATA_DIR, "scores/training_ngrams_%d_%d.csv" % (i, j)),
                                    index_col=0)
            tr_df.columns = ["id", "A", "B", "C", "D"]
            tr_df = tr_df.set_index("id")

            val_df = pandas.read_csv(os.path.join(DATA_DIR, "scores/validation_ngrams_%d_%d.csv" % (i, j)),
                                     index_col=0)
            val_df.columns = ["id", "A", "B", "C", "D"]
            val_df = val_df.set_index("id")
        except (IOError, OSError) as err:
            # Only a missing/unreadable file is tolerated; any other error
            # (malformed CSV, wrong column count, ...) should surface.
            print("Skipping ngrams_%d_%d: %s" % (i, j, err))
            continue

        # Append only once both files loaded, so the two lists stay aligned.
        training_dfs.append(tr_df)
        validation_dfs.append(val_df)

In [None]:
# Combine the per-store validation score tables into a single frame whose
# A-D cells hold the ";"-joined scores from every loaded store.
merged = merge_dfs(*validation_dfs)

In [None]:
# Write the merged validation scores as a tab-separated file without a header
# row; the index is still written as the first column.
merged.to_csv(os.path.join(OUT_DIR,"validation_merged_ngrams.tsv"), sep='\t', header=False)