In [24]:
from collections import Counter
import itertools
import numpy as np
from scipy import stats
import pylab as pl
from sklearn import svm, linear_model, cross_validation

In [82]:
class rankFeature:
    """One judged document for a query: query id, relevance label, features.

    Attributes:
        qid: the query identifier, stored exactly as passed in.
        relevancy: the relevance label, stored exactly as passed in.
        featureVector: the feature values converted to a numpy array.
    """

    def __init__(self, query, rel, fea):
        # Convert first so a bad feature sequence fails before any attribute
        # is set, then store the three fields.
        vector = np.array(fea)
        self.qid = query
        self.relevancy = rel
        self.featureVector = vector
    
def parseFeature(line, nFeatures=46):
    """Parse one ranking row into a rankFeature.

    Expected layout (as in the sample row used in this notebook)::

        <relevance> qid:<id> 1:<v1> 2:<v2> ... [#<free-form comment>]

    Args:
        line: one text row in the layout above.
        nFeatures: maximum number of ``index:value`` tokens to read after the
            qid token. Defaults to 46, the count the original hard-coded as
            the slice ``[2:48]``. Pass None to read every feature token.

    Returns:
        rankFeature whose qid is the text after ``qid:``, whose relevancy is
        the leading label as an int, and whose features are the parsed floats.

    Raises:
        IndexError: if the row has no qid token (e.g. a blank line) or a
            feature token without ``:``.
        ValueError: if the label or a feature value is not numeric.
    """
    # Drop the trailing '#...' comment explicitly, so comment text can never
    # be mistaken for a feature when a row has fewer than nFeatures values.
    tokens = line.split('#', 1)[0].split()
    # tokens[0] is the label and tokens[1] the qid; features start at index 2.
    stop = None if nFeatures is None else 2 + nFeatures
    fv = [float(tok.split(':')[1]) for tok in tokens[2:stop]]
    return rankFeature(tokens[1].split(':')[1], int(tokens[0]), fv)

def featureExtraction(featureGroup):
    """Turn the documents of one query into pairwise training examples.

    Every unordered pair of documents with *different* relevancy yields one
    example: the difference of their feature vectors, labelled with the sign
    of their relevancy difference. Pairs with equal relevancy are skipped.

    Args:
        featureGroup: sequence of objects exposing ``relevancy`` (a number)
            and ``featureVector`` (a numpy array); assumed to all belong to
            the same query -- the function itself does not check this.

    Returns:
        (x, y): ``x`` is an array of feature-difference vectors and ``y`` the
        matching array of +1/-1 labels. Both are empty when no pair differs
        in relevancy.
    """
    x, y = [], []
    for first, second in itertools.combinations(featureGroup, 2):
        r_diff = first.relevancy - second.relevancy
        if r_diff == 0:
            # Equally relevant documents express no preference.
            continue
        delta = first.featureVector - second.featureVector
        label = np.sign(r_diff)

        # Balance the data: kept pairs alternate +1, -1, +1, ... by position.
        # When the natural label disagrees, mirror the pair (negate both the
        # difference vector and the label), which states the same preference.
        if label != (-1) ** len(y):
            delta, label = -delta, -label
        x.append(delta)
        y.append(label)

    return np.array(x), np.array(y)

In [7]:
# Scratch check of the row-parsing logic on a single hand-copied data row.
sample = '0 qid:10 1:0.039477 2:0.000000 3:0.750000 4:0.166667 5:0.040555 6:0.000000 7:0.000000 8:0.000000 9:0.000000 10:0.000000 11:0.036178 12:0.000000 13:0.751573 14:0.175557 15:0.039086 16:0.053319 17:0.187500 18:1.000000 19:0.111111 20:0.053668 21:0.605907 22:0.469614 23:0.827181 24:0.582661 25:0.000000 26:0.000000 27:0.000000 28:0.000000 29:0.266984 30:0.204123 31:0.323377 32:0.176450 33:0.287760 34:0.074987 35:0.002373 36:0.052280 37:0.565875 38:0.569440 39:0.769845 40:0.646567 41:0.073711 42:0.076923 43:0.034483 44:0.333333 45:0.218391 46:0.000000 #docid = GX037-87-3082362 inc = 0.588907390055858 prob = 0.3348'
# Collapse ' = ' so each "key = value" pair of the trailing comment becomes
# a single whitespace-delimited token.
elem = sample.replace(' = ', '=').split()
# elem[0] is the label and elem[1] the qid; the next 46 tokens are
# "index:value" features, of which only the value is kept.
fv = [float(fea.split(':')[1]) for fea in elem[2:48]]
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(fv)

[0.039477, 0.0, 0.75, 0.166667, 0.040555, 0.0, 0.0, 0.0, 0.0, 0.0, 0.036178, 0.0, 0.751573, 0.175557, 0.039086, 0.053319, 0.1875, 1.0, 0.111111, 0.053668, 0.605907, 0.469614, 0.827181, 0.582661, 0.0, 0.0, 0.0, 0.0, 0.266984, 0.204123, 0.323377, 0.17645, 0.28776, 0.074987, 0.002373, 0.05228, 0.565875, 0.56944, 0.769845, 0.646567, 0.073711, 0.076923, 0.034483, 0.333333, 0.218391, 0.0]


In [66]:
# Parse every data row of the training file into a rankFeature record.
with open('train.txt') as f:
    # Iterate the file directly instead of materialising readlines(), and skip
    # blank lines (e.g. a trailing newline), which would otherwise make
    # parseFeature fail with an IndexError on the missing qid token.
    X_train = [parseFeature(line) for line in f if line.strip()]
        

In [31]:
# Sanity check: list the index pairs itertools.combinations yields for four
# items -- each unordered pair once, in lexicographic order.
for i, j in itertools.combinations(range(4), 2):
    # Explicit formatting prints "i j" identically on Python 2 and Python 3.
    print('%d %d' % (i, j))



0 1
0 2
0 3
1 2
1 3
2 3


In [90]:
# Build the pairwise training set: one block of difference vectors per query.
x_train, y_train = [], []
# groupby only merges *adjacent* rows with the same key, so this relies on
# X_train keeping each query's rows contiguous -- TODO confirm train.txt is
# ordered by qid.
for qid, docs in itertools.groupby(X_train, lambda row: row.qid):
    features = list(docs)
    nRel = Counter(doc.relevancy for doc in features)
    # A query whose documents all share one relevancy level yields no pairs.
    if len(nRel) > 1:
        x, y = featureExtraction(features)
        # Parenthesised form prints identically on Python 2 and Python 3.
        print('DEBUG: query %s (%s) generates %d features, with %d +1.'
              % (qid, str(nRel), len(y), np.sum(y == 1)))
        x_train.extend(x)
        y_train.extend(y)

# Convert the accumulated Python lists to numpy arrays for the estimator.
x_train, y_train = map(np.array, (x_train, y_train))

DEBUG: query 10 (Counter({0: 24, 1: 16})) generates 384 features, with 192 +1.
DEBUG: query 15 (Counter({0: 29, 1: 8, 2: 3})) generates 343 features, with 172 +1.
DEBUG: query 33 (Counter({0: 39, 2: 1})) generates 39 features, with 20 +1.
DEBUG: query 34 (Counter({0: 38, 1: 2})) generates 76 features, with 38 +1.
DEBUG: query 37 (Counter({0: 32, 2: 5, 1: 3})) generates 271 features, with 136 +1.
DEBUG: query 42 (Counter({1: 22, 0: 18})) generates 396 features, with 198 +1.
DEBUG: query 44 (Counter({0: 28, 1: 8, 2: 4})) generates 368 features, with 184 +1.
DEBUG: query 49 (Counter({0: 39, 1: 1})) generates 39 features, with 20 +1.
DEBUG: query 50 (Counter({0: 28, 1: 10, 2: 2})) generates 356 features, with 178 +1.
DEBUG: query 57 (Counter({0: 39, 2: 1})) generates 39 features, with 20 +1.
DEBUG: query 59 (Counter({1: 21, 0: 19})) generates 399 features, with 200 +1.
DEBUG: query 69 (Counter({0: 20, 1: 20})) generates 400 features, with 200 +1.
DEBUG: query 81 (Counter({0: 37, 1: 3})) ge

In [None]:
# Fit a linear-kernel SVM (C = 0.1) on the pairwise difference vectors and
# their +1/-1 labels.
clf = svm.SVC(C=0.1, kernel='linear')
clf.fit(x_train, y_train)