In [1]:
from plotlib.loaders import *

import pandas as pd
from typing import Dict, List, Tuple
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import annoy
import nltk
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
def load_phrase_scores(path: str, feature: str) -> Dict[str, Tuple[str, int]]: 
    ind = {
        'recip_rank': 2,
        'recall_20': 3,
        'recall_100': 4,
        'ndcg': 5,
        'rbp@0.10': 6,
        'rbp@0.50': 7,
        'rbp@0.80': 8,
    }[feature]
    
    out = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            if parts[0] not in out: 
                out[parts[0]] = []
            v = float(parts[ind])
            inc = 1 
            if v < 0.0:
                inc = -1 
            elif v == 0.0:
                inc = 0 
            out[parts[0]].append((parts[1], inc))
            
    return out

In [3]:
topics = load_queries('/home/danlocke/go/src/github.com/dan-locke/phd-data/case-topics.json')

In [4]:
def load_vectors(path: str):
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        f.readline()
        for line in f:
            values = line.rstrip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings[word] = coefs

    return embeddings

class EmbMethod:
    SUM = 0
    MEAN = 1 
    CF_SUM = 2 
    CF_MEAN = 3

def embed(tokens: List[str], embeddings, dim: int=100, method: EmbMethod = EmbMethod.SUM, coll_stats = None) -> np.array:
    e = []
    if method > EmbMethod.MEAN: 
        for x in tokens: 
            if x in embeddings:
                mul = coll_stats[x] if x in coll_stats else 1.0
                e.append(mul * embeddings[x])
    else: 
        e = [embeddings[x] for x in tokens if x in embeddings]
    if len(e) == 0:
        return np.zeros((dim,), dtype='float32')
    
    e = np.sum(e, axis=0)
    if method == EmbMethod.MEAN or method == EmbMethod.CF_MEAN:
        e /= len(tokens)
            
    return e

In [5]:
find = {
    'first', 
    'second', 
    'third',
    'fourth',
    'fifth',
    'sixth',
    'seventh',
    'eighth',
    'ninth',
    'tenth',
    'eleventh',
    'twelfth',
    'thirteenth',
    'fourteenth',
    'fifteenth',
    'sixteenth',
    'seventeenth',
    'eighteenth',
    'nineteenth',
    'twenty',
    'thirty',
    'fourty',
}

num_exclusion = {
    'amendment',
    'degree',
    'refusal'
}

find2 = {
    'north',
    'south',
    'east',
    'west'
}

find3 = {
    'set',
    'hold',
    'exemplary',
    'intending',
    'include',
    'including',
    'relies',
    'relied',
    'pursuant',
    'thereto',
    'behalf',
    'giving',
    'give',
    'apparently',
    'subsequently',
    'jurisdictional',
    'included',
    'includes',
    'appellant',
    'respondent',
    'resulting',
    'such',
    's',
    'x',
    't',
    'th',
    'thing',
    're',
    'subject'
}

find4 = {
    'nonetheless',
    'follows',
    'referred',
    'thereto',
    'behalf',
    'hastily',
    'instance',
    'instances',
    'such',
    's',
    'applicant',
    'applicants',
    'respondent',
    'respondents',
    'th',
    'x',
    'such',
    'll',
    'only',
    'way',
    're',
    'much',
}

#     consider 'made' and 'make' as exclusions

def filter_phrases(phrases, splitter=' '):
    d = isinstance(phrases, dict)
    if d:
        ret = {}
    else:
        ret = []
    for phrase in phrases:
        toks = phrase.split(splitter)
        if len(toks) < 2:
            continue 
        
        cont = False 
        cnt = 0

        if toks[-1] in find3 or len(toks[-1]) == 1:
            continue
        
        if len(toks) == 2 and toks[0].startswith('claim'):
            continue
            
        if len(toks) == 3 and toks[0] == 'intention' and toks[1] != 'to':
            continue
            
        if toks[0] == 'intention' and len(toks) == 2:
            continue
            
        if toks[0] in find4 or len(toks[0]) == 1:
            continue
            
        unique = set()
            
        num = False 
        num_ex = False
        for t, tok in enumerate(toks):
            unique.add(tok)
            
            if tok in find: 
                num = True 
            if tok in num_exclusion:
                num_ex = True
            if tok in find2:
                cnt += 1 
            if tok == 'nunc' and t != 0:
                cont = True 
                break
                
        if num and not num_ex: 
            continue
        
        if len(toks) > len(unique): 
            continue
    
        if cont or cnt > 1:
            continue 
            
        if d: 
            ret[phrase] = phrases[phrase]
        else:
            ret.append(phrase)
        
    return ret 

In [6]:
stop = {
    'the',
    'of',
    'to',
    'in', 
    'for',
    'that',
    'and',
    'on',
    'is',
    'be',
    'by',
    'a',
    'an',
    'was',
    'it',
    'as',
    'this',
    'which',
    'with',
    'have',
    'at',
    'been',
    'there',
    'no',
    'or',
    'from',
    'has',
    'any',
    'i',
    'would',
    'were',
    'had',
    'are',
    'if',
    'also',
    'before',
    'but',
    'his',
    'other',
    'those',
    'so',
    'he',
    'did',
    'its',
    'she',
    'hers',
    'their',
    'theirs'
}

In [7]:
embeddings = load_vectors('/home/danlocke/fastText/filtered-100d.vec')
embeddings = {k: v for k, v in embeddings.items() if k.islower() and k.isalpha()}
term_embeddings = [x for x in embeddings.values()]
term_lookup = [x for x in embeddings.keys()]

In [8]:
phrase_embeddings = load_vectors('/home/danlocke/fastText/para-phrase-100d.vec')

In [9]:
phrase_embeddings = filter_phrases(phrase_embeddings, '_')
phrase_embeddings_vecs = [x for x in phrase_embeddings.values()]
phrase_lookup = [x for x in phrase_embeddings.keys()]

In [10]:
scores = load_phrase_scores('expansion-changes.txt', 'ndcg')

In [11]:
def n_gram(tokens: List[str], n: int): 
    ret = []
    for i in range(0, len(tokens)-(n-1)):
           ret.append('_'.join(tokens[i:i+n]))
           
    return ret 

def get_phrases(text: str, phrase_lookup: Dict[str, np.array]): 
    found = {}
    tokens = [x for x in text.lower().replace(',', '').replace('?', '').replace('\'', '').replace('/', '').replace('’', '').split() if x not in stop]
    for grams in [n_gram(tokens, 2), n_gram(tokens, 3)]:
        for gram in grams:
            if gram in phrase_lookup:
                found[gram] = 1
                
    return found

In [12]:
ind = annoy.AnnoyIndex(100, 'angular')
for i, x in enumerate(phrase_embeddings_vecs):
    ind.add_item(i, x) 
ind.build(1024)

True

In [14]:
qry_phrases = {}
for topic in topics:
    phrases = get_phrases(topics[topic]['topic'], phrase_embeddings)
    phrases = [(x, phrase_embeddings[x], [phrase_lookup[y] for y in ind.get_nns_by_vector(phrase_embeddings[x], 25)]) for x in phrases]
    qry_phrases[topic] = phrases

In [34]:
X = []
y = []

def features(topic, phrases, qry_phrases):
    out = []
    topic = int(topic)
    qry_toks = [x for x in topics[topic]['topic'].lower().replace(',', '').replace('?', '').replace('\'', '').replace('/', '').replace('’', '').split() if x not in stop]
    qry_vec = embed(qry_toks, embeddings).reshape(1, -1)
    for phrase_tuple in phrases:
        phrase_vec = phrase_embeddings[phrase_tuple[0]].reshape(1, -1)
        
        phrase_toks = set(phrase_tuple[0].split('_'))
        
        phrase_tok_vec = embed(phrase_toks, embeddings).reshape(1, -1)
        matched_vec = None 
        matched_tok_vec = None 
        diff_toks_vec = None 
        num_diff_toks = 0
        num_phrase_toks = len(phrase_toks)
        orig_phrase = None 
        
        min_qry_phrase_sim = None
        max_qry_phrase_sim = None 
        
        num_top_match = 0 
        
        for x in qry_phrases[topic]:
            if phrase_tuple[0] in x[2]:
                num_top_match += 1 
                    
                # the phrase is a potential expansion of this original query phrase

                orig_phrase = x[0]
                matched_vec = phrase_embeddings[x[0]].reshape(1, -1)
                matched_toks = set(x[0].split('_'))
                matched_tok_vec = embed(matched_toks, embeddings).reshape(1, -1)
                
                if min_qry_phrase_sim is None: 
                    min_qry_phrase_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
                else:
                    s = cosine_similarity(phrase_vec, matched_vec)[0][0]
                    if s > min_qry_phrase_sim:
                        min_qry_phrase_sim = s 
                        
                if max_qry_phrase_sim is None: 
                    max_qry_phrase_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
                else:
                    s = cosine_similarity(phrase_vec, matched_vec)[0][0]
                    if s > max_qry_phrase_sim:
                        max_qry_phrase_sim = s 

                diff_toks = matched_toks - phrase_toks
                num_diff_toks = len(diff_toks)
                diff_toks_vec = embed(diff_toks, embeddings).reshape(1, -1)
             
        num_top_match = float(num_top_match) / float(len(qry_phrases[topic]))
                
        phrase_exp_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
        phrase_exp_token_sim = cosine_similarity(phrase_tok_vec, matched_tok_vec)[0][0]
        qry_exp_sim = cosine_similarity(qry_vec, phrase_tok_vec)[0][0]
        diff_toks_sim = cosine_similarity(diff_toks_vec, matched_tok_vec)[0][0]
        dist = nltk.edit_distance(phrase_tuple[0], x[0])
        
#         print(phrase_exp_sim, phrase_exp_token_sim, qry_exp_sim, dist, num_phrase_toks, num_diff_toks, num_top_match, phrase_tuple[1], phrase_tuple[0], orig_phrase)
        out.append((qry_exp_sim, dist, num_phrase_toks, num_diff_toks, num_top_match, min_qry_phrase_sim, max_qry_phrase_sim,))
    
    return out
    

for topic, phrases in scores.items():
    topic = int(topic)
    qry_toks = [x for x in topics[topic]['topic'].lower().replace(',', '').replace('?', '').replace('\'', '').replace('/', '').replace('’', '').split() if x not in stop]
    qry_vec = embed(qry_toks, embeddings).reshape(1, -1)
    for phrase_tuple in phrases:
        phrase_vec = phrase_embeddings[phrase_tuple[0]].reshape(1, -1)
        
        phrase_toks = set(phrase_tuple[0].split('_'))
        
        phrase_tok_vec = embed(phrase_toks, embeddings).reshape(1, -1)
        matched_vec = None 
        matched_tok_vec = None 
        diff_toks_vec = None 
        num_diff_toks = 0
        num_phrase_toks = len(phrase_toks)
        orig_phrase = None 
        
        min_qry_phrase_sim = None
        max_qry_phrase_sim = None 
        
        num_top_match = 0 
        
        for x in qry_phrases[topic]:
            if phrase_tuple[0] in x[2]:
                num_top_match += 1 
                    
                # the phrase is a potential expansion of this original query phrase

                orig_phrase = x[0]
                matched_vec = phrase_embeddings[x[0]].reshape(1, -1)
                matched_toks = set(x[0].split('_'))
                matched_tok_vec = embed(matched_toks, embeddings).reshape(1, -1)
                
                if min_qry_phrase_sim is None: 
                    min_qry_phrase_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
                else:
                    s = cosine_similarity(phrase_vec, matched_vec)[0][0]
                    if s > min_qry_phrase_sim:
                        min_qry_phrase_sim = s 
                        
                if max_qry_phrase_sim is None: 
                    max_qry_phrase_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
                else:
                    s = cosine_similarity(phrase_vec, matched_vec)[0][0]
                    if s > max_qry_phrase_sim:
                        max_qry_phrase_sim = s 

                diff_toks = matched_toks - phrase_toks
                num_diff_toks = len(diff_toks)
                diff_toks_vec = embed(diff_toks, embeddings).reshape(1, -1)
             
        num_top_match = float(num_top_match) / float(len(qry_phrases[topic]))
                
        phrase_exp_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
        phrase_exp_token_sim = cosine_similarity(phrase_tok_vec, matched_tok_vec)[0][0]
        qry_exp_sim = cosine_similarity(qry_vec, phrase_tok_vec)[0][0]
        diff_toks_sim = cosine_similarity(diff_toks_vec, matched_tok_vec)[0][0]
        dist = nltk.edit_distance(phrase_tuple[0], x[0])
        
#         print(phrase_exp_sim, phrase_exp_token_sim, qry_exp_sim, dist, num_phrase_toks, num_diff_toks, num_top_match, phrase_tuple[1], phrase_tuple[0], orig_phrase)
        X.append((qry_exp_sim, dist, num_phrase_toks, num_diff_toks, num_top_match, min_qry_phrase_sim, max_qry_phrase_sim,))
        y.append(phrase_tuple[1])
            
#     break

In [31]:
# print(len(X))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, n_jobs=8)
# clf = make_pipeline(StandardScaler(), SVC())
clf.fit(X, y)

In [56]:
clf.score(X, y)

0.6167168674698795

In [57]:
topic_features = features('1', scores['1'], qry_phrases)
predicted = clf.predict(topic_features)

In [58]:
print(topic_features)

[(0.7650007, 16, 2, 1, 0.5, 0.8511563, 0.8511563), (0.698878, 18, 2, 2, 0.5, 0.83666945, 0.83666945), (0.66542274, 16, 2, 1, 0.5, 0.82565606, 0.82565606), (0.7586659, 18, 2, 1, 0.5, 0.8432792, 0.8432792), (0.7763271, 10, 3, 0, 0.5, 0.94302297, 0.94302297), (0.78233033, 10, 3, 0, 0.5, 0.95130545, 0.95130545), (0.7615414, 16, 2, 1, 0.5, 0.8150523, 0.8150523), (0.7853027, 17, 2, 1, 0.5, 0.81310725, 0.81310725), (0.7103939, 6, 2, 1, 0.5, 0.8461931, 0.8461931), (0.7971319, 16, 2, 1, 0.5, 0.8230698, 0.8230698), (0.7248093, 15, 2, 1, 0.5, 0.8413123, 0.8413123), (0.80394316, 16, 2, 0, 0.5, 0.87822735, 0.87822735), (0.8068176, 15, 2, 1, 0.5, 0.83948386, 0.83948386), (0.710752, 15, 2, 1, 0.5, 0.83719003, 0.83719003), (0.7657465, 3, 2, 1, 0.5, 0.94904065, 0.94904065), (0.81424856, 18, 2, 0, 0.5, 1.0, 1.0), (0.7367139, 17, 2, 1, 0.5, 0.86503106, 0.86503106), (0.7697695, 17, 2, 1, 0.5, 0.8303156, 0.8303156), (0.7674389, 16, 2, 1, 0.5, 0.83646023, 0.83646023), (0.7721431, 17, 2, 1, 0.5, 0.8238973, 0

In [59]:
print(scores['1'])
print(predicted)

[('bring_company', -1), ('liquidators_companies', -1), ('brewing_company', -1), ('reinstate_company', 1), ('insolvent_company_liquidation', 1), ('company_voluntary_liquidation', 1), ('putting_company', -1), ('bringing_company', -1), ('voluntary_liquidation', -1), ('taking_company', -1), ('keeping_company', -1), ('liquidation_company', -1), ('liquidate_company', -1), ('solvent_company', -1), ('companies_liquidation', -1), ('reinstating_company', -1), ('insolvent_company', -1), ('reducing_company', -1), ('giving_company', -1), ('allowing_company', -1), ('company_liquidation_pay', -1), ('company_liquidations', -1), ('choosing_company', -1), ('leaving_company', -1), ('liquidating_company', -1), ('company_liquidator', -1), ('companies_into_liquidation', -1), ('company_liquidators', -1), ('company_insolvent', -1), ('suing_company', -1), ('liquidator_company', -1), ('abetting_company', -1), ('company_liquidation', -1), ('company_into_liquidation', -1), ('paying_company', -1), ('liquidated_com

In [60]:
for i in range(len(predicted)):
    print(scores['1'][i], predicted[i])

('bring_company', -1) -1
('liquidators_companies', -1) 1
('brewing_company', -1) -1
('reinstate_company', 1) -1
('insolvent_company_liquidation', 1) -1
('company_voluntary_liquidation', 1) -1
('putting_company', -1) -1
('bringing_company', -1) -1
('voluntary_liquidation', -1) -1
('taking_company', -1) -1
('keeping_company', -1) -1
('liquidation_company', -1) -1
('liquidate_company', -1) -1
('solvent_company', -1) -1
('companies_liquidation', -1) 0
('reinstating_company', -1) 1
('insolvent_company', -1) -1
('reducing_company', -1) -1
('giving_company', -1) -1
('allowing_company', -1) -1
('company_liquidation_pay', -1) -1
('company_liquidations', -1) 0
('choosing_company', -1) -1
('leaving_company', -1) -1
('liquidating_company', -1) -1
('company_liquidator', -1) 0
('companies_into_liquidation', -1) -1
('company_liquidators', -1) 0
('company_insolvent', -1) -1
('suing_company', -1) -1
('liquidator_company', -1) -1
('abetting_company', -1) 1
('company_liquidation', -1) 1
('company_into_li