In [1]:
from plotlib.loaders import *

import pandas as pd
from typing import Dict, List, Tuple
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import annoy
import nltk
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import math

In [2]:
def load_phrase_scores(path: str, feature: str) -> Dict[str, Tuple[str, int]]: 
    ind = {
        'recip_rank': 2,
        'recall_20': 3,
        'recall_100': 4,
        'ndcg': 5,
        'rbp@0.10': 6,
        'rbp@0.50': 7,
        'rbp@0.80': 8,
    }[feature]
    
    out = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            if parts[0] not in out: 
                out[parts[0]] = []
            v = float(parts[ind])
            inc = 1 
            if v < 0.0:
                inc = 0 
#             elif v == 0.0:
#                 inc = 0 
            out[parts[0]].append((parts[1], inc))
            
    return out

In [3]:
def cnt_increases(path: str):
    increases = [0]*7
    decreases = [0]*7
    num = 0
    with open(path) as f:
        for line in f:
            num+=1
            parts = line.split()
            for i, p in enumerate(parts[2:]):
                v = float(p)
                if v > 0.0: 
                    increases[i-2] += 1
                elif v < 0.0:
                    decreases[i-2] += 1
    return increases, decreases, num

In [4]:
inc, dec, num = cnt_increases('expansion-changes.txt')

In [5]:
list(zip(inc, dec))

[(340, 160),
 (605, 3032),
 (285, 2494),
 (469, 2850),
 (585, 2961),
 (407, 2688),
 (393, 2019)]

In [6]:
pd.options.display.float_format = lambda x : '{:.0f}'.format(x) if int(x) == x else '{:,.4f}'.format(x)
print((pd.DataFrame(list(zip(inc, dec)), columns=['+', '-'], index=['RR', 'R@20', 'R@100', 'NDCG', 'RBP@0.1', 'RBP@0.5', 'RBP@0.8'])/num).to_latex())

\begin{tabular}{lrr}
\toprule
{} &      + &      - \\
\midrule
RR      & 0.0853 & 0.0402 \\
R@20    & 0.1519 & 0.7610 \\
R@100   & 0.0715 & 0.6260 \\
NDCG    & 0.1177 & 0.7154 \\
RBP@0.1 & 0.1468 & 0.7432 \\
RBP@0.5 & 0.1022 & 0.6747 \\
RBP@0.8 & 0.0986 & 0.5068 \\
\bottomrule
\end{tabular}



In [7]:
topics = load_queries('/home/danlocke/go/src/github.com/dan-locke/phd-data/case-topics.json')

In [4]:
def load_vectors(path: str):
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        f.readline()
        for line in f:
            values = line.rstrip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings[word] = coefs

    return embeddings

class EmbMethod:
    SUM = 0
    MEAN = 1 
    CF_SUM = 2 
    CF_MEAN = 3

def embed(tokens: List[str], embeddings, dim: int=100, method: EmbMethod = EmbMethod.SUM, coll_stats = None) -> np.array:
    e = []
    if method > EmbMethod.MEAN: 
        for x in tokens: 
            if x in embeddings:
                mul = coll_stats[x] if x in coll_stats else 1.0
                e.append(mul * embeddings[x])
    else: 
        e = [embeddings[x] for x in tokens if x in embeddings]
    if len(e) == 0:
        return np.zeros((dim,), dtype='float32')
    
    e = np.sum(e, axis=0)
    if method == EmbMethod.MEAN or method == EmbMethod.CF_MEAN:
        e /= len(tokens)
            
    return e

In [5]:
find = {
    'first', 
    'second', 
    'third',
    'fourth',
    'fifth',
    'sixth',
    'seventh',
    'eighth',
    'ninth',
    'tenth',
    'eleventh',
    'twelfth',
    'thirteenth',
    'fourteenth',
    'fifteenth',
    'sixteenth',
    'seventeenth',
    'eighteenth',
    'nineteenth',
    'twenty',
    'thirty',
    'fourty',
}

num_exclusion = {
    'amendment',
    'degree',
    'refusal'
}

find2 = {
    'north',
    'south',
    'east',
    'west'
}

find3 = {
    'set',
    'hold',
    'exemplary',
    'intending',
    'include',
    'including',
    'relies',
    'relied',
    'pursuant',
    'thereto',
    'behalf',
    'giving',
    'give',
    'apparently',
    'subsequently',
    'jurisdictional',
    'included',
    'includes',
    'appellant',
    'respondent',
    'resulting',
    'such',
    's',
    'x',
    't',
    'th',
    'thing',
    're',
    'subject'
}

find4 = {
    'nonetheless',
    'follows',
    'referred',
    'thereto',
    'behalf',
    'hastily',
    'instance',
    'instances',
    'such',
    's',
    'applicant',
    'applicants',
    'respondent',
    'respondents',
    'th',
    'x',
    'such',
    'll',
    'only',
    'way',
    're',
    'much',
}

#     consider 'made' and 'make' as exclusions

def filter_phrases(phrases, splitter=' '):
    d = isinstance(phrases, dict)
    if d:
        ret = {}
    else:
        ret = []
    for phrase in phrases:
        toks = phrase.split(splitter)
        if len(toks) < 2:
            continue 
        
        cont = False 
        cnt = 0

        if toks[-1] in find3 or len(toks[-1]) == 1:
            continue
        
        if len(toks) == 2 and toks[0].startswith('claim'):
            continue
            
        if len(toks) == 3 and toks[0] == 'intention' and toks[1] != 'to':
            continue
            
        if toks[0] == 'intention' and len(toks) == 2:
            continue
            
        if toks[0] in find4 or len(toks[0]) == 1:
            continue
            
        unique = set()
            
        num = False 
        num_ex = False
        for t, tok in enumerate(toks):
            unique.add(tok)
            
            if tok in find: 
                num = True 
            if tok in num_exclusion:
                num_ex = True
            if tok in find2:
                cnt += 1 
            if tok == 'nunc' and t != 0:
                cont = True 
                break
                
        if num and not num_ex: 
            continue
        
        if len(toks) > len(unique): 
            continue
    
        if cont or cnt > 1:
            continue 
            
        if d: 
            ret[phrase] = phrases[phrase]
        else:
            ret.append(phrase)
        
    return ret 

In [6]:
stop = {
    'the',
    'of',
    'to',
    'in', 
    'for',
    'that',
    'and',
    'on',
    'is',
    'be',
    'by',
    'a',
    'an',
    'was',
    'it',
    'as',
    'this',
    'which',
    'with',
    'have',
    'at',
    'been',
    'there',
    'no',
    'or',
    'from',
    'has',
    'any',
    'i',
    'would',
    'were',
    'had',
    'are',
    'if',
    'also',
    'before',
    'but',
    'his',
    'other',
    'those',
    'so',
    'he',
    'did',
    'its',
    'she',
    'hers',
    'their',
    'theirs'
}

In [7]:
embeddings = load_vectors('/home/danlocke/fastText/filtered-100d.vec')
embeddings = {k: v for k, v in embeddings.items() if k.islower() and k.isalpha()}
term_embeddings = [x for x in embeddings.values()]
term_lookup = [x for x in embeddings.keys()]

In [8]:
phrase_embeddings = load_vectors('/home/danlocke/fastText/para-phrase-100d.vec')

In [9]:
phrase_embeddings = filter_phrases(phrase_embeddings, '_')
phrase_embeddings_vecs = [x for x in phrase_embeddings.values()]
phrase_lookup = [x for x in phrase_embeddings.keys()]

In [164]:
scores = load_phrase_scores('expansion-changes.txt', 'ndcg')

In [11]:
def n_gram(tokens: List[str], n: int): 
    ret = []
    for i in range(0, len(tokens)-(n-1)):
           ret.append('_'.join(tokens[i:i+n]))
           
    return ret 

def get_phrases(text: str, phrase_lookup: Dict[str, np.array]): 
    found = {}
    tokens = [x for x in text.lower().replace(',', '').replace('?', '').replace('\'', '').replace('/', '').replace('’', '').split() if x not in stop]
    for grams in [n_gram(tokens, 2), n_gram(tokens, 3)]:
        for gram in grams:
            if gram in phrase_lookup:
                found[gram] = 1
                
    return found

In [12]:
ind = annoy.AnnoyIndex(100, 'angular')
for i, x in enumerate(phrase_embeddings_vecs):
    ind.add_item(i, x) 
ind.build(1024)

True

In [14]:
qry_phrases = {}
for topic in topics:
    phrases = get_phrases(topics[topic]['topic'], phrase_embeddings)
    phrases = [(x, phrase_embeddings[x], [phrase_lookup[y] for y in ind.get_nns_by_vector(phrase_embeddings[x], 25)]) for x in phrases]
    qry_phrases[topic] = phrases

In [194]:
def load_coll_stats(path: str, total=None) -> Dict[str, float]:
    stats = {}
    
    idf = total is None
    if idf:
        total = 0.0
    with open(path) as f:
        for line in f:
            parts = line.split()
            if idf: 
                v = float(parts[-1])
                total += v
                stats[' '.join(parts[:-1])] = v
            else:
                stats[' '.join(parts[:-1])] = math.log(total / float(parts[-1])+1.0) + 1.0
    
    if idf: 
        for k, v in stats.items():
            stats[k] = math.log(total / v+1.0) + 1.0

    return stats

In [195]:
coll_stats = load_coll_stats('/home/danlocke/phd-generated/filtered-stop-top-tokens.txt')

In [203]:
X = []
y = []

def features(topic, phrases, qry_phrases):
    out = []
    topic = int(topic)
    qry_toks = [x for x in topics[topic]['topic'].lower().replace(',', '').replace('?', '').replace('\'', '').replace('/', '').replace('’', '').split() if x not in stop]
    qry_vec = embed(qry_toks, embeddings).reshape(1, -1)
    for phrase_tuple in phrases:
        phrase_vec = phrase_embeddings[phrase_tuple[0]].reshape(1, -1)
        
        phrase_toks = set(phrase_tuple[0].split('_'))
        
        phrase_tok_vec = embed(phrase_toks, embeddings).reshape(1, -1)
        matched_vec = None 
        matched_tok_vec = None 
        diff_toks_vec = None 
        num_diff_toks = 0
        num_phrase_toks = len(phrase_toks)
        orig_phrase = None 
        
        min_qry_phrase_sim = None
        max_qry_phrase_sim = None 
        
        num_top_match = 0 
        
        for x in qry_phrases[topic]:
            if phrase_tuple[0] in x[2]:
                num_top_match += 1 
                    
                # the phrase is a potential expansion of this original query phrase

                orig_phrase = x[0]
                matched_vec = phrase_embeddings[x[0]].reshape(1, -1)
                matched_toks = set(x[0].split('_'))
                matched_tok_vec = embed(matched_toks, embeddings).reshape(1, -1)
                
                if min_qry_phrase_sim is None: 
                    min_qry_phrase_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
                else:
                    s = cosine_similarity(phrase_vec, matched_vec)[0][0]
                    if s > min_qry_phrase_sim:
                        min_qry_phrase_sim = s 
                        
                if max_qry_phrase_sim is None: 
                    max_qry_phrase_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
                else:
                    s = cosine_similarity(phrase_vec, matched_vec)[0][0]
                    if s > max_qry_phrase_sim:
                        max_qry_phrase_sim = s 

                diff_toks = matched_toks - phrase_toks
                num_diff_toks = len(diff_toks)
                diff_toks_vec = embed(diff_toks, embeddings).reshape(1, -1)
             
        num_top_match = float(num_top_match) / float(len(qry_phrases[topic]))
                
        phrase_exp_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
        phrase_exp_token_sim = cosine_similarity(phrase_tok_vec, matched_tok_vec)[0][0]
        qry_exp_sim = cosine_similarity(qry_vec, phrase_tok_vec)[0][0]
        diff_toks_sim = cosine_similarity(diff_toks_vec, matched_tok_vec)[0][0]
        dist = nltk.edit_distance(phrase_tuple[0], x[0])
        
#         print(phrase_exp_sim, phrase_exp_token_sim, qry_exp_sim, dist, num_phrase_toks, num_diff_toks, num_top_match, phrase_tuple[1], phrase_tuple[0], orig_phrase)
        out.append((qry_exp_sim, dist, num_phrase_toks, num_diff_toks, num_top_match, min_qry_phrase_sim, max_qry_phrase_sim,))
    
    return out
    
method=EmbMethod.SUM

for topic, phrases in scores.items():
    topic = int(topic)
    if topic == 1:
        continue
    qry_toks = [x for x in topics[topic]['topic'].lower().replace(',', '').replace('?', '').replace('\'', '').replace('/', '').replace('’', '').split() if x not in stop]
    qry_vec = embed(qry_toks, embeddings, method=method, coll_stats=coll_stats).reshape(1, -1)
    for phrase_tuple in phrases:
        phrase_vec = phrase_embeddings[phrase_tuple[0]].reshape(1, -1)
        
        phrase_toks = set(phrase_tuple[0].split('_'))
        
        phrase_tok_vec = embed(phrase_toks, embeddings, method=method, coll_stats=coll_stats).reshape(1, -1)
        matched_vec = None 
        matched_tok_vec = None 
        diff_toks_vec = None 
        num_diff_toks = 0
        num_phrase_toks = len(phrase_toks)
        orig_phrase = None 
        
        min_qry_phrase_sim = None
        max_qry_phrase_sim = None 
        
        num_top_match = 0 
        
        for x in qry_phrases[topic]:
            if phrase_tuple[0] in x[2]:
                num_top_match += 1 
                    
                # the phrase is a potential expansion of this original query phrase

                orig_phrase = x[0]
                matched_vec = phrase_embeddings[x[0]].reshape(1, -1)
                matched_toks = set(x[0].split('_'))
                matched_tok_vec = embed(matched_toks, embeddings, method=method, coll_stats=coll_stats).reshape(1, -1)
                
                if min_qry_phrase_sim is None: 
                    min_qry_phrase_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
                else:
                    s = cosine_similarity(phrase_vec, matched_vec)[0][0]
                    if s > min_qry_phrase_sim:
                        min_qry_phrase_sim = s 
                        
                if max_qry_phrase_sim is None: 
                    max_qry_phrase_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
                else:
                    s = cosine_similarity(phrase_vec, matched_vec)[0][0]
                    if s > max_qry_phrase_sim:
                        max_qry_phrase_sim = s 

                diff_toks = matched_toks - phrase_toks
                num_diff_toks = len(diff_toks)
                diff_toks_vec = embed(diff_toks, embeddings, method=method, coll_stats=coll_stats).reshape(1, -1)
             
        num_top_match = float(num_top_match) / float(len(qry_phrases[topic]))
                
        phrase_exp_sim = cosine_similarity(phrase_vec, matched_vec)[0][0]
        phrase_exp_token_sim = cosine_similarity(phrase_tok_vec, matched_tok_vec)[0][0]
        qry_exp_sim = cosine_similarity(qry_vec, phrase_tok_vec)[0][0]
        diff_toks_sim = cosine_similarity(diff_toks_vec, matched_tok_vec)[0][0]
        diff_toks_qry_sim = cosine_similarity(diff_toks_vec, qry_vec)[0][0]
        dist = nltk.edit_distance(phrase_tuple[0], x[0])
        
        
        
        keep = True    
        if phrase_exp_sim < 0.75 and dist > 2:
            keep = False
#                 continue 

        if phrase_exp_sim < 0.7:
            keep = False 
#                 continue

        if diff_toks_sim == 0.0:
            keep = True
        elif diff_toks_sim < 0.5:
            keep = False
            
        if qry_exp_sim < 0.75:
            keep = False
            
        if num_diff_toks == 0:
            keep = True
        if phrase_exp_token_sim > 0.9:
            keep = True
        
#         if keep and phrase_tuple[1] == 0:
#             print('False positive -', phrase_exp_sim, phrase_exp_token_sim, qry_exp_sim, diff_toks_sim, diff_toks_qry_sim, num_top_match, dist, num_phrase_toks, num_diff_toks, phrase_tuple[1], phrase_tuple[0], orig_phrase, keep)
#         elif not keep and phrase_tuple[1] == 1:
        print(phrase_exp_sim, phrase_exp_token_sim, qry_exp_sim, diff_toks_sim, diff_toks_qry_sim, num_top_match, dist, num_phrase_toks, num_diff_toks, phrase_tuple[1], phrase_tuple[0], orig_phrase, keep)
#         X.append((qry_exp_sim, dist, num_phrase_toks, num_diff_toks, num_top_match, min_qry_phrase_sim, max_qry_phrase_sim,))
#         y.append(phrase_tuple[1])
            
    if topic > 8: 
        break 
#     break

0.82154965 0.78118753 0.6857758 0.8264465 0.78699005 0.5 14 2 1 1 expiration_date variation_date False
0.9468368 1.0000001 0.8685207 0.0 0.0 0.5 10 2 0 1 settlement_date date_settlement True
0.8192368 0.8353799 0.7669248 0.8459365 0.72736233 0.5 9 3 1 1 months_after_settlement date_settlement True
0.8968439 0.9050517 0.7910331 0.8459365 0.72736233 0.5 11 2 1 1 settlement_dates date_settlement True
0.81259847 0.737557 0.6482811 0.8459365 0.72736233 0.5 3 2 1 1 net_settlement date_settlement False
0.74921834 0.80630344 0.7013704 0.8264465 0.78699005 0.5 12 2 1 1 cessation_date variation_date False
0.7610389 0.80080897 0.69828653 0.8264465 0.78699005 0.5 13 2 1 1 calculation_date variation_date False
0.88468426 1.0 0.8952483 0.0 0.0 0.5 9 2 0 0 date_variation variation_date True
0.8156795 0.7689801 0.73492855 0.8459365 0.72736233 0.5 4 2 1 0 buyer_settlement date_settlement False
0.7290616 0.8775606 0.80433923 0.8264465 0.78699005 0.5 11 2 1 0 vary_date variation_date False
0.8589423 0.80

In [166]:
# print(len(X))

In [167]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier

parameters = {'clf__kernel':('linear', 'rbf'), 'clf__C':[1, 10, ], 'clf__gamma': [0.001, 0.01, 1, 10, 100]}
# parameters =  {'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#         'clf__C': np.logspace(-3, 1, 5),} # for log_reg
# parameters = {'clf__splitter': ('best', 'random'), 'clf__criterion': ('gini', 'entropy')} # for decision tree
# parameters = {'clf__var_smoothing': [1e-9, 1e-2]}

pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', SVC()),
])
clf = GridSearchCV(pipe, parameters, n_jobs=8, verbose=2)
# clf = make_pipeline(StandardScaler(), SVC())
clf.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   14.6s finished


GridSearchCV(estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('clf', SVC())]),
             n_jobs=8,
             param_grid={'clf__C': [1, 10],
                         'clf__gamma': [0.001, 0.01, 1, 10, 100],
                         'clf__kernel': ('linear', 'rbf')},
             verbose=2)

In [169]:
clf.cv_results_

{'mean_fit_time': array([0.07072458, 0.10285225, 0.06879096, 0.12561274, 0.0706389 ,
        0.13745017, 0.07015314, 0.19507322, 0.07034044, 0.39453416,
        0.19936719, 0.13692632, 0.1998868 , 0.16001153, 0.20335956,
        0.18846006, 0.20252929, 0.23289852, 0.20129795, 0.43163505]),
 'std_fit_time': array([0.00772153, 0.00508726, 0.00674348, 0.00795802, 0.00648207,
        0.00713088, 0.00728654, 0.00910932, 0.00752077, 0.00730784,
        0.02212769, 0.00603143, 0.02332529, 0.03861135, 0.02109922,
        0.00688194, 0.02067971, 0.01447189, 0.01882749, 0.00653491]),
 'mean_score_time': array([0.01131749, 0.01497307, 0.01005878, 0.01447854, 0.00788898,
        0.0158433 , 0.00818162, 0.02413769, 0.00913258, 0.06827927,
        0.00797586, 0.01496611, 0.0081707 , 0.01438241, 0.00799994,
        0.01487956, 0.00789981, 0.02201042, 0.00790391, 0.06782985]),
 'std_score_time': array([3.08560881e-03, 5.47546642e-04, 2.96519083e-03, 3.47163768e-04,
        2.17059941e-04, 3.65958356e-

In [170]:
clf.score(X, y)

0.7593813387423936

In [171]:
topic_features = features('1', scores['1'], qry_phrases)
predicted = clf.predict(topic_features)

In [172]:
print(topic_features)

[(0.7650007, 16, 2, 1, 0.5, 0.8511563, 0.8511563), (0.698878, 18, 2, 2, 0.5, 0.83666945, 0.83666945), (0.66542274, 16, 2, 1, 0.5, 0.82565606, 0.82565606), (0.7586659, 18, 2, 1, 0.5, 0.8432792, 0.8432792), (0.7763271, 10, 3, 0, 0.5, 0.94302297, 0.94302297), (0.78233033, 10, 3, 0, 0.5, 0.95130545, 0.95130545), (0.7615414, 16, 2, 1, 0.5, 0.8150523, 0.8150523), (0.7853027, 17, 2, 1, 0.5, 0.81310725, 0.81310725), (0.7103939, 6, 2, 1, 0.5, 0.8461931, 0.8461931), (0.7971319, 16, 2, 1, 0.5, 0.8230698, 0.8230698), (0.7248093, 15, 2, 1, 0.5, 0.8413123, 0.8413123), (0.80394316, 16, 2, 0, 0.5, 0.87822735, 0.87822735), (0.8068176, 15, 2, 1, 0.5, 0.83948386, 0.83948386), (0.710752, 15, 2, 1, 0.5, 0.83719003, 0.83719003), (0.7657465, 3, 2, 1, 0.5, 0.94904065, 0.94904065), (0.81424856, 18, 2, 0, 0.5, 1.0, 1.0), (0.7367139, 17, 2, 1, 0.5, 0.86503106, 0.86503106), (0.7697695, 17, 2, 1, 0.5, 0.8303156, 0.8303156), (0.7674389, 16, 2, 1, 0.5, 0.83646023, 0.83646023), (0.7721431, 17, 2, 1, 0.5, 0.8238973, 0

In [173]:
print(scores['1'])
print(predicted)

[('bring_company', 0), ('liquidators_companies', 0), ('brewing_company', 0), ('reinstate_company', 1), ('insolvent_company_liquidation', 1), ('company_voluntary_liquidation', 1), ('putting_company', 0), ('bringing_company', 0), ('voluntary_liquidation', 0), ('taking_company', 0), ('keeping_company', 0), ('liquidation_company', 0), ('liquidate_company', 0), ('solvent_company', 0), ('companies_liquidation', 0), ('reinstating_company', 0), ('insolvent_company', 0), ('reducing_company', 0), ('giving_company', 0), ('allowing_company', 0), ('company_liquidation_pay', 0), ('company_liquidations', 0), ('choosing_company', 0), ('leaving_company', 0), ('liquidating_company', 0), ('company_liquidator', 0), ('companies_into_liquidation', 0), ('company_liquidators', 0), ('company_insolvent', 0), ('suing_company', 0), ('liquidator_company', 0), ('abetting_company', 0), ('company_liquidation', 0), ('company_into_liquidation', 0), ('paying_company', 0), ('liquidated_company', 0), ('running_company', 0

In [174]:
for i in range(len(predicted)):
    print(scores['1'][i], predicted[i])

('bring_company', 0) 0
('liquidators_companies', 0) 0
('brewing_company', 0) 0
('reinstate_company', 1) 0
('insolvent_company_liquidation', 1) 0
('company_voluntary_liquidation', 1) 0
('putting_company', 0) 0
('bringing_company', 0) 0
('voluntary_liquidation', 0) 0
('taking_company', 0) 0
('keeping_company', 0) 0
('liquidation_company', 0) 0
('liquidate_company', 0) 0
('solvent_company', 0) 0
('companies_liquidation', 0) 0
('reinstating_company', 0) 0
('insolvent_company', 0) 0
('reducing_company', 0) 0
('giving_company', 0) 0
('allowing_company', 0) 0
('company_liquidation_pay', 0) 0
('company_liquidations', 0) 0
('choosing_company', 0) 0
('leaving_company', 0) 0
('liquidating_company', 0) 0
('company_liquidator', 0) 0
('companies_into_liquidation', 0) 0
('company_liquidators', 0) 0
('company_insolvent', 0) 0
('suing_company', 0) 0
('liquidator_company', 0) 0
('abetting_company', 0) 0
('company_liquidation', 0) 0
('company_into_liquidation', 0) 0
('paying_company', 0) 0
('liquidated_c