<h1> Document Vectorization </h1>

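This experiment builds document vectors from two pretrained word-embedding models, Word2Vec (Google News) and GloVe (Wikipedia + Gigaword), and uses them to retrieve similar documents.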

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import spacy
import en_core_web_md
import string 
import gensim.downloader
import math
import requests
from sklearn.preprocessing import StandardScaler
import time
import os
import json
import ast

In [35]:
# Corpus-size settings and the CourtListener opinions API endpoint.
# NOTE(review): none of these three names is referenced in the cells shown here - confirm they are still needed.
NUM_OF_DOCUMENTS_TRAIN = 1000
NUM_OF_DOCUMENTS_TEST = 500
URL = "https://www.courtlistener.com/api/rest/v3/opinions/"


In [3]:
def get_document(file_name):
    """Return the ``plain_text`` field of a JSON file with newlines replaced by spaces.

    Parameters
    ----------
    file_name : str
        Path of a JSON file whose top-level object has a ``plain_text`` string.

    Returns
    -------
    str
        The ``plain_text`` value with every ``"\\n"`` replaced by a single space.

    Raises
    ------
    OSError, json.JSONDecodeError, KeyError, AttributeError
        When the file cannot be read or parsed, has no ``plain_text`` key, or
        that value is not a string. Nothing is caught here.
    """
    # Explicit UTF-8: without it open() falls back to the platform's locale
    # encoding, so the same file could decode differently on another machine.
    with open(file_name, encoding="utf-8") as json_file:
        data = json.load(json_file)
    return data["plain_text"].replace("\n", " ")


In [36]:
# Load the training corpus: one row per readable JSON opinion under data/train/.
# Rows are collected in a list and the DataFrame is built once, instead of
# enlarging it with one df.loc[i] assignment per file.
train_records = []
train_skipped = []
# os.listdir returns names in arbitrary order; sorting keeps row positions
# (which the retrieval functions return as their result) stable between runs.
for file_name in sorted(name for name in os.listdir("data/train/") if name.endswith('.json')):
    try:
        document = get_document("data/train/" + file_name)
        train_records.append({"id": file_name, "document": document.lower()})
    except Exception as e:
        # Still best-effort, but remember what was dropped instead of hiding it.
        train_skipped.append((file_name, repr(e)))

df = pd.DataFrame(train_records, columns=["id", "document"])
if train_skipped:
    print(f"Skipped {len(train_skipped)} unreadable training file(s); details are in train_skipped")

    

In [37]:
# Load the test corpus: one row per readable JSON opinion under data/test/.
# Rows are collected in a list and the DataFrame is built once, instead of
# enlarging it with one df_test.loc[i] assignment per file.
test_records = []
test_skipped = []
# os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
for file_name in sorted(name for name in os.listdir("data/test/") if name.endswith('.json')):
    try:
        document = get_document("data/test/" + file_name)
        test_records.append({"id": file_name, "document": document.lower()})
    except Exception as e:
        # Still best-effort, but remember what was dropped instead of hiding it.
        test_skipped.append((file_name, repr(e)))

df_test = pd.DataFrame(test_records, columns=["id", "document"])
if test_skipped:
    print(f"Skipped {len(test_skipped)} unreadable test file(s); details are in test_skipped")

In [7]:
# Persist the training corpus as a tab-separated file.
# NOTE(review): index=False is not passed, so the row index is written as an extra unnamed first column - confirm readers of this file expect it.
df.to_csv("train_pretrained.csv", sep='\t')

In [8]:
# Persist the test corpus as a tab-separated file.
# NOTE(review): index=False is not passed, so the row index is written as an extra unnamed first column - confirm readers of this file expect it.
df_test.to_csv("test_pretrained.csv", sep='\t')

In [9]:
# NOTE(review): the saved output below lists 'Unnamed: 0' / 'Unnamed: 0.1' columns that the loading cell above never creates - was `df` re-read from a CSV in a cell that is no longer here? Confirm before trusting this output.
df

Unnamed: 0.1,Unnamed: 0,id,document
0,0,174995.json,united states court of appeals ...
1,1,174996.json,united states court of appeals ...
2,2,175074.json,united states court of appeals ...
3,3,175075.json,united states court of appeals ...
4,4,175076.json,united states court of appeals ...
...,...,...,...
1391,1391,198335.json,united states court of appeals\r ...
1392,1392,198336.json,united states court of appeals\r ...
1393,1393,198337.json,[not for publication--not to be cited as prec...
1394,1394,198338.json,[not for publication--not to be cited as prece...


<h2> Google News Word2Vec </h2>

In [10]:
# Pretrained Google News Word2Vec vectors ('...-300' model), loaded through gensim's downloader.
word2vec = gensim.downloader.load('word2vec-google-news-300')

<h2> GloVe </h2>

In [11]:
# Pretrained GloVe vectors (Wikipedia + Gigaword, '...-300' model), loaded through gensim's downloader.
glove = gensim.downloader.load('glove-wiki-gigaword-300')

In [12]:
def get_other_vars(measure, avg, wordEmb):
    """Resolve the label, sort direction and training table for one configuration.

    Parameters
    ----------
    measure : callable
        Compared against ``cosine_similarity`` and ``euclidean_distances``;
        anything else is labelled as manhattan distance.
    avg : bool
        True selects the averaged training table, False the summed one.
    wordEmb : object
        Compared against the global ``word2vec`` and ``glove`` models.

    Returns
    -------
    tuple
        ``(strtype, asc, dftrain)``: a human-readable measure name, the slice
        step callers apply to their argsort result (-1 reverses it so the
        largest similarity comes first, 1 keeps smallest distance first), and
        one of the global tables ``df_avg``, ``df_avg_glove``, ``df_sum_glove``
        or ``df_sum``. ``df_sum`` is the fallback for any other combination.
    """
    # Start from the fallback and override it for the two recognised measures.
    label, step = "manhattan distance", 1
    if measure == cosine_similarity:
        label, step = "cosine similarity", -1
    elif measure == euclidean_distances:
        label = "euclidean distance"

    # Each table is a global that is only looked up on the branch that needs it.
    averaged = avg == True
    if averaged and (wordEmb == word2vec):
        return label, step, df_avg
    if averaged and (wordEmb == glove):
        return label, step, df_avg_glove
    if (avg == False) and (wordEmb == glove):
        return label, step, df_sum_glove
    return label, step, df_sum

<h3> Function for fitting training sets for average and sum of vectors </h3>

In [33]:
def fit_training(model):
    """Build summed and averaged 300-d document vectors for every row of the global ``df``.

    For each value in ``df['document']`` punctuation is removed, the text is
    split on whitespace, and the embeddings of the tokens found in ``model``
    (lower-cased form first, then the token as written) are added together.

    Parameters
    ----------
    model : object
        Embedding lookup exposing ``key_to_index`` and ``model[token]``
        (the notebook passes gensim models here).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_sum, df_avg)``, one row per document and 300 columns each.
        The average divides by the total number of tokens, so tokens without
        an embedding still count in the denominator. A document with no
        tokens gets an all-zero row in both frames.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    sum_rows = []
    avg_rows = []

    for text in df['document']:
        # split() with no argument already ignores leading/trailing whitespace.
        words = text.translate(strip_punctuation).split()
        word_vec = np.zeros((300, ))
        for word in words:
            key = word.lower()
            if key in model.key_to_index:
                word_vec += model[key]
            elif word in model.key_to_index:
                word_vec += model[word]
        sum_rows.append(word_vec)
        # Empty documents get a full zero vector; a scalar 0 here would create
        # a one-column row and fill the rest of that row with NaN.
        avg_rows.append(word_vec / len(words) if words else np.zeros((300, )))

    # Build each frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(sum_rows), pd.DataFrame(avg_rows)

In [14]:
def get_idx(type, n, document, model, avg):
    """Rank training documents against an unweighted query vector.

    The query is stripped of punctuation, split on whitespace, and the
    embeddings of the tokens found in ``model`` (lower-cased form first, then
    the token as written) are added into one 300-d vector.

    Parameters
    ----------
    type : callable
        Pairwise measure, called as ``type(query_frame, training_frame)``.
        The name shadows the builtin inside this function.
    n : int
        Number of results to keep.
    document : str
        Query text.
    model : object
        Embedding lookup exposing ``key_to_index`` and ``model[token]``.
    avg : bool
        When True the summed vector is divided by the total token count.

    Returns
    -------
    tuple
        ``(indexes, strtype)``: the first ``n`` row positions of the training
        table in the order given by ``get_other_vars``, and the measure label.
    """
    words = document.translate(str.maketrans('', '', string.punctuation)).split()
    word_vec = np.zeros((300, ))

    for word in words:
        key = word.lower()
        if key in model.key_to_index:
            word_vec += model[key]
        elif word in model.key_to_index:
            word_vec += model[word]

    # `== True` is kept so the flag is read exactly as get_other_vars reads it.
    # The `words` guard stops an empty query from becoming a 0/0 NaN vector.
    if avg == True and words:
        word_vec = word_vec / len(words)

    # One-row frame built directly; DataFrame.append was removed in pandas 2.0.
    df_q = pd.DataFrame([word_vec])

    strtype, asc, dftrain = get_other_vars(type, avg, model)
    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc][:n]

    return indexes, strtype

In [15]:
def dict_to_df(dict, csv_name):
    """Write a mapping to ``<csv_name>.csv`` as tab-separated (key, value) rows.

    The ``.csv`` suffix is appended here, so pass the name without it. The
    parameter named ``dict`` shadows the builtin inside this function.
    """
    pairs = pd.DataFrame(dict.items())
    target = csv_name + ".csv"
    pairs.to_csv(target, sep='\t')

def df_to_dict(csv_name):
    """Read a tab-separated file and return ``{key: [other column values]}``.

    The column named ``'0'`` supplies the keys; every remaining column
    contributes one list element per key, in column order. Unlike
    ``dict_to_df``, the file name is used exactly as given.
    """
    table = pd.read_csv(csv_name, sep='\t')
    keyed = table.set_index('0')
    return keyed.T.to_dict('list')

<h3>  Inverse document frequency - IDF </h3>

In [16]:
# Load precomputed IDF weights from disk (the cell that would compute them is commented out below).
# df_to_dict keeps every non-key column, so each entry is a list; the lookups further down read element [1].
idf = df_to_dict("idf.csv")

  dict = data_frame.set_index('0').T.to_dict('list')


In [17]:
# idf = {}
# for i in range(len(df.iloc[:, 1])):
#     tokens = df.iloc[i, 1].translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip().split()
#     for w in tokens:
#         try:
#             idf[w.lower()].add(i)
#         except:
#             idf[w.lower()] = {i}

# for key, value in idf2.items():
#     idf[key] = math.log(len(df.iloc[:, 0]) / (len(value)+1))

In [18]:
def get_idx_idf(type, n, document, model, avg):
    """Rank training documents against an IDF-weighted query vector.

    Works like ``get_idx``, except each token's embedding is multiplied by
    ``idf[token][1]`` before it is added to the 300-d query vector. Tokens
    whose lower-cased form is not in the global ``idf`` table get weight 1.

    NOTE(review): the training tables built by ``fit_training`` are
    unweighted - confirm that comparing a weighted query against them is intended.

    Parameters
    ----------
    type : callable
        Pairwise measure, called as ``type(query_frame, training_frame)``.
        The name shadows the builtin inside this function.
    n : int
        Number of results to keep.
    document : str
        Query text.
    model : object
        Embedding lookup exposing ``key_to_index`` and ``model[token]``.
    avg : bool
        When True the weighted sum is divided by the total token count.

    Returns
    -------
    tuple
        ``(indexes, strtype)``: the first ``n`` row positions of the training
        table in the order given by ``get_other_vars``, and the measure label.
    """
    words = document.translate(str.maketrans('', '', string.punctuation)).split()
    word_vec = np.zeros((300, ))
    for word in words:
        key = word.lower()
        idfCoeff = idf[key][1] if key in idf else 1
        if key in model.key_to_index:
            word_vec += model[key] * idfCoeff
        elif word in model.key_to_index:
            word_vec += model[word] * idfCoeff

    # `== True` is kept so the flag is read exactly as get_other_vars reads it.
    # The `words` guard stops an empty query from becoming a 0/0 NaN vector.
    if avg == True and words:
        word_vec = word_vec / len(words)

    # One-row frame built directly; DataFrame.append was removed in pandas 2.0.
    df_q = pd.DataFrame([word_vec])

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc][:n]

    return indexes, strtype

<h3>  Part-of-speech - POS </h3>

In [19]:
# spaCy English pipeline. In the cells shown here it is only referenced by the commented-out POS/NER table builders.
nlp = spacy.load("en_core_web_md")

In [20]:
# Load the precomputed part-of-speech weight table (the cells that would build it are commented out below).
# The lookups further down parse element [1] of each entry with ast.literal_eval, so it is expected to hold the text of a {tag: weight} dict.
pos = df_to_dict("pos.csv")

  dict = data_frame.set_index('0').T.to_dict('list')


In [21]:
# pos = {}

# for sentence in df.iloc[:, 1]:
#     for token in nlp(sentence.lower()):
#         pos[token.text] = {token.pos_ : None}
        
# for sentence in df_test.iloc[:, 1]:
#     for token in nlp(sentence.lower()):
#         pos[token.text] = {token.pos_ : None}

In [22]:
# for word, posTagAndVal in pos.items():
    
#     for tag in posTagAndVal:
#         if tag == "NOUN":
#             posTagAndVal[tag] = 0.7
#         elif tag == "PROPN": 
#             posTagAndVal[tag] = 0.9
#         elif tag == "ADJ":
#             posTagAndVal[tag] = 0.8
#         elif tag == "ADV":
#             posTagAndVal[tag] = 0.7
#         elif tag == "VERB":
#             posTagAndVal[tag] = 0.6
#         elif tag == "SPACE" or tag == "PUNCT":
#             posTagAndVal[tag] = 0
#         else:
#             posTagAndVal[tag] = 0.4

In [23]:
def get_idx_pos(type, n, document, model, avg):
    """Rank training documents against a part-of-speech-weighted query vector.

    Works like ``get_idx``, except each token's embedding is multiplied by a
    weight from the global ``pos`` table. ``pos[token][1]`` is parsed with
    ``ast.literal_eval`` and the first value of the resulting dict is used.
    Tokens whose lower-cased form is not in ``pos`` get weight 1.

    NOTE(review): the training tables built by ``fit_training`` are
    unweighted - confirm that comparing a weighted query against them is intended.

    Parameters
    ----------
    type : callable
        Pairwise measure, called as ``type(query_frame, training_frame)``.
        The name shadows the builtin inside this function.
    n : int
        Number of results to keep.
    document : str
        Query text.
    model : object
        Embedding lookup exposing ``key_to_index`` and ``model[token]``.
    avg : bool
        When True the weighted sum is divided by the total token count.

    Returns
    -------
    tuple
        ``(indexes, strtype)``: the first ``n`` row positions of the training
        table in the order given by ``get_other_vars``, and the measure label.
    """
    words = document.translate(str.maketrans('', '', string.punctuation)).split()
    word_vec = np.zeros((300, ))
    for word in words:
        key = word.lower()
        posCoeff = 1
        if key in pos:
            posCoeff = next(iter(ast.literal_eval(pos[key][1]).values()))
        if key in model.key_to_index:
            word_vec += model[key] * posCoeff
        elif word in model.key_to_index:
            word_vec += model[word] * posCoeff

    # `== True` is kept so the flag is read exactly as get_other_vars reads it.
    # The `words` guard stops an empty query from becoming a 0/0 NaN vector.
    if avg == True and words:
        word_vec = word_vec / len(words)

    # One-row frame built directly; DataFrame.append was removed in pandas 2.0.
    df_q = pd.DataFrame([word_vec])

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc][:n]

    return indexes, strtype

<h3>   Named Entity Recognition - NER </h3>

In [24]:
# Load the precomputed named-entity weight table (the cells that would build it are commented out below).
# The lookups further down parse element [1] of each entry with ast.literal_eval, so it is expected to hold the text of a {label: weight} dict.
ner = df_to_dict("ner.csv")

In [25]:
# ner = {}

# for sentence in df.iloc[:, 1]:
#     for token in nlp(sentence.lower()).ents:
#         ner[token.text] = {token.label_: None}
        
# for sentence in df_test.iloc[:, 1]:
#     for token in nlp(sentence.lower()).ents:
#         ner[token.text] = {token.label_: None}

In [26]:
# for word, nerTagAndVal in ner.items():
    
#     for tag in nerTagAndVal:
#         if (tag == "LOCATION") or (tag == "ORG") or (tag == "NORP") or (tag == "MONEY") or (tag == "WORK_OF_ART") or (tag == "LAW"):
#             nerTagAndVal[tag] = 1.75
#         elif (tag == "GPE") or (tag == "DATE") or (tag == "PERSON") or (tag == "FAC"): 
#             nerTagAndVal[tag] = 1.5
#         elif (tag == "ORDINAL") or (tag == "CARDINAL") or (tag == "PRODUCT") or (tag == "PERCENT") or (tag == "TIME"):
#             nerTagAndVal[tag] = 1.2
#         else:
#             nerTagAndVal[tag] = 1

In [27]:
def get_idx_ner(type, n, document, model, avg):
    """Rank training documents against a named-entity-weighted query vector.

    Works like ``get_idx``, except each token's embedding is multiplied by a
    weight from the global ``ner`` table. ``ner[token][1]`` is parsed with
    ``ast.literal_eval`` and the first value of the resulting dict is used.
    Tokens whose lower-cased form is not in ``ner`` get weight 1.

    NOTE(review): the training tables built by ``fit_training`` are
    unweighted - confirm that comparing a weighted query against them is intended.

    Parameters
    ----------
    type : callable
        Pairwise measure, called as ``type(query_frame, training_frame)``.
        The name shadows the builtin inside this function.
    n : int
        Number of results to keep.
    document : str
        Query text.
    model : object
        Embedding lookup exposing ``key_to_index`` and ``model[token]``.
    avg : bool
        When True the weighted sum is divided by the total token count.

    Returns
    -------
    tuple
        ``(indexes, strtype)``: the first ``n`` row positions of the training
        table in the order given by ``get_other_vars``, and the measure label.
    """
    words = document.translate(str.maketrans('', '', string.punctuation)).split()
    word_vec = np.zeros((300, ))
    for word in words:
        key = word.lower()
        nerCoeff = 1
        if key in ner:
            nerCoeff = next(iter(ast.literal_eval(ner[key][1]).values()))
        if key in model.key_to_index:
            word_vec += model[key] * nerCoeff
        elif word in model.key_to_index:
            word_vec += model[word] * nerCoeff

    # `== True` is kept so the flag is read exactly as get_other_vars reads it.
    # The `words` guard stops an empty query from becoming a 0/0 NaN vector.
    if avg == True and words:
        word_vec = word_vec / len(words)

    # One-row frame built directly; DataFrame.append was removed in pandas 2.0.
    df_q = pd.DataFrame([word_vec])

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc][:n]

    return indexes, strtype

In [28]:
def get_idx_pos_ner(type, n, document, model, avg):
    """Rank training documents against a query vector weighted by POS and NER.

    Works like ``get_idx``, except each token's embedding is multiplied by the
    product of its weights from the global ``ner`` and ``pos`` tables. Each
    weight is the first value of the dict parsed from ``table[token][1]`` with
    ``ast.literal_eval``, or 1 when the lower-cased token has no entry.

    NOTE(review): the training tables built by ``fit_training`` are
    unweighted - confirm that comparing a weighted query against them is intended.

    Parameters
    ----------
    type : callable
        Pairwise measure, called as ``type(query_frame, training_frame)``.
        The name shadows the builtin inside this function.
    n : int
        Number of results to keep.
    document : str
        Query text.
    model : object
        Embedding lookup exposing ``key_to_index`` and ``model[token]``.
    avg : bool
        When True the weighted sum is divided by the total token count.

    Returns
    -------
    tuple
        ``(indexes, strtype)``: the first ``n`` row positions of the training
        table in the order given by ``get_other_vars``, and the measure label.
    """
    words = document.translate(str.maketrans('', '', string.punctuation)).split()
    word_vec = np.zeros((300, ))
    for word in words:
        key = word.lower()
        posCoeff = 1
        nerCoeff = 1
        if key in ner:
            nerCoeff = next(iter(ast.literal_eval(ner[key][1]).values()))
        if key in pos:
            posCoeff = next(iter(ast.literal_eval(pos[key][1]).values()))
        if key in model.key_to_index:
            word_vec += model[key] * nerCoeff * posCoeff
        elif word in model.key_to_index:
            word_vec += model[word] * nerCoeff * posCoeff

    # `== True` is kept so the flag is read exactly as get_other_vars reads it.
    # The `words` guard stops an empty query from becoming a 0/0 NaN vector.
    if avg == True and words:
        word_vec = word_vec / len(words)

    # One-row frame built directly; DataFrame.append was removed in pandas 2.0.
    df_q = pd.DataFrame([word_vec])

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc][:n]

    return indexes, strtype

In [29]:
def get_idx_pos_idf(type, n, document, model, avg):
    """Rank training documents against a query vector weighted by POS and IDF.

    Works like ``get_idx``, except each token's embedding is multiplied by its
    POS weight (first value of the dict parsed from ``pos[token][1]`` with
    ``ast.literal_eval``) and its IDF weight (``idf[token][1]``). A weight is
    1 when the lower-cased token has no entry in the corresponding table.

    NOTE(review): the training tables built by ``fit_training`` are
    unweighted - confirm that comparing a weighted query against them is intended.

    Parameters
    ----------
    type : callable
        Pairwise measure, called as ``type(query_frame, training_frame)``.
        The name shadows the builtin inside this function.
    n : int
        Number of results to keep.
    document : str
        Query text.
    model : object
        Embedding lookup exposing ``key_to_index`` and ``model[token]``.
    avg : bool
        When True the weighted sum is divided by the total token count.

    Returns
    -------
    tuple
        ``(indexes, strtype)``: the first ``n`` row positions of the training
        table in the order given by ``get_other_vars``, and the measure label.
    """
    words = document.translate(str.maketrans('', '', string.punctuation)).split()
    word_vec = np.zeros((300, ))
    for word in words:
        key = word.lower()
        idfCoeff = 1
        posCoeff = 1
        if key in pos:
            posCoeff = next(iter(ast.literal_eval(pos[key][1]).values()))
        if key in idf:
            idfCoeff = idf[key][1]
        if key in model.key_to_index:
            word_vec += model[key] * posCoeff * idfCoeff
        elif word in model.key_to_index:
            word_vec += model[word] * posCoeff * idfCoeff

    # `== True` is kept so the flag is read exactly as get_other_vars reads it.
    # The `words` guard stops an empty query from becoming a 0/0 NaN vector.
    if avg == True and words:
        word_vec = word_vec / len(words)

    # One-row frame built directly; DataFrame.append was removed in pandas 2.0.
    df_q = pd.DataFrame([word_vec])

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc][:n]

    return indexes, strtype

In [30]:
def get_idx_ner_idf(type, n, document, model, avg):
    """Rank training documents against a query vector weighted by NER and IDF.

    Works like ``get_idx``, except each token's embedding is multiplied by its
    NER weight (first value of the dict parsed from ``ner[token][1]`` with
    ``ast.literal_eval``) and its IDF weight (``idf[token][1]``). A weight is
    1 when the lower-cased token has no entry in the corresponding table.

    NOTE(review): the training tables built by ``fit_training`` are
    unweighted - confirm that comparing a weighted query against them is intended.

    Parameters
    ----------
    type : callable
        Pairwise measure, called as ``type(query_frame, training_frame)``.
        The name shadows the builtin inside this function.
    n : int
        Number of results to keep.
    document : str
        Query text.
    model : object
        Embedding lookup exposing ``key_to_index`` and ``model[token]``.
    avg : bool
        When True the weighted sum is divided by the total token count.

    Returns
    -------
    tuple
        ``(indexes, strtype)``: the first ``n`` row positions of the training
        table in the order given by ``get_other_vars``, and the measure label.
    """
    words = document.translate(str.maketrans('', '', string.punctuation)).split()
    word_vec = np.zeros((300, ))
    for word in words:
        key = word.lower()
        idfCoeff = 1
        nerCoeff = 1
        if key in ner:
            nerCoeff = next(iter(ast.literal_eval(ner[key][1]).values()))
        if key in idf:
            idfCoeff = idf[key][1]
        if key in model.key_to_index:
            word_vec += model[key] * nerCoeff * idfCoeff
        elif word in model.key_to_index:
            word_vec += model[word] * nerCoeff * idfCoeff

    # `== True` is kept so the flag is read exactly as get_other_vars reads it.
    # The `words` guard stops an empty query from becoming a 0/0 NaN vector.
    if avg == True and words:
        word_vec = word_vec / len(words)

    # One-row frame built directly; DataFrame.append was removed in pandas 2.0.
    df_q = pd.DataFrame([word_vec])

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc][:n]

    return indexes, strtype

<h3> The Experiment </h3>

In [31]:
# Options for the experiment. NOTE(review): the loop that consumes these lists is not among the cells shown here.
wordEmbs = [word2vec, glove]
# Retrieval variants: unweighted, single weighting (IDF / POS / NER), and the pairwise combinations.
functions = [get_idx, get_idx_idf, get_idx_pos, get_idx_ner, 
             get_idx_pos_ner, get_idx_pos_idf, get_idx_ner_idf]
# True = averaged vectors, False = summed vectors.
avgs = [True, False]
# Only cosine similarity is listed, although euclidean and manhattan distances are imported as well.
measures = [cosine_similarity]


In [38]:
# Build the Word2Vec training tables; get_other_vars reads them back through the global names df_sum / df_avg.
df_sum, df_avg = fit_training(word2vec)

id                                                174995.json
document                  united states court of appeals  ...
Name: 0, dtype: object
id                                                174996.json
document              united states court of appeals      ...
Name: 1, dtype: object
id                                                175074.json
document                 united states court of appeals   ...
Name: 2, dtype: object
id                                                175075.json
document              united states court of appeals      ...
Name: 3, dtype: object
id                                                175076.json
document              united states court of appeals      ...
Name: 4, dtype: object
id                                                175121.json
document              united states court of appeals      ...
Name: 5, dtype: object
id                                                194833.json
document                      united states court of app

id                                                194892.json
document                      united states court of appea...
Name: 65, dtype: object
id                                                194893.json
document    august 26, 1993   united states court of appea...
Name: 66, dtype: object
id                                                194894.json
document    august 26, 1993   united states court of appea...
Name: 67, dtype: object
id                                                194895.json
document    august 26, 1993   united states court of appea...
Name: 68, dtype: object
id                                                194896.json
document                    united states court of appeals...
Name: 69, dtype: object
id                                                194897.json
document                      united states court of appea...
Name: 70, dtype: object
id                                                194898.json
document                      united states court 

id                                                194952.json
document    <p>usage: wpd2html [option] <wordperfect docum...
Name: 125, dtype: object
id                                                194953.json
document      september 28, 1993                     [not ...
Name: 126, dtype: object
id                                                194954.json
document      september 28, 1993                   united ...
Name: 127, dtype: object
id                                                194955.json
document    september 28, 1993    [not for publication]   ...
Name: 128, dtype: object
id                                                194956.json
document                      united states court of appea...
Name: 129, dtype: object
id                                                194957.json
document                      united states court of appea...
Name: 130, dtype: object
id                                                194958.json
document    s   e   p   t   e   m   b   e   

id                                                195008.json
document    november 5, 1993      [not for publication]   ...
Name: 181, dtype: object
id                                                195009.json
document    november 5, 1993                     [not for ...
Name: 182, dtype: object
id                                                195010.json
document                      united states court of appea...
Name: 183, dtype: object
id                                                195378.json
document    may 20, 1994          [not for publication]   ...
Name: 184, dtype: object
id                                                195379.json
document                      united states court of appea...
Name: 185, dtype: object
id                                                195380.json
document                    united states court of appeals...
Name: 186, dtype: object
id                                                195381.json
document    may 23, 1994          [not for p

id                                                195434.json
document    june 20, 1994                     [not for pub...
Name: 240, dtype: object
id                                                195435.json
document                   united states court of appeals ...
Name: 241, dtype: object
id                                                195436.json
document    june 16, 1994     united states court of appea...
Name: 242, dtype: object
id                                                195437.json
document    june 17, 1994         [not for publication]   ...
Name: 243, dtype: object
id                                                195438.json
document    june 21, 1994     united states court of appea...
Name: 244, dtype: object
id                                                195439.json
document    june 21, 1994     united states court of appea...
Name: 245, dtype: object
id                                                195440.json
document                      united states 

id                                                195494.json
document    july 15, 1994     united states court of appea...
Name: 300, dtype: object
id                                                195495.json
document                    united states court of appeals...
Name: 301, dtype: object
id                                                195496.json
document                      united states court of appea...
Name: 302, dtype: object
id                                                195497.json
document                    united states court of appeals...
Name: 303, dtype: object
id                                                195498.json
document    july 18, 1994     united states court of appea...
Name: 304, dtype: object
id                                                195499.json
document                    united states court of appeals...
Name: 305, dtype: object
id                                                195500.json
document                      united states 

id                                                195552.json
document                            [not for publication] ...
Name: 358, dtype: object
id                                                195553.json
document    august 17, 1994                 united states ...
Name: 359, dtype: object
id                                                195554.json
document                    united states court of appeals...
Name: 360, dtype: object
id                                                195555.json
document    august 18, 1994                     [not for p...
Name: 361, dtype: object
id                                                195556.json
document    august 19, 1994                     [not for p...
Name: 362, dtype: object
id                                                195557.json
document    august 23, 1994                      [not for ...
Name: 363, dtype: object
id                                                195558.json
document    august 23, 1994   united states 

id                                                195607.json
document                      united states court of appea...
Name: 413, dtype: object
id                                                195608.json
document                      united states court of appea...
Name: 414, dtype: object
id                                                195609.json
document    september 19, 1994    [not for publication]   ...
Name: 415, dtype: object
id                                                195610.json
document    september 21, 1994                 united stat...
Name: 416, dtype: object
id                                                195611.json
document    september 21, 1994                 united stat...
Name: 417, dtype: object
id                                                195612.json
document    s   e   p   t   e   m   b   e   r     2   2   ...
Name: 418, dtype: object
id                                                195613.json
document                    united states co

id                                                195677.json
document                    united states court of appeals...
Name: 483, dtype: object
id                                                195678.json
document    october 20, 1994      [not for publication]   ...
Name: 484, dtype: object
id                                                195679.json
document    october 20, 1994                     [not for ...
Name: 485, dtype: object
id                                                195680.json
document    october 19, 1994                     [not for ...
Name: 486, dtype: object
id                                                195681.json
document    october 20, 1994      [not for publication]   ...
Name: 487, dtype: object
id                                                195682.json
document    october 19, 1994                     [not for ...
Name: 488, dtype: object
id                                                195683.json
document    october 20, 1994                

id                                                195748.json
document                        united states court of app...
Name: 554, dtype: object
id                                                195749.json
document                      united states court of appea...
Name: 555, dtype: object
id                                                195750.json
document     november 23, 1994       [not for publication]...
Name: 556, dtype: object
id                                                195751.json
document      november 25, 1994                     [not f...
Name: 557, dtype: object
id                                                195752.json
document                        united states court of app...
Name: 558, dtype: object
id                                                195753.json
document                        united states court of app...
Name: 559, dtype: object
id                                                195754.json
document                          [not for p

id                                                195810.json
document      december 29, 1994                     [not f...
Name: 616, dtype: object
id                                                195811.json
document                        united states court of app...
Name: 617, dtype: object
id                                                195812.json
document      december 29, 1994     [not for publication]\...
Name: 618, dtype: object
id                                                195813.json
document                      united states court of appea...
Name: 619, dtype: object
id                                                195814.json
document                      united states court of appea...
Name: 620, dtype: object
id                                                195815.json
document      december 30, 1994                     [not f...
Name: 621, dtype: object
id                                                195816.json
document      december 30. 1994             

id                                                195878.json
document      january 31, 1995                     [not fo...
Name: 684, dtype: object
id                                                195879.json
document      january 31, 1995                     [not fo...
Name: 685, dtype: object
id                                                195880.json
document                        united states court of app...
Name: 686, dtype: object
id                                                195881.json
document      february 22, 1995                  united st...
Name: 687, dtype: object
id                                                195882.json
document      february 1, 1995      [not for publication] ...
Name: 688, dtype: object
id                                                195883.json
document                        united states court of app...
Name: 689, dtype: object
id                                                195884.json
document                      united states 

id                                                195945.json
document                        united states court of app...
Name: 751, dtype: object
id                                                195946.json
document      may 5, 1995                     [not for pub...
Name: 752, dtype: object
id                                                195947.json
document                        united states court of app...
Name: 753, dtype: object
id                                                195948.json
document      march 9, 1995                 united states ...
Name: 754, dtype: object
id                                                195949.json
document                        united states court of app...
Name: 755, dtype: object
id                                                195950.json
document      march 6, 1995         [not for publication] ...
Name: 756, dtype: object
id                                                195951.json
document                        united state

id                                                196020.json
document                         [not for publication]    ...
Name: 826, dtype: object
id                                                196021.json
document      april 18, 1995        [not for publication]\...
Name: 827, dtype: object
id                                                196022.json
document                      united states court of appea...
Name: 828, dtype: object
id                                                196023.json
document      april 19, 1995                       [not fo...
Name: 829, dtype: object
id                                                196024.json
document                        united states court of app...
Name: 830, dtype: object
id                                                196025.json
document                        united states court of app...
Name: 831, dtype: object
id                                                196026.json
document      april 20, 1995          [not f

id                                                196080.json
document                      united states court of appea...
Name: 886, dtype: object
id                                                196081.json
document            united states court of appeals\r      ...
Name: 887, dtype: object
id                                                196082.json
document      may 23, 1995 [not for publication] united st...
Name: 888, dtype: object
id                                                196083.json
document      june 7, 1995      united states court of app...
Name: 889, dtype: object
id                                                196084.json
document      may 24, 1995            [not for publication...
Name: 890, dtype: object
id                                                196085.json
document      may 24, 1995          [not for publication] ...
Name: 891, dtype: object
id                                                196086.json
document      may 24, 1995                  

id                                                196157.json
document      june 19, 1995\r                     [not for...
Name: 948, dtype: object
id                                                196158.json
document      june 20, 1995\r                     [not for...
Name: 949, dtype: object
id                                                196159.json
document                      united states court of appea...
Name: 950, dtype: object
id                                                196160.json
document                        united states court of app...
Name: 951, dtype: object
id                                                196161.json
document                        united states court of app...
Name: 952, dtype: object
id                                                196162.json
document      june 23, 1995         [not for publication]\...
Name: 953, dtype: object
id                                                196163.json
document      june 27, 1995\r               

id                                                196255.json
document     august 23, 1995         [not for publication]...
Name: 1016, dtype: object
id                                                196256.json
document                        united states court of app...
Name: 1017, dtype: object
id                                                196257.json
document      august 23, 1995\r                     [not f...
Name: 1018, dtype: object
id                                                196258.json
document                      united states court of appea...
Name: 1019, dtype: object
id                                                196259.json
document      august 25, 1995\r                     [not f...
Name: 1020, dtype: object
id                                                196260.json
document      august 25, 1995         [not for publication...
Name: 1021, dtype: object
id                                                196261.json
document      august 25, 1995       [n

id                                                196319.json
document      september 26, 1995\r                     [no...
Name: 1080, dtype: object
id                                                196320.json
document      september 27, 1995\r                  united...
Name: 1081, dtype: object
id                                                196321.json
document      september 27, 1995\r                   unite...
Name: 1082, dtype: object
id                                                196322.json
document                      united states court of appea...
Name: 1083, dtype: object
id                                                196323.json
document     september 28, 1995      [not for publication]...
Name: 1084, dtype: object
id                                                196324.json
document                      united states court of appea...
Name: 1085, dtype: object
id                                                196325.json
document      september 29, 1995      

id                                                196389.json
document      november 20, 1995\r                 united s...
Name: 1150, dtype: object
id                                                196390.json
document     november 8, 1995      [not for publication]\r...
Name: 1151, dtype: object
id                                                196391.json
document     november 8, 1995        [not for publication]...
Name: 1152, dtype: object
id                                                196392.json
document      november 9, 1995      [not for publication]\...
Name: 1153, dtype: object
id                                                196393.json
document      november 17, 1995     [not for publication]\...
Name: 1154, dtype: object
id                                                196394.json
document      november 9, 1995      [not for publication]\...
Name: 1155, dtype: object
id                                                196395.json
document      november 9, 1995        

id                                                196449.json
document     december 4, 1995        [not for publication]...
Name: 1210, dtype: object
id                                                196450.json
document                        united states court of app...
Name: 1211, dtype: object
id                                                196451.json
document                        united states court of app...
Name: 1212, dtype: object
id                                                196452.json
document                        united states court of app...
Name: 1213, dtype: object
id                                                196453.json
document     december 5, 1995        [not for publication]...
Name: 1214, dtype: object
id                                                196454.json
document      december 6, 1995      [not for publication]\...
Name: 1215, dtype: object
id                                                196455.json
document                        united

id                                                196504.json
document                        united states court of app...
Name: 1265, dtype: object
id                                                196505.json
document     january 2, 1996         [not for publication]...
Name: 1266, dtype: object
id                                                196506.json
document      february 8, 1996  united states court of app...
Name: 1267, dtype: object
id                                                196507.json
document                      united states court of appea...
Name: 1268, dtype: object
id                                                196508.json
document                        united states court of app...
Name: 1269, dtype: object
id                                                196509.json
document                      united states court of appea...
Name: 1270, dtype: object
id                                                196510.json
document     january 9, 1996         [

id                                                196571.json
document      march 8, 1996     united states court of app...
Name: 1332, dtype: object
id                                                196572.json
document                        united states court of app...
Name: 1333, dtype: object
id                                                196573.json
document      february 13, 1996     [not for publication]\...
Name: 1334, dtype: object
id                                                196574.json
document      february 13, 1996       [not for publication...
Name: 1335, dtype: object
id                                                196575.json
document     february 13, 1996       [not for publication]...
Name: 1336, dtype: object
id                                                196576.json
document                        united states court of app...
Name: 1337, dtype: object
id                                                196577.json
document      february 14, 1996\r     

id                                                198337.json
document     [not for publication--not to be cited as prec...
Name: 1393, dtype: object
id                                                198338.json
document    [not for publication--not to be cited as prece...
Name: 1394, dtype: object
id                                                198339.json
document                 united states court of appeals\r ...
Name: 1395, dtype: object


In [39]:
# Run fit_training (defined outside this excerpt) with the GloVe model and unpack
# the pair it returns. Judging by the target names these are presumably the
# sum-pooled and average-pooled training representations -- TODO confirm.
# NOTE(review): the long per-row dump printed below this cell looks like debug
# output from fit_training; consider silencing it to keep the notebook readable.
df_sum_glove, df_avg_glove = fit_training(glove)

id                                                174995.json
document                  united states court of appeals  ...
Name: 0, dtype: object
id                                                174996.json
document              united states court of appeals      ...
Name: 1, dtype: object
id                                                175074.json
document                 united states court of appeals   ...
Name: 2, dtype: object
id                                                175075.json
document              united states court of appeals      ...
Name: 3, dtype: object
id                                                175076.json
document              united states court of appeals      ...
Name: 4, dtype: object
id                                                175121.json
document              united states court of appeals      ...
Name: 5, dtype: object
id                                                194833.json
document                      united states court of app

id          194891.json
document               
Name: 64, dtype: object
id                                                194892.json
document                      united states court of appea...
Name: 65, dtype: object
id                                                194893.json
document    august 26, 1993   united states court of appea...
Name: 66, dtype: object
id                                                194894.json
document    august 26, 1993   united states court of appea...
Name: 67, dtype: object
id                                                194895.json
document    august 26, 1993   united states court of appea...
Name: 68, dtype: object
id                                                194896.json
document                    united states court of appeals...
Name: 69, dtype: object
id                                                194897.json
document                      united states court of appea...
Name: 70, dtype: object
id                                      

id                                                194960.json
document    september 30, 1993      [not for publication] ...
Name: 133, dtype: object
id                                                194961.json
document                    united states court of appeals...
Name: 134, dtype: object
id                                                194962.json
document    october 4, 1993   united states court of appea...
Name: 135, dtype: object
id                                                194963.json
document    october 4, 1993   united states court of appea...
Name: 136, dtype: object
id                                                194964.json
document                      united states court of appea...
Name: 137, dtype: object
id                                                194965.json
document                    united states court of appeals...
Name: 138, dtype: object
id                                                194966.json
document                    united states co

id                                                195387.json
document    may 27, 1994            [not for publication] ...
Name: 193, dtype: object
id                                                195388.json
document    may 27, 1994                     [not for publ...
Name: 194, dtype: object
id                                                195389.json
document                      united states court of appea...
Name: 195, dtype: object
id                                                195390.json
document    may 27, 1994            [not for publication] ...
Name: 196, dtype: object
id                                                195391.json
document                    united states court of appeals...
Name: 197, dtype: object
id                                                195392.json
document    may 31, 1994                     [not for publ...
Name: 198, dtype: object
id                                                195393.json
document                      united states 

id                                                195442.json
document    june 22, 1994         [not for publication]  u...
Name: 248, dtype: object
id                                                195443.json
document                      united states court of appea...
Name: 249, dtype: object
id                                                195444.json
document                    united states court of appeals...
Name: 250, dtype: object
id                                                195445.json
document                      united states court of appea...
Name: 251, dtype: object
id                                                195446.json
document     [for copies of opinion with appendix, contact...
Name: 252, dtype: object
id                                                195447.json
document    june 23, 1994     united states court of appea...
Name: 253, dtype: object
id                                                195448.json
document    june 27, 1994                   

id                                                195503.json
document    july 19, 1994         [not for publication]   ...
Name: 309, dtype: object
id                                                195504.json
document                      united states court of appea...
Name: 310, dtype: object
id                                                195505.json
document                      united states court of appea...
Name: 311, dtype: object
id                                                195506.json
document                    united states court of appeals...
Name: 312, dtype: object
id                                                195507.json
document                      united states court of appea...
Name: 313, dtype: object
id                                                195508.json
document                      united states court of appea...
Name: 314, dtype: object
id                                                195509.json
document                        [not for pub

id                                                195574.json
document                          [not for publication]   ...
Name: 380, dtype: object
id                                                195575.json
document                        [not for publication]     ...
Name: 381, dtype: object
id                                                195576.json
document                        [not for publication]     ...
Name: 382, dtype: object
id                                                195577.json
document                    united states court of appeals...
Name: 383, dtype: object
id                                                195578.json
document                        [not for publication]     ...
Name: 384, dtype: object
id                                                195579.json
document                      united states court of appea...
Name: 385, dtype: object
id                                                195580.json
document                      united states 

id                                                195636.json
document    september 29, 1994                     [not fo...
Name: 442, dtype: object
id                                                195637.json
document                    united states court of appeals...
Name: 443, dtype: object
id                                                195638.json
document    september 29, 1994                     [not fo...
Name: 444, dtype: object
id                                                195639.json
document                      united states court of appea...
Name: 445, dtype: object
id                                                195640.json
document    september 30, 1994      [not for publication] ...
Name: 446, dtype: object
id                                                195641.json
document                    united states court of appeals...
Name: 447, dtype: object
id                                                195642.json
document    october 3, 1994         [not for

id                                                195702.json
document      october 27, 1994                       [not ...
Name: 508, dtype: object
id                                                195703.json
document      november 25, 1994 united states court of app...
Name: 509, dtype: object
id                                                195704.json
document                        united states court of app...
Name: 510, dtype: object
id                                                195705.json
document      october 28, 1994                     [not fo...
Name: 511, dtype: object
id                                                195706.json
document      november 4, 1994      [not for publication] ...
Name: 512, dtype: object
id                                                195707.json
document                      united states court of appea...
Name: 513, dtype: object
id                                                195708.json
document                        united state

id                                                195762.json
document      december 15, 1994                  united st...
Name: 568, dtype: object
id                                                195763.json
document     december 6, 1994      [not for publication]  ...
Name: 569, dtype: object
id                                                195764.json
document                        united states court of app...
Name: 570, dtype: object
id                                                195765.json
document     december 6, 199423    [not for publication]  ...
Name: 571, dtype: object
id                                                195766.json
document                        united states court of app...
Name: 572, dtype: object
id                                                195767.json
document                        united states court of app...
Name: 573, dtype: object
id                                                195768.json
document                      united states 

id                                                195820.json
document      january 4, 1995       [not for publication] ...
Name: 626, dtype: object
id                                                195821.json
document      january 5, 1995                     [not for...
Name: 627, dtype: object
id                                                195822.json
document                        united states court of app...
Name: 628, dtype: object
id                                                195823.json
document                        united states court of app...
Name: 629, dtype: object
id                                                195824.json
document      january 5, 1995                     [not for...
Name: 630, dtype: object
id                                                195825.json
document                        united states court of app...
Name: 631, dtype: object
id                                                195826.json
document     january 5, 1995         [not fo

id                                                195889.json
document                        united states court of app...
Name: 695, dtype: object
id                                                195890.json
document      february 3, 1995                     [not fo...
Name: 696, dtype: object
id                                                195891.json
document      february 3, 1995                     [not fo...
Name: 697, dtype: object
id                                                195892.json
document      february 3, 1995                     [not fo...
Name: 698, dtype: object
id                                                195893.json
document      february 7, 1995                     [not fo...
Name: 699, dtype: object
id                                                195894.json
document                            [not for publication] ...
Name: 700, dtype: object
id                                                195895.json
document      afebruary 7, 1995             

id                                                195952.json
document      april 19, 1995                 united states...
Name: 758, dtype: object
id                                                195953.json
document      march 9, 1995                     [not for p...
Name: 759, dtype: object
id                                                195954.json
document      march 10, 1995                     [not for ...
Name: 760, dtype: object
id                                                195955.json
document                      united states court of appea...
Name: 761, dtype: object
id                                                195956.json
document      march 21, 1995                 united states...
Name: 762, dtype: object
id                                                195957.json
document      march 13, 1995        [not for publication] ...
Name: 763, dtype: object
id                                                195958.json
document      march 13, 1995                

id                                                196021.json
document      april 18, 1995        [not for publication]\...
Name: 827, dtype: object
id                                                196022.json
document                      united states court of appea...
Name: 828, dtype: object
id                                                196023.json
document      april 19, 1995                       [not fo...
Name: 829, dtype: object
id                                                196024.json
document                        united states court of app...
Name: 830, dtype: object
id                                                196025.json
document                        united states court of app...
Name: 831, dtype: object
id                                                196026.json
document      april 20, 1995          [not for publication...
Name: 832, dtype: object
id                                                196027.json
document                        united state

id                                                196096.json
document      july 7, 1995      united states court of app...
Name: 899, dtype: object
id                                                196109.json
document                        united states court of app...
Name: 900, dtype: object
id                                                196110.json
document                      united states court of appea...
Name: 901, dtype: object
id                                                196111.json
document                        united states court of app...
Name: 902, dtype: object
id                                                196112.json
document      june 7, 1995      united states court of app...
Name: 903, dtype: object
id                                                196113.json
document                        united states court of app...
Name: 904, dtype: object
id                                                196114.json
document                        united state

id                                                196172.json
document      july 5, 1995          [not for publication]\...
Name: 963, dtype: object
id                                                196173.json
document      july 5, 1995\r                     [not for ...
Name: 964, dtype: object
id                                                196174.json
document      july 6, 1995          [not for publication]\...
Name: 965, dtype: object
id                                                196175.json
document      july 6, 1995\r                     [not for ...
Name: 966, dtype: object
id                                                196176.json
document      july 6, 1995          [not for publication]\...
Name: 967, dtype: object
id                                                196177.json
document      july 14, 1995\r                 united state...
Name: 968, dtype: object
id                                                196178.json
document      july 12, 1995\r               

id                                                196257.json
document      august 23, 1995\r                     [not f...
Name: 1018, dtype: object
id                                                196258.json
document                      united states court of appea...
Name: 1019, dtype: object
id                                                196259.json
document      august 25, 1995\r                     [not f...
Name: 1020, dtype: object
id                                                196260.json
document      august 25, 1995         [not for publication...
Name: 1021, dtype: object
id                                                196261.json
document      august 25, 1995       [not for publication]\...
Name: 1022, dtype: object
id                                                196262.json
document      august 25, 1995       [not for publication]\...
Name: 1023, dtype: object
id                                                196263.json
document      october 3, 1995   united

id                                                196322.json
document                      united states court of appea...
Name: 1083, dtype: object
id                                                196323.json
document     september 28, 1995      [not for publication]...
Name: 1084, dtype: object
id                                                196324.json
document                      united states court of appea...
Name: 1085, dtype: object
id                                                196325.json
document      september 29, 1995      [not for publication...
Name: 1086, dtype: object
id                                                196326.json
document                        united states court of app...
Name: 1087, dtype: object
id                                                196327.json
document                      united states court of appea...
Name: 1088, dtype: object
id                                                196328.json
document                        united

id                                                196390.json
document     november 8, 1995      [not for publication]\r...
Name: 1151, dtype: object
id                                                196391.json
document     november 8, 1995        [not for publication]...
Name: 1152, dtype: object
id                                                196392.json
document      november 9, 1995      [not for publication]\...
Name: 1153, dtype: object
id                                                196393.json
document      november 17, 1995     [not for publication]\...
Name: 1154, dtype: object
id                                                196394.json
document      november 9, 1995      [not for publication]\...
Name: 1155, dtype: object
id                                                196395.json
document      november 9, 1995        [not for publication...
Name: 1156, dtype: object
id                                                196396.json
document                      united s

id                                                196447.json
document                      united states court of appea...
Name: 1208, dtype: object
id                                                196448.json
document     december 4, 1995        [not for publication]...
Name: 1209, dtype: object
id                                                196449.json
document     december 4, 1995        [not for publication]...
Name: 1210, dtype: object
id                                                196450.json
document                        united states court of app...
Name: 1211, dtype: object
id                                                196451.json
document                        united states court of app...
Name: 1212, dtype: object
id                                                196452.json
document                        united states court of app...
Name: 1213, dtype: object
id                                                196453.json
document     december 5, 1995        [

id                                                196504.json
document                        united states court of app...
Name: 1265, dtype: object
id                                                196505.json
document     january 2, 1996         [not for publication]...
Name: 1266, dtype: object
id                                                196506.json
document      february 8, 1996  united states court of app...
Name: 1267, dtype: object
id                                                196507.json
document                      united states court of appea...
Name: 1268, dtype: object
id                                                196508.json
document                        united states court of app...
Name: 1269, dtype: object
id                                                196509.json
document                      united states court of appea...
Name: 1270, dtype: object
id                                                196510.json
document     january 9, 1996         [

id                                                196568.json
document                        united states court of app...
Name: 1329, dtype: object
id                                                196569.json
document                    united states court of appeals...
Name: 1330, dtype: object
id                                                196570.json
document                      united states court of appea...
Name: 1331, dtype: object
id                                                196571.json
document      march 8, 1996     united states court of app...
Name: 1332, dtype: object
id                                                196572.json
document                        united states court of app...
Name: 1333, dtype: object
id                                                196573.json
document      february 13, 1996     [not for publication]\...
Name: 1334, dtype: object
id                                                196574.json
document      february 13, 1996       

id                                                198331.json
document                 united states court of appeals\r ...
Name: 1387, dtype: object
id                                                198332.json
document                united states court of appeals\r  ...
Name: 1388, dtype: object
id                                                198333.json
document                    united states court of appeals...
Name: 1389, dtype: object
id                                                198334.json
document                united states court of appeals\r  ...
Name: 1390, dtype: object
id                                                198335.json
document                 united states court of appeals\r ...
Name: 1391, dtype: object
id                                                198336.json
document                united states court of appeals\r  ...
Name: 1392, dtype: object
id                                                198337.json
document     [not for publication--not

In [40]:
def get_rank(type, n, model, avg, function, test_df=None):
    """Run a ranking function over every test document and collect the results.

    Parameters
    ----------
    type : callable
        Similarity / distance measure, forwarded unchanged to ``function``
        (the notebook passes e.g. ``cosine_similarity``). The name shadows the
        ``type`` builtin but is kept so existing callers keep working.
    n : int
        Forwarded unchanged to ``function`` (presumably the number of hits to
        keep per document -- TODO confirm against the ranking functions).
    model : object
        Word-embedding model, forwarded unchanged to ``function``.
    avg : bool
        Forwarded unchanged to ``function`` (presumably selects averaged vs.
        summed vectors -- TODO confirm).
    function : callable
        Called as ``function(type, n, document, model, avg)`` for each
        document; must return a 2-item sequence whose first item is stored in
        the ``indexes`` column. The second item is ignored.
    test_df : pandas.DataFrame, optional
        Frame to iterate over: column 0 is used as the identifier and column 1
        as the document text. Defaults to the notebook-level ``df_test``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``verdict`` (value of column 0 of
        the test frame) and ``indexes`` (first item returned by ``function``).
    """
    if test_df is None:
        # Fall back to the notebook-level test set, as the original did.
        test_df = df_test

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    records = []
    for i, document in enumerate(test_df.iloc[:, 1]):
        indexes, _ = function(type, n, document, model, avg)
        records.append({"verdict": test_df.iloc[i, 0], "indexes": indexes})

    # Explicit columns keep the result shape stable even when there are no rows.
    return pd.DataFrame(records, columns=["verdict", "indexes"])

In [41]:
# word2vec model, avg=False, get_idx ranking -> results/w2v_sum.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=word2vec,
    avg=False,
    function=get_idx,
)
results.to_csv(path_or_buf="results/w2v_sum.csv", sep="\t")

KeyboardInterrupt: 

In [None]:
# word2vec model, avg=False, get_idx_idf ranking -> results/w2v_sum_idf.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=word2vec,
    avg=False,
    function=get_idx_idf,
)
results.to_csv(path_or_buf="results/w2v_sum_idf.csv", sep="\t")

In [None]:
# word2vec model, avg=False, get_idx_pos ranking -> results/w2v_sum_pos.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=word2vec,
    avg=False,
    function=get_idx_pos,
)
results.to_csv(path_or_buf="results/w2v_sum_pos.csv", sep="\t")

In [None]:
# word2vec model, avg=False, get_idx_ner ranking -> results/w2v_sum_ner.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=word2vec,
    avg=False,
    function=get_idx_ner,
)
results.to_csv(path_or_buf="results/w2v_sum_ner.csv", sep="\t")

In [None]:
# word2vec model, avg=False, get_idx_pos_ner ranking -> results/w2v_sum_pos_ner.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=word2vec,
    avg=False,
    function=get_idx_pos_ner,
)
results.to_csv(path_or_buf="results/w2v_sum_pos_ner.csv", sep="\t")

In [None]:
# word2vec model, avg=False, get_idx_ner_idf ranking -> results/w2v_sum_ner_idf.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=word2vec,
    avg=False,
    function=get_idx_ner_idf,
)
results.to_csv(path_or_buf="results/w2v_sum_ner_idf.csv", sep="\t")

In [None]:
# glove model, avg=False, get_idx ranking -> results/glove_sum.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=glove,
    avg=False,
    function=get_idx,
)
results.to_csv(path_or_buf="results/glove_sum.csv", sep="\t")

In [None]:
# glove model, avg=False, get_idx_idf ranking -> results/glove_sum_idf.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=glove,
    avg=False,
    function=get_idx_idf,
)
results.to_csv(path_or_buf="results/glove_sum_idf.csv", sep="\t")

In [None]:
# glove model, avg=False, get_idx_pos ranking -> results/glove_sum_pos.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=glove,
    avg=False,
    function=get_idx_pos,
)
results.to_csv(path_or_buf="results/glove_sum_pos.csv", sep="\t")

In [None]:
# glove model, avg=False, get_idx_ner ranking -> results/glove_sum_ner.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=glove,
    avg=False,
    function=get_idx_ner,
)
results.to_csv(path_or_buf="results/glove_sum_ner.csv", sep="\t")

In [None]:
# glove model, avg=False, get_idx_pos_ner ranking -> results/glove_sum_pos_ner.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=glove,
    avg=False,
    function=get_idx_pos_ner,
)
results.to_csv(path_or_buf="results/glove_sum_pos_ner.csv", sep="\t")

In [None]:
# glove model, avg=False, get_idx_ner_idf ranking -> results/glove_sum_ner_idf.csv (tab-separated)
results = get_rank(
    cosine_similarity,
    n=100,
    model=glove,
    avg=False,
    function=get_idx_ner_idf,
)
results.to_csv(path_or_buf="results/glove_sum_ner_idf.csv", sep="\t")

In [84]:
def gridSearch():
    i = 0
    name = ""
    for wordEmb in wordEmbs:
        for function in functions:
            for avg in avgs:
                for measure in measures:
                    indexes = get_rank(measure, 100, wordEmb, avg, function)
                    break