<h1> Document Vectorization </h1>

This experiment will be based on Google News pretrained Word2Vec and GloVe. 

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import spacy
import en_core_web_md
import string 
import gensim.downloader
import math
import requests
from sklearn.preprocessing import StandardScaler
import time
import os
import json
import ast

In [2]:
NUM_OF_DOCUMENTS_TRAIN = 1000
NUM_OF_DOCUMENTS_TEST = 500
URL = "https://www.courtlistener.com/api/rest/v3/opinions/"


In [3]:
def get_document(file_name):
    data = ""
    with open(file_name) as json_file:
        data = json.load(json_file)
    return data["plain_text"].replace("\n", " ")


In [4]:
df = pd.DataFrame(columns = ["id", "document"])

i = 0
for file_name in [file for file in os.listdir("data/train/") if file.endswith('.json')]:
    try:
        document = get_document("data/train/" + file_name)
        df.loc[i] = [file_name, document.lower()]
        i += 1
    except Exception as e:
        continue

    

In [5]:
df_test = pd.DataFrame(columns = ["id", "document"])
i = 0
for file_name in [file for file in os.listdir("data/test/") if file.endswith('.json')]:
    try:
        document = get_document("data/test/" + file_name)
        df_test.loc[i] = [file_name, document.lower()]
        i += 1
    except Exception as e:
        continue

In [6]:
df.to_csv("train_pretrained.csv", sep='\t')

In [7]:
df_test.to_csv("test_pretrained.csv", sep='\t')

In [8]:
df

Unnamed: 0,id,document
0,174995.json,united states court of appeals ...
1,174996.json,united states court of appeals ...
2,175074.json,united states court of appeals ...
3,175075.json,united states court of appeals ...
4,175076.json,united states court of appeals ...
...,...,...
1391,198335.json,united states court of appeals\r ...
1392,198336.json,united states court of appeals\r ...
1393,198337.json,[not for publication--not to be cited as prec...
1394,198338.json,[not for publication--not to be cited as prece...


<h2> Google News Word2Vec </h2>

In [9]:
word2vec = gensim.downloader.load('word2vec-google-news-300')

<h2> GloVe </h2>

In [10]:
glove = gensim.downloader.load('glove-wiki-gigaword-300')

In [11]:
def get_other_vars(measure, avg, wordEmb):
    if measure == cosine_similarity:
        strtype, asc = "cosine similarity", -1
    elif measure == euclidean_distances:
        strtype, asc = "euclidean distance", 1
    else:
        strtype, asc = "manhattan distance", 1
        
    if (avg == True) and (wordEmb == word2vec):
        dftrain = df_avg
    elif (avg == True) and (wordEmb == glove):
        dftrain = df_avg_glove
    elif (avg == False) and (wordEmb == glove):
        dftrain = df_sum_glove
    else:
        dftrain = df_sum
    return strtype, asc, dftrain

<h3> Function for fitting training sets for average and sum of vectors </h3>

In [12]:
def fit_training(model):
    df_sum = pd.DataFrame()
    df_avg = pd.DataFrame()
    
    for idx, row in df.iterrows():
        document = row['document'].translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip()
        words = document.split()
        word_vec = np.zeros((300, ))
        for word in words:
            if word.lower() in model.key_to_index :
                word_vec += model[word.lower()]
            elif word in model.key_to_index :
                word_vec += model[word]
        if len(words) > 0:
            word_vec_avg = word_vec / len(words)
        else:
            word_vec_avg = 0
        word_vec = pd.Series(word_vec)
        df_sum = df_sum.append(pd.Series(word_vec), ignore_index=True)
        df_avg = df_avg.append(pd.Series(word_vec_avg), ignore_index=True)
    return df_sum, df_avg

In [13]:
def get_idx(type, n, document, model, avg):
    df_q = pd.DataFrame()
    document = document.translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip()
    words = document.split()
    word_vec = np.zeros((300, ))
    
    for word in words:
        if word.lower() in model.key_to_index:
            word_vec += model[word.lower()]
        elif word in model.key_to_index:
            word_vec += model[word]
            
    if avg == True:
        word_vec = word_vec/len(words)
        
    df_q = df_q.append(pd.Series(word_vec), ignore_index=True)

    strtype, asc, dftrain = get_other_vars(type, avg, model)
    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc]
    indexes = indexes[:n]
    
    return indexes, strtype

In [14]:
def dict_to_df(dict, csv_name):
    df_to_save = pd.DataFrame(dict.items())
    df_to_save.to_csv(csv_name + ".csv", sep='\t')

def df_to_dict(csv_name):
    data_frame = pd.read_csv(csv_name, sep = '\t')
    dict = data_frame.set_index('0').T.to_dict('list')
    return dict

<h3>  Inverse document frequency - IDF </h3>

In [15]:
idf = df_to_dict("idf.csv")

  dict = data_frame.set_index('0').T.to_dict('list')


In [16]:
# idf = {}
# for i in range(len(df.iloc[:, 1])):
#     tokens = df.iloc[i, 1].translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip().split()
#     for w in tokens:
#         try:
#             idf[w.lower()].add(i)
#         except:
#             idf[w.lower()] = {i}

# for key, value in idf2.items():
#     idf[key] = math.log(len(df.iloc[:, 0]) / (len(value)+1))

In [17]:
def get_idx_idf(type, n, document, model, avg):
    df_q = pd.DataFrame()
    document = document.translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip()
    words = document.split()
    word_vec = np.zeros((300, ))
    for word in words:
        idfCoeff = 1
        if word.lower() in idf:
            idfCoeff = idf[word.lower()][1]
        if word.lower() in model.key_to_index:
            word_vec += model[word.lower()]*idfCoeff
        elif word in model.key_to_index:
            word_vec += model[word]*idfCoeff
            
    if avg == True:
        word_vec = word_vec / len(words)
        
    df_q = df_q.append(pd.Series(word_vec), ignore_index=True)

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc]
    indexes = indexes[:n]
    
    return indexes, strtype

<h3>  Part-of-speech - POS </h3>

In [18]:
nlp = spacy.load("en_core_web_md")

In [19]:
pos = df_to_dict("pos.csv")

  dict = data_frame.set_index('0').T.to_dict('list')


In [20]:
# pos = {}

# for sentence in df.iloc[:, 1]:
#     for token in nlp(sentence.lower()):
#         pos[token.text] = {token.pos_ : None}
        
# for sentence in df_test.iloc[:, 1]:
#     for token in nlp(sentence.lower()):
#         pos[token.text] = {token.pos_ : None}

In [21]:
# for word, posTagAndVal in pos.items():
    
#     for tag in posTagAndVal:
#         if tag == "NOUN":
#             posTagAndVal[tag] = 0.7
#         elif tag == "PROPN": 
#             posTagAndVal[tag] = 0.9
#         elif tag == "ADJ":
#             posTagAndVal[tag] = 0.8
#         elif tag == "ADV":
#             posTagAndVal[tag] = 0.7
#         elif tag == "VERB":
#             posTagAndVal[tag] = 0.6
#         elif tag == "SPACE" or tag == "PUNCT":
#             posTagAndVal[tag] = 0
#         else:
#             posTagAndVal[tag] = 0.4

In [22]:
def get_idx_pos(type, n, document, model, avg):
    df_q = pd.DataFrame()
    document = document.translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip()
    words = document.split()
    word_vec = np.zeros((300, ))
    for word in words:
        posCoeff = 1
        if word.lower() in pos:
            posCoeff = next(iter(ast.literal_eval(pos[word.lower()][1]).values()))
        if word.lower() in model.key_to_index:
            word_vec += model[word.lower()]*posCoeff
        elif word in model.key_to_index:
            word_vec += model[word]*posCoeff
            
    if avg == True:
        word_vec = word_vec / len(words)
        
    df_q = df_q.append(pd.Series(word_vec), ignore_index=True)

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc]
    indexes = indexes[:n]
    
    return indexes, strtype

<h3>   Named Entity Recognition - NER </h3>

In [23]:
ner = df_to_dict("ner.csv")

In [24]:
# ner = {}

# for sentence in df.iloc[:, 1]:
#     for token in nlp(sentence.lower()).ents:
#         ner[token.text] = {token.label_: None}
        
# for sentence in df_test.iloc[:, 1]:
#     for token in nlp(sentence.lower()).ents:
#         ner[token.text] = {token.label_: None}

In [25]:
# for word, nerTagAndVal in ner.items():
    
#     for tag in nerTagAndVal:
#         if (tag == "LOCATION") or (tag == "ORG") or (tag == "NORP") or (tag == "MONEY") or (tag == "WORK_OF_ART") or (tag == "LAW"):
#             nerTagAndVal[tag] = 1.75
#         elif (tag == "GPE") or (tag == "DATE") or (tag == "PERSON") or (tag == "FAC"): 
#             nerTagAndVal[tag] = 1.5
#         elif (tag == "ORDINAL") or (tag == "CARDINAL") or (tag == "PRODUCT") or (tag == "PERCENT") or (tag == "TIME"):
#             nerTagAndVal[tag] = 1.2
#         else:
#             nerTagAndVal[tag] = 1

In [26]:
def get_idx_ner(type, n, document, model, avg):
    df_q = pd.DataFrame()
    document = document.translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip()
    words = document.split()
    word_vec = np.zeros((300, ))
    for word in words:
        nerCoeff = 1
        if word.lower() in ner:
            nerCoeff = next(iter(ast.literal_eval(ner[word.lower()][1]).values()))
        if word.lower() in model.key_to_index:
            word_vec += model[word.lower()]*nerCoeff
        elif word in model.key_to_index:
            word_vec += model[word]*nerCoeff
            
    if avg == True:
        word_vec = word_vec / len(words)
        
    df_q = df_q.append(pd.Series(word_vec), ignore_index=True)

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc]
    indexes = indexes[:n]
    
    return indexes, strtype 

In [27]:
def get_idx_pos_ner(type, n, document, model, avg):
    df_q = pd.DataFrame()
    document = document.translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip()
    words = document.split()
    word_vec = np.zeros((300, ))
    for word in words:
        posCoeff = 1
        nerCoeff = 1
        if word.lower() in ner:
            nerCoeff = next(iter(ast.literal_eval(ner[word.lower()][1]).values()))
        if word.lower() in pos:
            posCoeff = next(iter(ast.literal_eval(pos[word.lower()][1]).values()))
        if word.lower() in model.key_to_index:
            word_vec += model[word.lower()]*nerCoeff*posCoeff
        elif word in model.key_to_index:
            word_vec += model[word]*nerCoeff*posCoeff
            
    if avg == True:
        word_vec = word_vec/len(words)
        
    df_q = df_q.append(pd.Series(word_vec), ignore_index=True)

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc]
    indexes = indexes[:n]
    
    return indexes, strtype

In [28]:
def get_idx_pos_idf(type, n, document, model, avg):
    df_q = pd.DataFrame()
    document = document.translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip()
    words = document.split()
    word_vec = np.zeros((300, ))
    for word in words:
        idfCoeff = 1
        posCoeff = 1
        if word.lower() in pos:
            posCoeff = next(iter(ast.literal_eval(pos[word.lower()][1]).values()))
        if word.lower() in idf:
            idfCoeff = idf[word.lower()][1]
        if word.lower() in model.key_to_index:
            word_vec += model[word.lower()]*posCoeff*idfCoeff
        elif word in model.key_to_index:
            word_vec += model[word]*posCoeff*idfCoeff
            
    if avg == True:
        word_vec = word_vec/len(words)
        
    df_q = df_q.append(pd.Series(word_vec), ignore_index=True)

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc]
    indexes = indexes[:n]
    
    return indexes, strtype

In [29]:
def get_idx_ner_idf(type, n, document, model, avg):
    df_q = pd.DataFrame()
    document = document.translate(str.maketrans('', '', string.punctuation)).lstrip().rstrip()
    words = document.split()
    word_vec = np.zeros((300, ))
    for word in words:
        idfCoeff = 1
        nerCoeff = 1
        if word.lower() in ner:
            nerCoeff = next(iter(ast.literal_eval(ner[word.lower()][1]).values()))
        if word.lower() in idf:
            idfCoeff = idf[word.lower()][1]
        if word.lower() in model.key_to_index:
            word_vec += model[word.lower()]*nerCoeff*idfCoeff
        elif word in model.key_to_index:
            word_vec += model[word]*nerCoeff*idfCoeff
            
    if avg == True:
        word_vec = word_vec/len(words)
        
    df_q = df_q.append(pd.Series(word_vec), ignore_index=True)

    strtype, asc, dftrain = get_other_vars(type, avg, model)

    distances = type(df_q, dftrain).flatten()
    indexes = np.argsort(distances)[::asc]
    indexes = indexes[:n]
    
    return indexes, strtype

<h3> The Experiment </h3>

In [30]:
wordEmbs = [word2vec, glove]
functions = [get_idx, get_idx_idf, get_idx_pos, get_idx_ner, 
             get_idx_pos_ner, get_idx_pos_idf, get_idx_ner_idf]
avgs = [True, False]
measures = [cosine_similarity]


In [31]:
df_sum, df_avg = fit_training(word2vec)

In [32]:
df_sum_glove, df_avg_glove = fit_training(glove)

In [33]:
def get_rank(type, n, model, avg, function):
    results = pd.DataFrame(columns = ["verdict", "indexes"])
    indexes = []
    name = ""
    for i, document in enumerate(df_test.iloc[:, 1]):
        indexes, strtype = function(type, n, document, model, avg)
        results = results.append(
            { "verdict" : df_test.iloc[i, 0], 
              "indexes" : indexes}, ignore_index=True)
    return results

In [34]:
results = get_rank(cosine_similarity, 100, word2vec, False, get_idx)
results.to_csv("results/w2v_sum.csv", sep = "\t")

In [35]:
results = get_rank(cosine_similarity, 100, word2vec, False, get_idx_idf)
results.to_csv("results/w2v_sum_idf.csv", sep = "\t")

KeyboardInterrupt: 

In [None]:
results = get_rank(cosine_similarity, 100, word2vec, False, get_idx_pos)
results.to_csv("results/w2v_sum_pos.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, word2vec, False, get_idx_ner)
results.to_csv("results/w2v_sum_ner.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, word2vec, False, get_idx_pos_ner)
results.to_csv("results/w2v_sum_pos_ner.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, word2vec, False, get_idx_ner_idf)
results.to_csv("results/w2v_sum_ner_idf.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, glove, False, get_idx)
results.to_csv("results/glove_sum.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, glove, False, get_idx_idf)
results.to_csv("results/glove_sum_idf.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, glove, False, get_idx_pos)
results.to_csv("results/glove_sum_pos.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, glove, False, get_idx_ner)
results.to_csv("results/glove_sum_ner.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, glove, False, get_idx_pos_ner)
results.to_csv("results/glove_sum_pos_ner.csv", sep = "\t")

In [None]:
results = get_rank(cosine_similarity, 100, glove, False, get_idx_ner_idf)
results.to_csv("results/glove_sum_ner_idf.csv", sep = "\t")

In [84]:
def gridSearch():
    i = 0
    name = ""
    for wordEmb in wordEmbs:
        for function in functions:
            for avg in avgs:
                for measure in measures:
                    indexes = get_rank(measure, 100, wordEmb, avg, function)
                    break