In [6]:
import re
import json
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from itertools import combinations, permutations

import spacy
from thinc.api import set_gpu_allocator, require_gpu
import pyate

import tika
from tika import parser
from glob import glob
from pathlib import Path

from sentence_transformers import SentenceTransformer

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer, TreebankWordTokenizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# pytorch library
import gc
import GPUtil
import torch # the main pytorch library
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f # the sub-library containing different functions for manipulating with tensors

# huggingface's transformers library
from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoModel

%matplotlib inline

In [9]:
#!pip install GPUtil

In [10]:
set_gpu_allocator("pytorch")
print(require_gpu(0))
# torch.cuda.is_available()
# torch.cuda.current_device()
# torch.cuda.device_count()
#torch.cuda.get_device_name(0)
print(GPUtil.showUtilization())

True
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
|  1 |  0% |  0% |
None


### Extract Text

In [11]:
def extract(pdf, remove_stop_words=False):
    stop_words = set(stopwords.words('english'))
    raw = parser.from_file(pdf)
    content = raw['content'].strip()
    datepat = r"[0-9]{0,2}/[0-9]{0,2}/[0-9]{0,2}, [0-9]{0,2}:[0-9]{0,2} [AP]M"
    sents = [s.strip() for s in sent_tokenize(content)]
    sents = [re.sub(datepat, '', s) for s in sents]
    tokenizer = RegexpTokenizer(r'\w+')
    pattern = r"Page [0-9]{0,3} of [0-9]{0,3}"
    urlpat = r'^https?:\/\/.*[\r\n]*'
    sents = [re.sub(pattern, '', s) for s in sents]
    sents = [re.sub(urlpat, '', s, flags=re.MULTILINE) for s in sents]
    if remove_stop_words:
        sent_tokens = [tokenizer.tokenize(s) for s in sents]
        stripped_sents = []
        for sent in sent_tokens:
            temp_sent = []
            for t in sent:
                if t.lower() not in stop_words:
                    temp_sent.append(t)
            stripped_sents.append(" ".join(temp_sent))
        return " ".join(stripped_sents)
    else:
        sents = [" ".join(tokenizer.tokenize(s)) for s in sents] # why??
        return " ".join(sents)

In [12]:
def extract_sents(pdf):
    raw = parser.from_file(pdf)
    content = raw['content'].strip()
    datepat = r"[0-9]{0,2}/[0-9]{0,2}/[0-9]{0,2}, [0-9]{0,2}:[0-9]{0,2} [AP]M"
    sents = [s.strip() for s in sent_tokenize(content)]
    sents = [re.sub(datepat, '', s) for s in sents]
    pattern = r"Page [0-9]{0,3} of [0-9]{0,3}"
    urlpat = r'^https?:\/\/.*[\r\n]*'
    sents = [re.sub(pattern, '', s) for s in sents]
    sents = [re.sub(urlpat, '', s, flags=re.MULTILINE) for s in sents]
    sents = [re.sub(r'\n', ' ', s) for s in sents]
    sents = [s for s in sents if s != '']
    tokenizer = RegexpTokenizer(r'\w+')
    sents = [" ".join(tokenizer.tokenize(s.lower())) for s in sents]
    return sents

In [13]:
def cosine_sim(doc, text, n=1):
    """
    Take a single doc from a corpus and a text string and find the similar sentences
    """
    t = []
    text = text.strip().lower()
    content = [s.strip().lower() for s in doc ]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(content)
    Xquery = vectorizer.transform([text])
    sims = cosine_similarity(X, Xquery).flatten()
    top_n = np.argsort(sims)[::-1][:n]
    for i in top_n:
        t.append((doc[i], sims[i]))
    return t

## OpenVault

In [24]:
path = "../mr_minio/data/openvault"
pdfs = glob(path+'/*Jersey*Open*pdf')

openvault = " ".join([extract(pdf, remove_stop_words=True) for pdf in pdfs])

## Federos

In [21]:
path = "../mr_minio/data/openvault"
pdfs = glob(path+'/*Feder*pdf')

federos = " ".join([extract(pdf, remove_stop_words=True) for pdf in pdfs])

In [23]:
type(federos)

str

## Incognito

In [19]:
path = "../mr_minio/data/openvault"
pdfs = glob(path+'/*Vancouver*pdf')

incognito = " ".join([extract(pdf, remove_stop_words=True) for pdf in pdfs])

## Intraway

In [17]:
path = "../mr_minio/data/openvault"
pdfs = glob(path+'/*Buenos*pdf')

intraway = " ".join([extract(pdf, remove_stop_words=True) for pdf in pdfs])

## Company Similarity

In [18]:
stop_words = set(stopwords.words('english'))

corpus = [federos, incognito, intraway]

vectorizer = TfidfVectorizer(stop_words=stop_words)
#vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(corpus)
Xquery = vectorizer.transform([openvault])

sims = cosine_similarity(X, Xquery).flatten()

sims

array([0.3316285 , 0.44533993, 0.35677711])

## Per Document Similarity

In [None]:
path = "../mr_minio/data/openvault"
ovpdfs= glob(path+'/*Jersey*Open*pdf')

path = "../mr_minio/data/openvault"
fedpdfs = glob(path+'/*Feder*pdf')

path = "../mr_minio/data/openvault"
incogpdfs = glob(path+'/*Vancouver*pdf')

path = "../mr_minio/data/openvault"
intrapdfs = glob(path+'/*Buenos*pdf')

ovdocs = [extract(doc, remove_stop_words=True) for doc in ovpdfs]
feddocs = [extract(doc, remove_stop_words=True) for doc in fedpdfs]
incogdocs = [extract(doc, remove_stop_words=True) for doc in incogpdfs]
intradocs = [extract(doc, remove_stop_words=True) for doc in intrapdfs]

ovdict = defaultdict()
for i, d in enumerate(ovpdfs):
    filename = ovpdfs[i].split('/')[-1]
    ovdict[filename] = ovdocs[i]
    
feddict = defaultdict()
for i, d in enumerate(fedpdfs):
    filename = fedpdfs[i].split('/')[-1]
    feddict[filename] = feddocs[i]
    
incogdict = defaultdict()
for i, d in enumerate(incogpdfs):
    filename = incogpdfs[i].split('/')[-1]
    incogdict[filename] = incogdocs[i]
    
intradict = defaultdict()
for i, d in enumerate(intrapdfs):
    filename = intrapdfs[i].split('/')[-1]
    intradict[filename] = intradocs[i]
    
doctuples = []
for ov in ovdict.items():
    for fed in feddict.items():
        doctuples.append((ov[0], fed[0]))
        
for ov in ovdict.items():
    for incog in incogdict.items():
        doctuples.append((ov[0], incog[0]))
        
for ov in ovdict.items():
    for intra in intradict.items():
        doctuples.append((ov[0], intra[0]))

In [None]:
stop_words = set(stopwords.words('english'))

simscores = []
bp = "../mr_minio/data/openvault/"
for i in range(len(doctuples)):
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform([extract(bp+doctuples[i][1], remove_stop_words=True).lower()])
    Xquery = vectorizer.transform([extract(bp+doctuples[i][0], remove_stop_words=True).lower()])
    sims = cosine_similarity(X, Xquery).flatten()
    simscores.append((doctuples[i], sims[0]))
simscores = dict(simscores)
simscores = {k: v for k, v in sorted(simscores.items(), key=lambda item: item[1], reverse=True)}
sc = list(simscores.items())[:5]

In [None]:
sc

## PyATE

https://spacy.io/universe/project/pyate

In [None]:
# bp = "../mr_minio/data/openvault/"
# ovdoc1 = extract(bp+sc[0][0][0], remove_stop_words=True)
# incogdoc2 = extract(bp+sc[0][0][1], remove_stop_words=True)

In [None]:
# nlp = spacy.load('en_core_web_trf')
# # nlp = spacy.load('en_core_web_lg')
# nlp.add_pipe("combo_basic") # or any of `basic`, `weirdness`, `term_extractor` or `cvalue`

In [None]:
# # del nlp
# gc.collect()
# torch.cuda.empty_cache()

In [None]:
# doc_ov = nlp(ovdoc1)
# ov_nc = [nc for nc in doc_ov.noun_chunks]
# ov_kt = [kt for kt in doc_ov._.combo_basic.sort_values(ascending=False).head(10).index]

In [None]:
# def gen_tuples(a, b):
#     return [(i,j.text) for j in b for i in a]

# # for a,b in gen_tuples(ov_kt, ov_nc):
# #     print(cosine_sim(a,b))

# ov_tups = gen_tuples(ov_kt, ov_nc)

In [None]:
# doc_ov = nlp(ovdoc1)
# print(doc_ov._.combo_basic.sort_values(ascending=False).head(10))

In [None]:
# [ent.text for ent in doc_ov.ents if ent.label_ == "ORG"]

In [None]:
# for chunk in doc_ov.noun_chunks:
    print(f"{chunk.text} | {chunk.root.text} | {chunk.label_} |")

In [None]:
# ov_nc = [nc for nc in doc_ov.noun_chunks]

In [None]:
# doc_incogdoc = nlp(incogdoc2)
# print(doc_incogdoc._.combo_basic.sort_values(ascending=False).head(10))

In [None]:
# [ent.text for ent in doc_incogdoc.ents if ent.label_ == "ORG"]

In [None]:
# nlp = spacy.load("en_core_web_lg")  # make sure to use larger package!
# doc1 = nlp(ovdoc1)
# doc2 = nlp(incogdoc2)
# doc1.similarity(doc2)

In [None]:
# nlp_latin = spacy.load("/srv/fasttext/en_vectors")
# doc1 = nlp(ovdoc1)
# doc2 = nlp(incogdoc2)
# doc1.similarity(doc2)

In [None]:
# set_gpu_allocator("pytorch")
# require_gpu(0)

# nlp = spacy.load("en_core_web_trf")
# for doc in nlp.pipe([ovdoc1, incogdoc2]):
#     tokvecs = doc._.trf_data.tensors[-1]
    



In [None]:
# set_gpu_allocator("pytorch")
# require_gpu(0)

# torch.cuda.empty_cache()

## BERT For Measuring Text Similarity

In [1]:
import gc
import GPUtil
import torch # the main pytorch library
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f # the sub-library containing different functions for manipulating with tensors
from thinc.api import set_gpu_allocator, require_gpu
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import re
import tika
from tika import parser
from glob import glob

from nltk.tokenize import sent_tokenize, RegexpTokenizer

set_gpu_allocator("pytorch")
print(require_gpu(0))

True


In [2]:
def extract_sents(pdf):
    raw = parser.from_file(pdf)
    content = raw['content'].strip()
    datepat = r"[0-9]{0,2}/[0-9]{0,2}/[0-9]{0,2}, [0-9]{0,2}:[0-9]{0,2} [AP]M"
    sents = [s.strip() for s in sent_tokenize(content)]
    sents = [re.sub(datepat, '', s) for s in sents]
    pattern = r"Page [0-9]{0,3} of [0-9]{0,3}"
    urlpat = r'^https?:\/\/.*[\r\n]*'
    sents = [re.sub(pattern, '', s) for s in sents]
    sents = [re.sub(urlpat, '', s, flags=re.MULTILINE) for s in sents]
    sents = [re.sub(r'\n', ' ', s) for s in sents]
    sents = [s for s in sents if s != '']
    tokenizer = RegexpTokenizer(r'\w+')
    sents = [" ".join(tokenizer.tokenize(s.lower())) for s in sents]
    return sents

In [3]:
path = "../mr_minio/data/openvault"
ovpdfs= glob(path+'/*Jersey*Open*pdf')

path = "../mr_minio/data/openvault"
fedpdfs = glob(path+'/*Feder*pdf')

path = "../mr_minio/data/openvault"
incogpdfs = glob(path+'/*Vancouver*pdf')

path = "../mr_minio/data/openvault"
intrapdfs = glob(path+'/*Buenos*pdf')

ovdocs = [extract_sents(doc) for doc in ovpdfs]
feddocs = [extract_sents(doc) for doc in fedpdfs]
incogdocs = [extract_sents(doc) for doc in incogpdfs]
intradocs = [extract_sents(doc) for doc in intrapdfs]

ovsents = [sent for doc in ovdocs for sent in doc]
fedsents = [sent for doc in feddocs for sent in doc]
incogsents = [sent for doc in incogpdfs for sent in doc]
intragsents = [sent for doc in intradocs for sent in doc]

In [4]:
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('bert-base-nli-mean-tokens')

sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(4, 768)

In [6]:
cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[1:]
)

array([[0.33088905, 0.7219258 , 0.5548363 ]], dtype=float32)

In [7]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

In [8]:
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

In [9]:
# initialize dictionary to store tokenized sentences
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in sentences:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# reformat list of tensors into single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

In [10]:
outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [11]:
embeddings = outputs.last_hidden_state
embeddings

tensor([[[-0.0692,  0.6230,  0.0354,  ...,  0.8033,  1.6314,  0.3281],
         [ 0.0367,  0.6842,  0.1946,  ...,  0.0848,  1.4747, -0.3008],
         [-0.0121,  0.6543, -0.0727,  ..., -0.0326,  1.7717, -0.6812],
         ...,
         [ 0.1953,  1.1085,  0.3390,  ...,  1.2826,  1.0114, -0.0728],
         [ 0.0902,  1.0288,  0.3297,  ...,  1.2940,  0.9865, -0.1113],
         [ 0.1240,  0.9737,  0.3933,  ...,  1.1359,  0.8768, -0.1043]],

        [[-0.3212,  0.8251,  1.0554,  ..., -0.1855,  0.1517,  0.3937],
         [-0.7146,  1.0297,  1.1217,  ...,  0.0331,  0.2382, -0.1563],
         [-0.2352,  1.1353,  0.8594,  ..., -0.4310, -0.0272, -0.2968],
         ...,
         [-0.5400,  0.3236,  0.7839,  ...,  0.0022, -0.2994,  0.2659],
         [-0.5643,  0.3187,  0.9576,  ...,  0.0342, -0.3030,  0.1878],
         [-0.5172,  0.3599,  0.9336,  ...,  0.0243, -0.2232,  0.1672]],

        [[-0.7576,  0.8399, -0.3792,  ...,  0.1271,  1.2514,  0.1365],
         [-0.6591,  0.7614, -0.4662,  ...,  0

In [12]:
embeddings.shape

torch.Size([4, 128, 768])

In [13]:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([4, 128])

In [14]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([4, 128, 768])

In [15]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 

In [16]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([4, 128, 768])

In [17]:
masked_embeddings

tensor([[[-0.0692,  0.6230,  0.0354,  ...,  0.8033,  1.6314,  0.3281],
         [ 0.0367,  0.6842,  0.1946,  ...,  0.0848,  1.4747, -0.3008],
         [-0.0121,  0.6543, -0.0727,  ..., -0.0326,  1.7717, -0.6812],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000]],

        [[-0.3212,  0.8251,  1.0554,  ..., -0.1855,  0.1517,  0.3937],
         [-0.7146,  1.0297,  1.1217,  ...,  0.0331,  0.2382, -0.1563],
         [-0.2352,  1.1353,  0.8594,  ..., -0.4310, -0.0272, -0.2968],
         ...,
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000]],

        [[-0.7576,  0.8399, -0.3792,  ...,  0.1271,  1.2514,  0.1365],
         [-0.6591,  0.7614, -0.4662,  ...,  0

In [18]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([4, 768])

In [19]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([4, 768])

In [20]:
summed_mask

tensor([[15., 15., 15.,  ..., 15., 15., 15.],
        [22., 22., 22.,  ..., 22., 22., 22.],
        [15., 15., 15.,  ..., 15., 15., 15.],
        [14., 14., 14.,  ..., 14., 14., 14.]])

In [21]:
mean_pooled = summed / summed_mask
mean_pooled

tensor([[ 0.0745,  0.8637,  0.1795,  ...,  0.7734,  1.7247, -0.1803],
        [-0.3715,  0.9729,  1.0840,  ..., -0.2552, -0.2759,  0.0358],
        [-0.5030,  0.7950, -0.1240,  ...,  0.1441,  0.9704, -0.1791],
        [-0.2131,  1.0175, -0.8833,  ...,  0.7371,  0.1947, -0.3011]],
       grad_fn=<DivBackward0>)

In [23]:
# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().cpu().numpy()

# calculate
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

array([[0.33088908, 0.7219258 , 0.5548364 ]], dtype=float32)