In [12]:
import time
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import key_words
from sentence_transformers.util import cos_sim
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
import pickle
def save_obj(obj:object,name:str):
    """Serialize ``obj`` with pickle into the file ``name`` + '.pickle'.

    Args:
        obj: any picklable Python object.
        name: destination path without the '.pickle' extension.

    The file is opened in binary write mode, so an existing file is overwritten.
    """
    target_path = name + '.pickle'
    with open(target_path, 'wb') as sink:
        # Highest protocol available to the running interpreter.
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name:str)->object:
    """Read back the object pickled in the file ``name`` + '.pickle'.

    Args:
        name: source path without the '.pickle' extension.

    Returns:
        The unpickled object.
    """
    source_path = name + '.pickle'
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(source_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [14]:
# Load the ten pickled objects data/essaySet_1_model_answers.pickle ...
# data/essaySet_10_model_answers.pickle (load_obj appends the extension),
# binding one notebook-level name per essay set.
(
    ess_1_model_answers,
    ess_2_model_answers,
    ess_3_model_answers,
    ess_4_model_answers,
    ess_5_model_answers,
    ess_6_model_answers,
    ess_7_model_answers,
    ess_8_model_answers,
    ess_9_model_answers,
    ess_10_model_answers,
) = (load_obj(f"data/essaySet_{set_no}_model_answers") for set_no in range(1, 11))

In [None]:
# Bundle the per-essay-set embedding variables into one dict and pickle it to
# data/model_answer_emb.pickle (save_obj appends the extension).
# NOTE(review): none of the ess_<k>_model_answers_emb names is assigned in the cells
# shown in this notebook, so this cell raises NameError on a fresh kernel — confirm
# how the embeddings were produced before re-running it.
emb_dict = {'ess_1_model_answers_emb': ess_1_model_answers_emb,
 'ess_2_model_answers_emb': ess_2_model_answers_emb,
 'ess_3_model_answers_emb': ess_3_model_answers_emb,
 'ess_4_model_answers_emb': ess_4_model_answers_emb,
 'ess_5_model_answers_emb': ess_5_model_answers_emb,
 'ess_6_model_answers_emb': ess_6_model_answers_emb,
 'ess_7_model_answers_emb': ess_7_model_answers_emb,
 'ess_8_model_answers_emb': ess_8_model_answers_emb,
 'ess_9_model_answers_emb': ess_9_model_answers_emb,
 'ess_10_model_answers_emb': ess_10_model_answers_emb}
save_obj(emb_dict, "data/model_answer_emb")

In [24]:
emb_dict = load_obj("data/model_answer_emb")

In [9]:
import transformer_model

BERT = transformer_model.BERTModel()

In [15]:
def maximal_marginal_relevance(doc_embedding: np.ndarray,
        word_embeddings: np.ndarray,
        words,
        top_n = 5,
        diversity = 0.8):
    """
    Maximal Marginal Relevance algorithm for keyword extraction
    * from KeyBERT repository on github

    The word most similar to the document is selected first. Every further
    selection maximises
    ``(1 - diversity) * sim(word, doc) - diversity * max(sim(word, selected))``,
    so a higher ``diversity`` penalises words close to the ones already chosen.

    Args:
        doc_embedding (numpy.ndarray): document embedding, 1-D or of shape
            (1, dim), e.g. (1, 768); a 1-D input is reshaped to (1, dim)
        word_embeddings (numpy.ndarray): embeddings of shape (N, dim)
        words (List[str]): list of the N words, aligned with word_embeddings
        top_n (Optional[int]): number of top words to extract
        diversity (Optional[float]): weight of the redundancy penalty

    Returns:
        List[str]: the selected words in selection order (no scores). Empty
        when ``words`` is empty; otherwise the first word is always selected,
        followed by up to ``top_n - 1`` more while candidates remain.
    """
    # Nothing to rank: np.argmax below would raise ValueError on an empty array.
    if len(words) == 0:
        return []

    # make sure 2d array
    if doc_embedding.ndim == 1:
        doc_embedding = doc_embedding.reshape(1, -1)

    # Similarity between every word and the document, and between all word
    # pairs; clipped to [-1, 1] and rounded to 6 decimals.
    word_doc_similarity = np.array(cos_sim(word_embeddings, doc_embedding)).clip(-1, 1).round(6)
    word_similarity = np.array(cos_sim(word_embeddings, word_embeddings)).clip(-1, 1).round(6)

    # Start from the word closest to the document; everything else is a candidate.
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Every word has been selected already: stop instead of idling.
        if not candidates_idx:
            break

        # Relevance of each remaining candidate to the document ...
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        # ... and its highest similarity to any already selected keyword.
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate maximal_marginal_relevance
        mmr = (1-diversity) * candidate_similarities -\
            diversity * target_similarities.reshape(-1, 1)
        best_idx = candidates_idx[int(np.argmax(mmr))]

        # Update keywords & candidates
        keywords_idx.append(best_idx)
        candidates_idx.remove(best_idx)
    return [words[idx] for idx in keywords_idx]

In [22]:
def compare_keys_div(ind,essay,top_n=5,diversities=(0.8,)):
    """Print the MMR keywords of one answer for each diversity value.

    Reads three notebook-level variables: ``emb_dict`` (looked up with the key
    ``ess_{essay}_model_answers_emb``; each element is used as a document
    embedding), ``model_candidate_emb`` and ``model_candidates`` (used as the
    candidate embeddings and candidate words of the same position).

    Args:
        ind (int): position of the answer to show; negative values count from
            the end of the zipped sequences.
        essay: essay-set number inserted into the ``emb_dict`` key.
        top_n (int): number of keywords requested from
            ``maximal_marginal_relevance``.
        diversities (Iterable[float]): diversity values to compare.

    Returns:
        None. One line ``<diversity> <keywords>`` is printed per diversity.
    """
    # zip() truncates to the shortest of the three sequences and ``ind`` is
    # resolved against that truncated list.
    aligned = list(zip(emb_dict[f'ess_{essay}_model_answers_emb'],
        model_candidate_emb, model_candidates))
    doc_emb, cand_emb, cand_words = aligned[ind]

    # Only the selected answer is ranked, so other answers (e.g. ones without
    # any candidates) cannot slow down or break the comparison.
    for div in diversities:
        keywords = maximal_marginal_relevance(doc_emb.reshape(1, -1), cand_emb, cand_words,
            top_n=top_n, diversity=div)
        print(div,keywords)

# ESSAY SET 1

In [None]:
ess_1_model_answers[-1], "len", ess_1_model_answers[-1].split().__len__()

('In order to replicate this experiment, you would need to know what the samples were. Also, step three in their procedure is a little confusing and to replicate this experiment they may want to give a little bit more directions and be more specific. They might also want to say what size containers to use or how much vinegar to add to the container to allow someone also to replicate.',
 'len',
 70)

> kws extracted manually by me

know what the samples

little bit more directions and be more specific

size containers

how much vinegar to add

In [54]:
# (2,3)
compare_keys_div(-1,essay=1,top_n=9,diversities=list(np.arange(0,1,0.1).round(1)))

0.0 ['replicate experiment need', 'replicate experiment want', 'order replicate experiment', 'confusing replicate experiment', 'experiment need know', 'size containers use', 'replicate experiment', 'say size containers', 'experiment want little']
0.1 ['replicate experiment need', 'order replicate experiment', 'size containers use', 'replicate experiment want', 'know samples step', 'confusing replicate experiment', 'experiment need know', 'container allow replicate', 'little confusing replicate']
0.2 ['replicate experiment need', 'size containers use', 'know samples step', 'little confusing replicate', 'order replicate experiment', 'replicate experiment want', 'container allow replicate', 'experiment need know', 'confusing replicate experiment']
0.3 ['replicate experiment need', 'size containers use', 'know samples step', 'little confusing replicate', 'container allow replicate', 'use vinegar add', 'experiment want little', 'order replicate experiment', 'experiment need know']
0.4 ['rep

> (2,3) , n=7, div = 0.5, 0.6 ,0.7

In [13]:
ess_1_model_answers[-2], "len", ess_1_model_answers[-2].split().__len__()

("In order to replicate this group's procedure, I would need the following information: ^p 1. When they say in step one that they need to determine the mass of four different samples, they should name and list the samples that they plan on using. ^p 2. In step two when they say to pour vinegar in each of the four separate but identical containers, they should make mention or how much vinegar we should pour in. ^p 3. When they say in step  four that we should rinse each sample with distilled water, how much water should we use and how long should we rinse.",
 'len',
 105)

> kws extracted manually by me

need to determine the mass of four different samples

name and list the samples that they plan on using

say to pour vinegar

how much vinegar we should pour

rinse each sample with distilled water, how much water should we use and how long should we rinse.

In [45]:
# NOTE(review): essay=2 makes compare_keys_div read emb_dict['ess_2_model_answers_emb'],
# while the answer inspected in the previous cell is ess_1_model_answers[-2] — confirm
# which essay set is intended.
compare_keys_div(-2,essay=2,top_n=9,diversities=list(np.arange(0,1,0.1).round(1)))

0.0 ['identical containers make', 'identical containers', 'mass different samples', 'use long', 'samples plan using', 'different samples', 'replicate group procedure', 'separate identical containers', 'different samples list']
0.1 ['identical containers make', 'use long', 'mass different samples', 'replicate group procedure', 'samples plan using', 'identical containers', 'different samples', 'separate identical containers', 'procedure need']
0.2 ['identical containers make', 'use long', 'mass different samples', 'replicate group procedure', 'samples plan using', 'procedure need', 'identical containers', 'using step', 'order replicate']
0.3 ['identical containers make', 'use long', 'mass different samples', 'replicate group procedure', 'samples plan using', 'procedure need', 'using step', 'order replicate', 'vinegar separate identical']
0.4 ['identical containers make', 'use long', 'mass different samples', 'replicate group procedure', 'samples plan using', 'procedure need', 'using step

In [14]:
# (1,2)
compare_keys_div(-2,essay=2,top_n=5,diversities=list(np.arange(0,1,0.1).round(1)))

0.0 ['identical containers', 'use long', 'different samples', 'samples', 'samples plan']
0.1 ['identical containers', 'use long', 'different samples', 'samples plan', 'samples']
0.2 ['identical containers', 'use long', 'different samples', 'samples plan', 'procedure']
0.3 ['identical containers', 'use long', 'different samples', 'samples plan', 'order replicate']
0.4 ['identical containers', 'use long', 'samples plan', 'different samples', 'procedure need']
0.5 ['identical containers', 'use long', 'samples plan', 'different samples', 'procedure need']
0.6 ['identical containers', 'use long', 'samples plan', 'different samples', 'determine mass']
0.7 ['identical containers', 'use long', 'samples plan', 'determine mass', 'sample distilled']
0.8 ['identical containers', 'pour say', 'use long', 'samples plan', 'following information']
0.9 ['identical containers', 'pour say', 'need following', 'samples plan', 'long rinse']


> (2,3) , n=7, div = 0.7

In [15]:
ess_1_model_answers[-3]

"After reading the group's procedure, additional information that a would need in order to replicate the experiment are that a need to know the size of the containers that the samples are put in, for they need to be identical. U also need to know tha amount of samples u need to put in each container in order to obtain tha same starting mass the group of students recieved from the samples. Another factor a should know is the amount no vinegar that u pour into the containers concerning the oud example of the experiment. Another good peice of information would be the location where the samples are drying and amount of sunlight."

> kws extracted manually by me

 know the size

samples are put in

know tha amount of samples

to put in each container

obtain tha same starting mass

Another factor amount no vinegar

location where the samples are drying and amount of sunlight.


In [23]:
compare_keys_div(-3,essay=2,top_n=5,diversities=list(np.arange(0,1,0.1).round(1)))

0.0 ['samples need identical', 'samples factor', 'recieved samples factor', 'size containers samples', 'containers samples need']
0.1 ['samples need identical', 'samples factor', 'recieved samples factor', 'group procedure additional', 'size containers samples']
0.2 ['samples need identical', 'recieved samples factor', 'group procedure additional', 'replicate experiment', 'size containers samples']
0.3 ['samples need identical', 'group procedure additional', 'recieved samples factor', 'size containers', 'replicate experiment']
0.4 ['samples need identical', 'additional information', 'size containers', 'recieved samples factor', 'replicate experiment']
0.5 ['samples need identical', 'tha starting mass', 'additional information', 'size containers', 'oud example experiment']
0.6 ['samples need identical', 'tha starting mass', 'additional information', 'pour containers concerning', 'oud example experiment']
0.7 ['samples need identical', 'tha starting mass', 'additional information', 'expe

In [47]:
# (2,3)
compare_keys_div(-3,essay=2,top_n=9,diversities=list(np.arange(0,1,0.1).round(1)))

0.0 ['samples need identical', 'samples factor', 'recieved samples factor', 'size containers samples', 'containers samples need', 'samples need', 'recieved samples', 'tha samples need', 'group procedure additional']
0.1 ['samples need identical', 'samples factor', 'recieved samples factor', 'group procedure additional', 'size containers samples', 'replicate experiment', 'tha samples need', 'containers samples need', 'samples need']
0.2 ['samples need identical', 'recieved samples factor', 'group procedure additional', 'replicate experiment', 'size containers samples', 'samples factor', 'tha samples need', 'additional information need', 'samples drying']
0.3 ['samples need identical', 'group procedure additional', 'recieved samples factor', 'size containers', 'replicate experiment', 'containers samples need', 'peice information', 'tha starting mass', 'samples factor']
0.4 ['samples need identical', 'additional information', 'size containers', 'recieved samples factor', 'replicate experi

In [22]:
# (2,3)
compare_keys_div(-3,essay=2,top_n=5,diversities=list(np.arange(0,1,0.1).round(1)))

0.0 ['samples need identical', 'samples factor', 'recieved samples factor', 'size containers samples', 'containers samples need']
0.1 ['samples need identical', 'samples factor', 'recieved samples factor', 'group procedure additional', 'size containers samples']
0.2 ['samples need identical', 'recieved samples factor', 'group procedure additional', 'replicate experiment', 'size containers samples']
0.3 ['samples need identical', 'group procedure additional', 'recieved samples factor', 'size containers', 'replicate experiment']
0.4 ['samples need identical', 'additional information', 'size containers', 'recieved samples factor', 'replicate experiment']
0.5 ['samples need identical', 'tha starting mass', 'additional information', 'size containers', 'oud example experiment']
0.6 ['samples need identical', 'tha starting mass', 'additional information', 'pour containers concerning', 'oud example experiment']
0.7 ['samples need identical', 'tha starting mass', 'additional information', 'expe

In [49]:
# Build the candidates of every entry of ess_1_model_answers with
# key_words.candidates_tokens (n_gram_range=(3,3)), then encode each candidate
# list with BERT.model.
model_candidates = [
    key_words.candidates_tokens(ans, n_gram_range=(3,3))
    for ans in ess_1_model_answers
]
model_candidate_emb = [BERT.model.encode(cand) for cand in model_candidates]
# NOTE(review): the candidates come from ess_1_model_answers but essay=2 selects
# emb_dict['ess_2_model_answers_emb'] — confirm which essay set is intended.
compare_keys_div(-3,essay=2,top_n=10,diversities=list(np.arange(0,1,0.1).round(1)))

0.0 ['samples need identical', 'recieved samples factor', 'size containers samples', 'containers samples need', 'tha samples need', 'group procedure additional', 'replicate experiment need', 'samples need container', 'samples factor know', 'additional information need']
0.1 ['samples need identical', 'recieved samples factor', 'group procedure additional', 'size containers samples', 'tha samples need', 'replicate experiment need', 'containers samples need', 'additional information need', 'samples factor know', 'location samples drying']
0.2 ['samples need identical', 'recieved samples factor', 'group procedure additional', 'size containers samples', 'replicate experiment need', 'tha samples need', 'additional information need', 'containers samples need', 'information location samples', 'samples factor know']
0.3 ['samples need identical', 'group procedure additional', 'recieved samples factor', 'know size containers', 'oud example experiment', 'containers samples need', 'tha starting m

> (3,3) , n=5 , div =0.8

In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

Downloading: 100%|██████████| 968/968 [00:00<00:00, 484kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 94.9kB/s]
Downloading: 100%|██████████| 3.79k/3.79k [00:00<00:00, 3.79MB/s]
Downloading: 100%|██████████| 645/645 [00:00<00:00, 646kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 61.0kB/s]
Downloading: 100%|██████████| 229/229 [00:00<00:00, 229kB/s]
Downloading: 100%|██████████| 471M/471M [06:09<00:00, 1.28MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 53.0kB/s]
Downloading: 100%|██████████| 5.07M/5.07M [00:02<00:00, 2.03MB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 239kB/s]
Downloading: 100%|██████████| 9.08M/9.08M [00:04<00:00, 1.94MB/s]
Downloading: 100%|██████████| 480/480 [00:00<00:00, 240kB/s]
Downloading: 100%|██████████| 14.8M/14.8M [00:07<00:00, 1.91MB/s]


In [24]:
# Build the candidates of every entry of ess_1_model_answers with
# key_words.candidates_tokens (n_gram_range=(2,3)), then encode each candidate
# list with the SentenceTransformer held in `model`.
model_candidates = [
    key_words.candidates_tokens(ans, n_gram_range=(2,3))
    for ans in ess_1_model_answers
]
model_candidate_emb = [model.encode(cand) for cand in model_candidates]

In [25]:
ind = -1
# The answer embeddings do not depend on the diversity value, so encode the
# model answers once instead of once per loop iteration.
ess_1_answer_emb = model.encode(ess_1_model_answers)
for div in list(np.arange(0,1,0.1).round(1)):
    # One keyword list per answer; zip() pairs each answer embedding with the
    # candidate embeddings and candidate words at the same position.
    keywords = [
        maximal_marginal_relevance(doc_emb.reshape(1, -1), cand_emb, cand_words, top_n=9, diversity=div)
        for doc_emb, cand_emb, cand_words in zip(ess_1_answer_emb, model_candidate_emb, model_candidates)
    ]
    print(div,keywords[ind])

0.0 ['replicate experiment need', 'replicate experiment want', 'confusing replicate experiment', 'replicate experiment', 'experiment need know', 'experiment need', 'samples step procedure', 'order replicate experiment', 'experiment want']
0.1 ['replicate experiment need', 'experiment need know', 'confusing replicate experiment', 'replicate experiment want', 'samples step procedure', 'replicate experiment', 'experiment need', 'order replicate experiment', 'containers use vinegar']
0.2 ['replicate experiment need', 'samples step procedure', 'experiment need know', 'containers use vinegar', 'confusing replicate experiment', 'replicate experiment want', 'replicate experiment', 'experiment need', 'order replicate experiment']
0.3 ['replicate experiment need', 'containers use vinegar', 'samples step procedure', 'experiment need know', 'confusing replicate experiment', 'order replicate experiment', 'container allow replicate', 'replicate experiment want', 'replicate experiment']
0.4 ['replica

> long (3,3) top_n = 7, div = 0.7

> mid (2,2) top_n = 5, div = 0.65

> short (1,1) top_n = 5, div = 0.5

In [20]:
ess_dict = {
    "ess_1_model_answers": ess_1_model_answers,
    "ess_2_model_answers": ess_2_model_answers,
    "ess_3_model_answers": ess_3_model_answers,
    "ess_4_model_answers": ess_4_model_answers,
    "ess_5_model_answers": ess_5_model_answers,
    "ess_6_model_answers": ess_6_model_answers,
    "ess_7_model_answers": ess_7_model_answers,
    "ess_8_model_answers": ess_8_model_answers,
    "ess_9_model_answers": ess_9_model_answers,
    "ess_10_model_answers": ess_10_model_answers,
}

In [29]:
# Build the candidates of every entry of ess_dict["ess_9_model_answers"] with
# key_words.candidates_tokens (n_gram_range=(2,3)), then encode each candidate
# list with BERT.model.
model_candidates = [
    key_words.candidates_tokens(ans, n_gram_range=(2,3))
    for ans in ess_dict["ess_9_model_answers"]
]
model_candidate_emb = [BERT.model.encode(cand) for cand in model_candidates]


In [26]:
ess_9_model_answers[-1]

"The author has a very effective way of organizing the the article. He first introduces the beginning with some startling statements to get your attention, statements such as, 'Grab your telescope! Look up in the sky! It's a comet! It's a meteor!' (1).       Once grabbing the readers attention, he breaks the article into several segments labeled, 'What Is Space Junk?', 'Crash Course,' and 'Little Bits, But a Big Deal.'  All of the following are to condense the article into areas that focus on those specific ideas.     By doing so, it seems that the article was very effective in keeping the reader reading, instead of organizing the article into just one big clump.  Having the organization that it did made the article easy to read and very interesting at the same time, without leaving out any details."

 has a very effective way of organizing the the article

 Once grabbing the readers attention

Space Junk?

Crash Course

In [28]:
compare_keys_div(-1,essay=9,top_n=10,diversities=list(np.arange(0,1,0.1).round(1)))


0.0 ['organizing article introduces', 'organizing article just', 'breaks article segments', 'condense article areas', 'article introduces beginning', 'article segments labeled', 'way organizing article', 'article just big', 'instead organizing article', 'attention breaks article']
0.1 ['organizing article introduces', 'breaks article segments', 'organizing article just', 'condense article areas', 'article introduces beginning', 'article segments labeled', 'article just big', 'way organizing article', 'article areas focus', 'following condense article']
0.2 ['organizing article introduces', 'breaks article segments', 'article just big', 'condense article areas', 'organizing article just', 'article segments labeled', 'meteor grabbing readers', 'article introduces beginning', 'space junk crash', 'statements attention statements']
0.3 ['organizing article introduces', 'space junk crash', 'article just big', 'meteor grabbing readers', 'breaks article segments', 'article areas focus', 'conde

In [30]:
# 2,3
compare_keys_div(-1,essay=9,top_n=10,diversities=list(np.arange(0,1,0.1).round(1)))

0.0 ['organizing article introduces', 'article introduces', 'organizing article', 'organizing article just', 'breaks article segments', 'condense article areas', 'article introduces beginning', 'article segments labeled', 'condense article', 'way organizing article']
0.1 ['organizing article introduces', 'breaks article segments', 'organizing article just', 'condense article areas', 'article introduces', 'organizing article', 'article segments labeled', 'article introduces beginning', 'article just big', 'condense article']
0.2 ['organizing article introduces', 'breaks article segments', 'article just big', 'condense article areas', 'organizing article just', 'article introduces', 'article segments labeled', 'meteor grabbing readers', 'organizing article', 'space junk crash']
0.3 ['organizing article introduces', 'space junk crash', 'article just big', 'meteor grabbing readers', 'breaks article segments', 'article areas focus', 'condense article', 'organizing article just', 'statements

## One limitation

https://link.springer.com/chapter/10.1007/978-3-030-79150-6_50#Sec10

is that KeyBERT performs badly on short documents, i.e. documents with fewer than 2 * top_n unique terms.

In [None]:
ess_1_model_answers[0].split()

['You',
 'need',
 'to',
 'know',
 'how',
 'much',
 'vinegar',
 'was',
 'used',
 'in',
 'each',
 'container.']

In [None]:
['arabic', 'english', 'french', 'german', 'italian', 'japanese', 'korean', 'portuguese', 'russian', 'spanish']

['arabic',
 'english',
 'french',
 'german',
 'italian',
 'japanese',
 'korean',
 'portuguese',
 'russian',
 'spanish']

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = stopwords.words('english')
len(set(stop_words))

179

In [None]:
import string
doc = 'You need to know how much vinegar was used in each container.'
# remove stopwords from doc
tokens = [w for w in set(doc.split()) if w not in stop_words]
# remove punctuation from tokens
# NOTE(review): this comprehension iterates over [doc] (the whole sentence), not over
# `tokens`, so the stop-word-filtered list built above is discarded and `tokens` ends
# up as a one-element list holding the full sentence without punctuation.
tokens = [w.translate(str.maketrans('', '', string.punctuation)) for w in [doc]]

In [None]:
tokens,doc

(['You need to know how much vinegar was used in each container'],
 'You need to know how much vinegar was used in each container.')

In [33]:
def get_ngram_top_n_diversity(docs,threshold=0.7):
    """Bucket a collection of documents by their median number of unique tokens.

    Every document is stripped of punctuation, split on whitespace,
    de-duplicated and filtered against ``stop_words``. The median token count
    over all documents selects the bucket:

    * below 10          -> short (prints a warning), returns 100
    * 10 up to below 20 -> "mid", returns 100
    * otherwise         -> "long", returns 200

    The token lists are then encoded with ``BERT.model`` and compared with
    ``model_answer_emb`` through ``cos_sim``; the shape of the first embedding
    is printed.

    Relies on notebook-level names: ``string``, ``stop_words``, ``BERT``,
    ``cos_sim`` and ``model_answer_emb``.
    NOTE(review): ``model_answer_emb`` is not assigned in the cells shown in
    this notebook — it has to exist in the kernel before calling this.

    Args:
        docs (Iterable[str]): documents to analyse.
        threshold (float): similarity cut-off applied to the token/answer
            similarities in the short bucket.

    Returns:
        int: 100 for the short and mid buckets, 200 for the long bucket.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = []
    lens = []
    for doc in docs:
        # Strip punctuation first so "container." and "container" count as
        # the same token, then drop duplicates and stop words.
        unique_words = set(doc.translate(strip_punct).split())
        tokens_ = [w for w in unique_words if w not in stop_words]
        tokens.append(tokens_)
        # Size of this document's token list (not the number of documents seen).
        lens.append(len(tokens_))

    median_len = np.median(np.array(lens))
    is_short = median_len < 10
    # Exactly one bucket applies; a median of exactly 10 counts as "mid".
    if is_short:
        print("Too few tokens in the documents. Please check the input.")
        bucket = 100
    elif median_len < 20:
        print("mid")
        # MMR (not implemented here)
        bucket = 100
    else:
        print("long")
        # MMR (not implemented here)
        bucket = 200

    # Encode each document's token list once and compare it with the answer embedding.
    model_candidates = tokens
    model_candidate_emb = [BERT.model.encode(cand) for cand in model_candidates]
    similarities = [np.array(cos_sim(model_answer_emb, emb)) for emb in model_candidate_emb]
    if is_short:
        # Positions whose similarity exceeds the threshold.
        # NOTE(review): the result is computed but not returned — confirm how
        # the short bucket is meant to use it.
        above_threshold = [np.where(sim > threshold) for sim in similarities]
    print(model_candidate_emb[0].shape)

    return bucket

In [39]:
ans_emb = BERT.model.encode("i am so dump as new")
cand = set("i am so dump as new".split())

In [43]:
cand

{'am', 'as', 'dump', 'i', 'new', 'so'}

In [61]:
cands = np.array(list(map(lambda x: np.array(cos_sim(ans_emb,BERT.model.encode(x))),cand)))
np.where(cands>0.4)[0]+1

array([4, 6], dtype=int64)

In [117]:
t = [w.translate(str.maketrans('', '', string.punctuation)) for w in [ess_1_model_answers[-1]]]
# t = [*set([w for w in set(doc.split()) if w not in stop_words])]
t = [*set([w for w in set(t[0].split()) if w not in stop_words])]
# cands = np.array(list(map(lambda x: np.array(cos_sim(ans_emb,BERT.model.encode(x))),cand)))

In [119]:
len(t)

33

In [120]:
ess_1_model_answers[-1]

'In order to replicate this experiment, you would need to know what the samples were. Also, step three in their procedure is a little confusing and to replicate this experiment they may want to give a little bit more directions and be more specific. They might also want to say what size containers to use or how much vinegar to add to the container to allow someone also to replicate.'

In [114]:
t

['used', 'You', 'much', 'need', 'container.', 'vinegar', 'know']

In [121]:
maximal_marginal_relevance(
        ans_emb.reshape(1, -1),BERT.model.encode(t),t,top_n=5,
        diversity=0.4)
# maximal_marginal_relevance(
#         ans_emb.reshape(1, -1),BERT.model.encode([*t]),[*t],top_n=5,
#         diversity=0.4)

['much', 'confusing', 'replicate', 'container', 'experiment']

In [52]:
cands[0].shape

(1, 1)

In [51]:
np.where(cands[0]>0.1)

(array([0], dtype=int64), array([0], dtype=int64))

short -> set stopwords (1,1) max sim top sorted 5