In [1]:
import time
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import key_words
from sentence_transformers.util import cos_sim

import pickle
def save_obj(obj: object, name: str):
    """Pickle `obj` to the file `<name>.pickle`.

    Args:
        obj: any picklable Python object.
        name: target path WITHOUT extension; '.pickle' is appended.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. The highest pickle protocol available is used.
    """
    pickle_path = name + '.pickle'
    with open(pickle_path, mode='wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name: str) -> object:
    """Load and return the object pickled in `<name>.pickle`.

    Args:
        name: source path WITHOUT extension; '.pickle' is appended.

    Returns:
        The unpickled Python object.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    pickle_path = name + '.pickle'
    with open(pickle_path, mode='rb') as in_file:
        loaded = pickle.load(in_file)
    return loaded

In [2]:
# Load the pickled model answers for essay sets 1-10.
# load_obj appends '.pickle', so each call reads data/essaySet_<n>_model_answers.pickle.
ess_1_model_answers = load_obj("data/essaySet_1_model_answers")
ess_2_model_answers = load_obj("data/essaySet_2_model_answers")
ess_3_model_answers = load_obj("data/essaySet_3_model_answers")
ess_4_model_answers = load_obj("data/essaySet_4_model_answers")
ess_5_model_answers = load_obj("data/essaySet_5_model_answers")
ess_6_model_answers = load_obj("data/essaySet_6_model_answers")
ess_7_model_answers = load_obj("data/essaySet_7_model_answers")
ess_8_model_answers = load_obj("data/essaySet_8_model_answers")
ess_9_model_answers = load_obj("data/essaySet_9_model_answers")
ess_10_model_answers = load_obj("data/essaySet_10_model_answers")

In [4]:
def maximal_marginal_relevance(doc_embedding: np.ndarray,
        word_embeddings: np.ndarray,
        words,
        top_n = 5,
        diversity = 0.8):
    """
    Maximal Marginal Relevance algorithm for keyword extraction
    * from KeyBERT repository on github

    The word most similar to the document is picked first. Every further
    pick maximises
        (1 - diversity) * sim(word, doc) - diversity * max sim(word, picked)
    so a higher `diversity` penalises words close to already chosen ones.

    Args:
        doc_embedding (numpy.ndarray): document embedding of shape (1, dim);
            a 1-D array of shape (dim,) is reshaped to (1, dim)
        word_embeddings (numpy.ndarray): candidate embeddings, shape (N, dim)
        words (List[str]): the N candidate words, aligned with word_embeddings
        top_n (Optional[int]): number of top words to extract
        diversity (Optional[float]): diversity of top words to extract

    Returns:
        List[str]: at most top_n entries of `words` in selection order
        (no scores are returned). Empty when `words` is empty or top_n < 1.
    """
    # Nothing to choose from, or nothing requested: avoid argmax on an
    # empty similarity matrix and avoid returning an unrequested keyword.
    if top_n < 1 or len(words) == 0:
        return []

    # make sure 2d array
    if doc_embedding.ndim == 1:
        doc_embedding = doc_embedding.reshape(1, -1)

    # Similarity between each word and the document, and between all word pairs.
    # Clipped/rounded to suppress floating point noise around +-1.
    word_doc_similarity = np.array(cos_sim(word_embeddings, doc_embedding)).clip(-1, 1).round(6)
    word_similarity = np.array(cos_sim(word_embeddings, word_embeddings)).clip(-1, 1).round(6)

    # Initialize candidates and already choose best keyword/keyphrase
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Fewer words than top_n: every word has been selected already.
        if not candidates_idx:
            break

        # Relevance of each remaining candidate to the document ...
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        # ... and its redundancy: closest similarity to any selected keyword.
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate maximal_marginal_relevance
        mmr = (1 - diversity) * candidate_similarities \
            - diversity * target_similarities.reshape(-1, 1)
        best_idx = candidates_idx[int(np.argmax(mmr))]

        # Update keywords & candidates
        keywords_idx.append(best_idx)
        candidates_idx.remove(best_idx)
    return [words[idx] for idx in keywords_idx]


In [5]:
def compare_keys_div(ind,essay,top_n=5,diversities=(0.8,)):
    """
    Print the MMR keywords of one model answer for each diversity value.

    NOTE(review): relies on the notebook globals `emb_dict`,
    `model_candidate_emb` and `model_candidates`, which are not defined in
    the visible cells - they must exist in the kernel before calling this.

    Args:
        ind (int): position of the answer to inspect
        essay: essay set id; selects emb_dict[f'ess_{essay}_model_answers_emb']
        top_n (int): number of keywords to extract
        diversities (Iterable[float]): diversity values to compare

    Returns:
        None. One line "<diversity> <keywords>" is printed per diversity.
    """
    # Align the three sequences once. zip() stops at the shortest one, so
    # indexing the zipped triples keeps `ind` bounded by that shortest length.
    triples = list(zip(emb_dict[f'ess_{essay}_model_answers_emb'],
                       model_candidate_emb, model_candidates))
    doc_emb, cand_emb, cands = triples[ind]
    doc_emb = doc_emb.reshape(1, -1)

    # Only the requested answer is scored; the other answers are never printed.
    for div in diversities:
        keywords = maximal_marginal_relevance(doc_emb, cand_emb, cands,
                                              top_n=top_n, diversity=div)
        print(div, keywords)

In [6]:
# Sentence-embedding model; keys_score calls model.encode on essays and candidate phrases.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [7]:
def exact_f1_k(assigned, extracted, k):
    """
    Computes the exact match f1 measure at k.

    Only the first k extracted keyphrases are scored, so precision cannot
    exceed 1 when more than k keyphrases are supplied.

    Arguments
    ---------
    assigned  : A list of human assigned keyphrases.
    extracted : A list of extracted keyphrases (assumed to be in ranked
                order, since only the first k are used).
    k         : int
                The maximum number of extracted keyphrases.
    Returned value
    --------------
              : double
                The f1 score. 0.0 if either list is empty, k < 1,
                or no keyphrase matches exactly.
    """
    # Exit early, if one of the lists or both are empty, or if k is not a
    # usable cut-off (k == 0 would divide by zero below).
    if not assigned or not extracted or k < 1:
        return 0.0

    # Score at most the top-k extracted keyphrases.
    top_extracted = list(extracted)[:k]
    exact_matches = len(set(assigned) & set(top_extracted))
    if not exact_matches:
        return 0.0

    precision_k = exact_matches / k
    recall_k = exact_matches / len(assigned)
    return 2 * precision_k * recall_k / (precision_k + recall_k)

def partial_f1_k(assigned, extracted, k):
    """
    Computes the partial match f1 measure at k.

    Keyphrases are compared as sets of whitespace-separated words. Each of
    the first k extracted keyphrases marks the assigned keyphrase it shares
    the most words with as matched. An extracted keyphrase that shares no
    word with any assigned keyphrase matches nothing.

    Arguments
    ---------
    assigned  : A list of human assigned keyphrases.
    extracted : A list of extracted keyphrases (assumed to be in ranked
                order, since only the first k are used).
    k         : int
                The maximum number of extracted keyphrases.
    Returned value
    --------------
              : double
                The f1 score. 0.0 if either list is empty, k < 1,
                or nothing matches.
    """
    # Exit early, if one of the lists or both are empty, or if k is not a
    # usable cut-off (k == 0 would divide by zero below).
    if not assigned or not extracted or k < 1:
        return 0.0

    # Store the longest keyphrases first.
    assigned_sets = sorted(
        (set(keyword.split()) for keyword in assigned), key=len, reverse=True
    )
    extracted_sets = sorted(
        (set(keyword.split()) for keyword in list(extracted)[:k]), key=len, reverse=True
    )

    # One flag per assigned keyphrase. A flag is set at most once, so an
    # assigned keyphrase hit by several extracted ones is counted once.
    assigned_matches = [False] * len(assigned_sets)

    # For each extracted keyphrase, find the closest match,
    # which is the assigned keyphrase it has the most words in common.
    for extracted_set in extracted_sets:
        overlaps = [len(assigned_set & extracted_set) for assigned_set in assigned_sets]
        # max() returns the first maximum, i.e. the longest assigned keyphrase on ties.
        closest = max(range(len(overlaps)), key=overlaps.__getitem__)
        # Zero shared words is not a match at all.
        if overlaps[closest] > 0:
            assigned_matches[closest] = True

    # Calculate the precision and recall metrics based on the partial matches.
    partial_matches = sum(assigned_matches)
    if not partial_matches:
        return 0.0

    precision_k = partial_matches / k
    recall_k = partial_matches / len(assigned)
    return 2 * precision_k * recall_k / (precision_k + recall_k)

def f1_metric_k(assigned, extracted, k, partial_match = True):
    """
    Dispatches to partial_f1_k when `partial_match` is truthy,
    otherwise to exact_f1_k, and returns that function's result.
    """
    if partial_match:
        return partial_f1_k(assigned, extracted, k)
    return exact_f1_k(assigned, extracted, k)

In [12]:
# Sanity check of f1_metric_k (partial matching is its default) on one hand-written example.
# Alternative `assigned` list kept for experimentation:
# assigned = ['replicate experiment need', 'containers use vinegar', 'bit directions specific', 'step procedure little', 'know samples step', 'say size', 'little confusing', 'experiment want little', 'container allow replicate']
assigned = ["know what the samples", "little bit more directions and be more specific" , "size containers", "how much vinegar to add"]
extracted = ['replicate experiment need', 'want say size', 'containers use vinegar', 'procedure little', 'bit directions specific', 'need know', 'know samples step', 'add container allow', 'want little bit']
print(len(extracted))
# Alternative `extracted` list that scores the reference phrases against themselves:
# extracted = [ "know what the samples", "little bit more directions and be more specific", "size containers", "how much vinegar to add",]
# k is hard-coded to 9, the length of `extracted` printed above.
f1_metric_k(assigned, extracted, 9)

9


0.6153846153846153

In [73]:
def keys_score(assigned,essay,diversities=np.arange(0.3,1,0.05).round(3).tolist(),top_n=10):
    """
    Score MMR keyword extraction on one essay for several diversity values.

    Candidate phrases come from key_words.candidates_tokens (2- to 3-grams).
    For each diversity the MMR keywords are compared with `assigned` using
    the partial-match f1 metric, with k set to the number of keywords returned.

    Args:
        assigned (List[str]): reference keyphrases
        essay (str): essay text to extract keywords from
        diversities (Iterable[float]): MMR diversity values to evaluate
        top_n (int): number of keywords requested from MMR

    Returns:
        Dict[float, float]: diversity -> f1 score rounded to 3 decimals.
        Every score is 0.0 when the essay yields no candidate phrases.
    """
    model_candidates = key_words.candidates_tokens(essay,n_gram_range=(2,3))
    # Without candidates there is nothing to extract, so nothing can match.
    if len(model_candidates) == 0:
        return {div: 0.0 for div in diversities}

    model_candidate_emb = [model.encode(cand) for cand in model_candidates]
    # The essay embedding does not depend on the diversity: encode it once.
    essay_emb = model.encode(essay).reshape(1, -1)

    f1_metric_k_scores = {}
    for div in diversities:
        keywords = maximal_marginal_relevance(essay_emb,model_candidate_emb,model_candidates,top_n=top_n,diversity=div)
        f1_metric_k_scores[div] = round(f1_metric_k(assigned,keywords,len(keywords),partial_match=True),3)
    return f1_metric_k_scores

In [74]:
# Reference keyphrases passed as `assigned` to score_f1 together with ess_1_model_answers.
assigned_2 = ["need to determine the mass of four different samples", "name and list the samples that they plan on using",
"say to pour vinegar","how much vinegar we should pour", "rinse each sample with distilled water",
"how much water should we use and how long should we rinse."]

In [89]:
def flatten(ls_ls):
    """
    Reduce every inner list to its first element.

    Items of `ls_ls` that are lists contribute only their element 0 (the
    remaining elements are dropped); any other item is kept unchanged.
    An empty inner list raises IndexError.

    Returns a new list with one entry per item of `ls_ls`.
    """
    return [entry[0] if isinstance(entry, list) else entry for entry in ls_ls]

def score_f1(assigned,essays):
    """
    Find the best-scoring diversity for every essay.

    Args:
        assigned (List[str]): reference keyphrases, forwarded to keys_score
        essays (Iterable[str]): essay texts to evaluate

    Returns:
        List[Tuple[float, float]]: one (diversity, f1) pair per essay, in
        input order - the pair with the highest f1 among keys_score's
        results. On ties the diversity listed first by keys_score wins.
    """
    best_scores = []
    for essay in essays:
        scores_by_diversity = keys_score(assigned, essay)
        # max() returns the first maximum, so ties keep the earliest diversity.
        best_scores.append(max(scores_by_diversity.items(), key=lambda pair: pair[1]))
    return best_scores

In [86]:
# One (diversity, f1) pair per essay-set-1 model answer, scored against assigned_2.
out = score_f1(assigned_2,ess_1_model_answers)

In [90]:
# Distinct (diversity, f1) pairs with f1 above 0.5; set() drops duplicates and leaves the order arbitrary.
list(filter(lambda x: x[1] > 0.5, set(out)))

[(0.5, 0.625), (0.7, 0.625), (0.55, 0.75), (0.3, 0.625), (0.45, 0.625)]

In [91]:
# Reference keyphrases scored against the essay set 9 model answers.
assigned_9 =["has a very effective way of organizing the the article", "Once grabbing the readers attention","Space Junk?","Crash Course"]
out_9 = score_f1(assigned_9,ess_9_model_answers)
# Display the (diversity, f1) pair obtained for each answer.
out_9

[(0.4, 0.286),
 (0.4, 0.286),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.55, 0.286),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.4, 0.286),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.143),
 (0.3, 0.286),
 (0.3, 0.286)]

In [92]:
# Distinct (diversity, f1) pairs with f1 above 0.5 for essay set 9.
list(filter(lambda x: x[1] > 0.5, set(out_9)))

[]

In [93]:
# Another set of reference keyphrases.
# NOTE(review): despite the `__3` suffix these are scored against ess_1_model_answers below - confirm this is intended.
assigned__3 =["know the size","samples are put in","know tha amount of samples",
"to put in each container","obtain tha same starting mass","Another factor amount no vinegar",
"location where the samples are drying and amount of sunlight."]
out_3 = score_f1(assigned__3,ess_1_model_answers)
# Display the (diversity, f1) pair obtained for each answer.
out_3

[(0.3, 0.429),
 (0.3, 0.375),
 (0.3, 0.333),
 (0.3, 0.353),
 (0.3, 0.235),
 (0.3, 0.286),
 (0.3, 0.375),
 (0.3, 0.353),
 (0.3, 0.353),
 (0.5, 0.353),
 (0.4, 0.353),
 (0.3, 0.471),
 (0.55, 0.471),
 (0.3, 0.353),
 (0.3, 0.353),
 (0.3, 0.235),
 (0.3, 0.471),
 (0.35, 0.353),
 (0.3, 0.588),
 (0.45, 0.588),
 (0.35, 0.353),
 (0.45, 0.588)]

In [94]:
# Distinct (diversity, f1) pairs with f1 above 0.5 for the assigned__3 run.
list(filter(lambda x: x[1] > 0.5, set(out_3)))

[(0.3, 0.588), (0.45, 0.588)]

In [96]:
# Display the second model answer (index 1) of essay set 4.
ess_4_model_answers[1]

'The word "invasive" helps to create a debate in the article. Invasive species are animals that are introduced into an envoriment and thrive in it possible affecting other animals. The scientist feels the term "invasive species is unfair", referring to pythons. Biologists, however feel invasive species are major threats to biodiversity. The word "invasive" helps to provide debate on this article.'

In [97]:
# Reference keyphrases scored against the essay set 4 model answers.
# The spelling "envoriment" also appears in the model answer displayed above;
# presumably it is kept so the word overlap still matches - do not "fix" it.
assigned_4 = ["\"invasive\"", "helps to create a debate","invasive species is unfair",
    "referring to pythons."," Biologists","threats to biodiversity","an envoriment and thrive"
    ,"envoriment thrive","affecting other animals" ]
out_4 = score_f1(assigned_4,ess_4_model_answers)
# Display the (diversity, f1) pair obtained for each answer.
out_4

[(0.45, 0.316),
 (0.6, 0.632),
 (0.3, 0.211),
 (0.3, 0.211),
 (0.3, 0.316),
 (0.3, 0.211),
 (0.3, 0.211),
 (0.3, 0.211),
 (0.3, 0.211),
 (0.3, 0.211),
 (0.35, 0.421),
 (0.3, 0.211),
 (0.3, 0.211),
 (0.3, 0.316),
 (0.65, 0.316),
 (0.3, 0.211),
 (0.3, 0.211),
 (0.35, 0.316),
 (0.4, 0.316),
 (0.4, 0.316),
 (0.3, 0.211),
 (0.45, 0.421)]

In [98]:
# Distinct (diversity, f1) pairs with f1 above 0.5 for essay set 4.
list(filter(lambda x: x[1] > 0.5, set(out_4)))

[(0.6, 0.632)]

# SET DIVERSITY TO 0.65