In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so pick whichever spelling this
# installation provides instead of hard-coding the legacy one.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize
from rouge import Rouge
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import log_loss, PrecisionRecallDisplay, RocCurveDisplay
from malnis import show



In [2]:
# Load the training split from a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# from a trusted source.
train = pd.read_pickle("../data/split/labels_train.pkl")
# `show` comes from the project-local `malnis` module; presumably it prints the
# frame's shape and a short preview (see the cell output) — TODO confirm.
show(train)

(6723, 9)


Unnamed: 0,query,document,summary,r1,r2,rl,sentences,relevance,n_sentences
1081,Recent work by Zellers et al. (2018) introduce...,Transformer-based models have pushed state of ...,"[In particular, they were shown to rely on sha...",0.258675,0.089412,0.246057,[Transformer-based models have pushed state of...,"[False, False, False, False, False, False, Fal...",436
1302,Inductive transfer learning has greatly impact...,We experiment on real world invoice and resume...,[After pre-training frommassive text data with...,0.321101,0.108696,0.302752,[We experiment on real world invoice and resum...,"[False, False, False, False, False, False, Fal...",301
7921,Traditional approaches to extractive summariza...,The increasing online information has necessit...,[(Cheng and Lapata 2016) treated single docume...,0.327684,0.144068,0.305085,[The increasing online information has necessi...,"[False, False, False, False, False, False, Fal...",241
1752,Transformer architectures show significant pro...,Contextualized word representations such as th...,[There has already been evidence that token-le...,0.285714,0.104348,0.262548,[Contextualized word representations such as t...,"[True, False, False, False, False, False, Fals...",239
7860,Gender bias is an increasingly important issue...,Gender bias is the preference or prejudice tow...,[A few interdisciplinary studies (Herbelot et ...,0.273128,0.101911,0.255507,[Gender bias is the preference or prejudice to...,"[False, False, False, False, False, False, Fal...",227


In [3]:
# Load the held-out test split from a pickled DataFrame; every baseline below
# is scored on this frame.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# from a trusted source.
test = pd.read_pickle("../data/split/labels_test.pkl")
# `show` comes from the project-local `malnis` module; presumably it prints the
# frame's shape and a short preview (see the cell output) — TODO confirm.
show(test)

(2242, 9)


Unnamed: 0,query,document,summary,r1,r2,rl,sentences,relevance,n_sentences
4595,Neural ranking models for information retrieva...,CCS CONCEPTS • Information systems→ Learning t...,[power in end-to-end learning relevance matchi...,0.301587,0.13079,0.285714,[CCS CONCEPTS • Information systems→ Learning ...,"[False, False, False, False, False, False, Fal...",312
962,The dominant sequence transduction models are ...,Large pre-trained language models achieve very...,"[, 2019), a large pre-trained Transformer (Vas...",0.268456,0.067797,0.255034,[Large pre-trained language models achieve ver...,"[False, False, False, False, False, False, Fal...",199
4456,"We present JEC-QA, the largest question answer...",Legal Artificial Intelligence (LegalAI) mainly...,"[Moreover, the bar exam is a professional qual...",0.338346,0.152672,0.293233,[Legal Artificial Intelligence (LegalAI) mainl...,"[False, False, False, True, False, False, Fals...",254
5362,Deep convolutional neural networks (CNNs) have...,"Index Terms—Deep learning, visual analytics, i...",[incorporated into the model understanding pro...,0.292398,0.125475,0.274854,"[Index Terms—Deep learning, visual analytics, ...","[False, False, False, False, False, False, Fal...",497
3437,Machine learning algorithms frequently require...,ar X iv :1 90 6. 02 24 3v 1 [ cs .C L ] 5 J un...,[Analysis of hyperparameter tuning has been pe...,0.224256,0.069767,0.187643,"[ar X iv :1 90 6., 02 24 3v 1 [ cs .C L ] 5 J ...","[False, False, False, False, False, True, Fals...",130


In [4]:
# Flatten the per-document `relevance` sequences into a single 1-D label
# vector, in the same document order as `test`, so that it can be compared
# element-wise with the concatenated per-sentence scores produced below.
true_targets = np.concatenate(test["relevance"].tolist())
print("true targets", true_targets.shape)

true targets (646033,)


In [5]:
# One dict per test row (column name -> value); the scoring loops below
# iterate over these and read the "sentences" and "query" entries.
records = test.to_dict(orient="records")
len(records)

2242

# TFIDF

In [6]:
# words
#
# TF-IDF baseline over word tokens. For every test record a fresh
# TfidfVectorizer is fitted on that document's sentences only, the query is
# projected into the same vocabulary, and each sentence is scored by the dot
# product between its TF-IDF row and the query vector.

predictions = []
for d in tqdm(records):
    vectorizer = TfidfVectorizer()
    sentences = d["sentences"]
    features = vectorizer.fit_transform(sentences)
    query_emb = vectorizer.transform([d["query"]])
    # (1, n_sentences) sparse product -> dense 1-D score vector. ravel() is
    # used instead of squeeze() so that a one-sentence document still yields
    # shape (1,); squeeze() would collapse it to a 0-d array, which
    # np.concatenate refuses to join.
    relevance = (query_emb @ features.T).toarray().ravel()
    predictions.append(relevance)

true_predictions = np.concatenate(predictions)
print("true predictions", true_predictions.shape)

# The saved scores are only meaningful if they line up one-to-one with the
# flattened labels, so fail loudly before persisting a misaligned file.
assert true_predictions.shape == true_targets.shape, "predictions are not aligned with true_targets"

np.save("evaluation/tfidf_words.npy", true_predictions)

# fig, axes = plt.subplots(1, 2, figsize = (12, 6))

# display = PrecisionRecallDisplay.from_predictions(
#     true_targets, true_predictions, name = "TFIDF Words",
#     ax = axes[0]
# )

# display = RocCurveDisplay.from_predictions(
#     true_targets, true_predictions, name = "TFIDF Words",
#     ax = axes[1]
# )

  0%|          | 0/2242 [00:00<?, ?it/s]

true predictions (646033,)


In [7]:
# chars
#
# TF-IDF baseline over character trigrams. For every test record a fresh
# TfidfVectorizer is fitted on that document's sentences only, the query is
# projected into the same n-gram vocabulary, and each sentence is scored by
# the dot product between its TF-IDF row and the query vector.

predictions = []
for d in tqdm(records):
    vectorizer = TfidfVectorizer(
        analyzer = "char",
        ngram_range = (3, 3)
    )
    sentences = d["sentences"]
    features = vectorizer.fit_transform(sentences)
    query_emb = vectorizer.transform([d["query"]])
    # (1, n_sentences) sparse product -> dense 1-D score vector. ravel() is
    # used instead of squeeze() so that a one-sentence document still yields
    # shape (1,); squeeze() would collapse it to a 0-d array, which
    # np.concatenate refuses to join.
    relevance = (query_emb @ features.T).toarray().ravel()
    predictions.append(relevance)

true_predictions = np.concatenate(predictions)
print("true predictions", true_predictions.shape)

# The saved scores are only meaningful if they line up one-to-one with the
# flattened labels, so fail loudly before persisting a misaligned file.
assert true_predictions.shape == true_targets.shape, "predictions are not aligned with true_targets"

np.save("evaluation/tfidf_chars.npy", true_predictions)

# fig, axes = plt.subplots(1, 2, figsize = (12, 6))

# display = PrecisionRecallDisplay.from_predictions(
#     true_targets, true_predictions, name = "TFIDF Chars",
#     ax = axes[0]
# )

# display = RocCurveDisplay.from_predictions(
#     true_targets, true_predictions, name = "TFIDF Chars",
#     ax = axes[1]
# )

  0%|          | 0/2242 [00:00<?, ?it/s]

true predictions (646033,)


# Sentence-BERT

In [8]:
# Load the sentence encoder used by the Sentence-BERT baseline below.
# "sbert" is passed as the model name/path; presumably it is a local directory
# holding a previously downloaded checkpoint — TODO confirm.
# The commented-out arguments are earlier alternatives (hub model name and
# cache locations) kept for reference.
model = SentenceTransformer(
    "sbert"
#     'all-MiniLM-L6-v2', 
#     cache_folder = "../assets"
#     "../cache/huggingface/transformers/"
#     cache_folder = "../cache/huggingface/transformers"
)
# Bare last expression: displays the module stack of the loaded model.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [9]:
# sentence-bert
#
# Dense-embedding baseline. Each document's sentences and its query are
# encoded with the SentenceTransformer loaded above, and every sentence is
# scored by the dot product between its embedding and the query embedding.
# (The model printed above ends in a Normalize() layer, so this presumably
# equals cosine similarity — TODO confirm.)

predictions = []
for d in tqdm(records):
    sentences = d["sentences"]
    features = model.encode(sentences)
    query_emb = model.encode([d["query"]])
    # (1, dim) @ (dim, n_sentences) -> 1-D score vector. ravel() is used
    # instead of squeeze() so that a one-sentence document still yields
    # shape (1,); squeeze() would collapse it to a 0-d array, which
    # np.concatenate refuses to join.
    relevance = (query_emb @ features.T).ravel()
    predictions.append(relevance)

true_predictions = np.concatenate(predictions)
print("true predictions", true_predictions.shape)

# The saved scores are only meaningful if they line up one-to-one with the
# flattened labels, so fail loudly before persisting a misaligned file.
assert true_predictions.shape == true_targets.shape, "predictions are not aligned with true_targets"

np.save("evaluation/sentence_bert.npy", true_predictions)

# fig, axes = plt.subplots(1, 2, figsize = (12, 6))

# display = PrecisionRecallDisplay.from_predictions(
#     true_targets, true_predictions, name = "Sentence-BERT",
#     ax = axes[0]
# )

# display = RocCurveDisplay.from_predictions(
#     true_targets, true_predictions, name = "Sentence-BERT",
#     ax = axes[1]
# )

  0%|          | 0/2242 [00:00<?, ?it/s]

true predictions (646033,)
