In [1]:
import pandas as pd
import numpy as np
from malnis import show
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score, roc_auc_score

In [2]:
data = pd.read_pickle("../data/sentence_labels.pkl")\
.reset_index(drop = True)\
[["query", "sentences", "summary"]]\
.assign(summary = lambda df: df.summary.map(lambda x: sent_tokenize(re.sub(r"\.(?=[A-Z])", ". ", x[0]))))\

show(data)

(8965, 3)


Unnamed: 0,query,sentences,summary
0,We introduce a new language representation mod...,"[KEYWORDS cascade ranking, pre-trained languag...",[Our approach is mainly based on the BERT lang...
1,The dominant sequence transduction models are ...,"[KEYWORDS cascade ranking, pre-trained languag...",[BERT [2] is a self-supervised approach for pr...
2,Language model pretraining has led to signific...,"[KEYWORDS cascade ranking, pre-trained languag...","[Recently, some variants [4, 12] of BERT langu..."
3,With the capability of modeling bidirectional ...,"[KEYWORDS cascade ranking, pre-trained languag...","[Recently, some variants [4, 12] of BERT langu..."
4,Neural sequence-to-sequence models have provid...,"[KEYWORDS cascade ranking, pre-trained languag...",[The proposed model is based on the pointer-ge...


In [3]:
df = data.sentences#.head(10)
clean_sentences = [[re.sub(r"\W", "", x.lower()) for x in l] for l in tqdm(df)]
clean_sentences[0][:10]

  0%|          | 0/8965 [00:00<?, ?it/s]

['keywordscascaderankingpretrainedlanguagemodeldocumentexpansionsequencetosequencegenerationacmreferenceformatmingyanchenlianglichenwubinbiweiwangjiangnanxialuosi',
 '2020',
 'idstattrec2019deeplearningtrackdeepcascaderankingwithgenerationbaseddocumentexpansionandpretrainedlanguagemodeling',
 'inproceedingsofacmconferenceconference17',
 'acmnewyorknyusa4pages',
 'httpsdoiorg101145nnnnnnnnnnnnnnthedeeplearningtrackisanewtrackfirstrunintrec2019whichaimsatstudyinginformationretrievalinalargetrainingdataregime',
 'itconsistsoftwotaskspassagerankinganddocumentranking',
 'bothtasksusealargehumangeneratedsetoftraininglabelsfromthemsmarco1dataset',
 'thepassagerankingtaskfocusesonrankingpassageswhereitcontains1010916queriesonacollectionof8841823passages',
 'thedocumentrankingtaskis1httpwwwmsmarcoorgpermissiontomakedigitalorhardcopiesofallorpartofthisworkforpersonalorclassroomuseisgrantedwithoutfeeprovidedthatcopiesarenotmadeordistributedforprofitorcommercialadvantageandthatcopiesbearthisnotice

In [4]:
data.summary.map(len).value_counts()

1     6007
2     1602
3      573
4      288
5      151
6       96
7       77
8       36
9       35
11      21
10      19
12       9
13       8
14       6
15       5
16       4
18       4
20       2
30       2
19       2
32       2
21       2
28       2
49       1
71       1
69       1
25       1
24       1
31       1
23       1
17       1
42       1
74       1
84       1
54       1
Name: summary, dtype: int64

In [5]:
df = data.summary#.head(10)
clean_summary = [[re.sub(r"\W", "", x.lower()) for x in l] for l in tqdm(df)]
clean_summary[0][:10]

  0%|          | 0/8965 [00:00<?, ?it/s]

['ourapproachismainlybasedonthebertlanguagemodel2whichisastateoftheartmodelinvariousnaturallanguageunderstandingtasks',
 'bert2isaselfsupervisedapproachforpretrainingadeeptransformerencoder8beforefinetuningitforaparticulardownstreamtask']

In [6]:
pd.Series(clean_summary).map(len).value_counts()

1     6007
2     1602
3      573
4      288
5      151
6       96
7       77
8       36
9       35
11      21
10      19
12       9
13       8
14       6
15       5
16       4
18       4
20       2
30       2
19       2
32       2
21       2
28       2
49       1
71       1
69       1
25       1
24       1
31       1
23       1
17       1
42       1
74       1
84       1
54       1
dtype: int64

In [7]:
data.sentences.map(len).sum()

3173352

In [8]:
relevance = [
    [True if s in summ else False for s in sents] 
    for sents, summ in zip(tqdm(clean_sentences), clean_summary)
]
sum([len(x) for x in relevance])

  0%|          | 0/8965 [00:00<?, ?it/s]

3173352

In [9]:
data = data.assign(relevance = relevance)
show(data)

(8965, 4)


Unnamed: 0,query,sentences,summary,relevance
0,We introduce a new language representation mod...,"[KEYWORDS cascade ranking, pre-trained languag...",[Our approach is mainly based on the BERT lang...,"[False, False, False, False, False, False, Fal..."
1,The dominant sequence transduction models are ...,"[KEYWORDS cascade ranking, pre-trained languag...",[BERT [2] is a self-supervised approach for pr...,"[False, False, False, False, False, False, Fal..."
2,Language model pretraining has led to signific...,"[KEYWORDS cascade ranking, pre-trained languag...","[Recently, some variants [4, 12] of BERT langu...","[False, False, False, False, False, False, Fal..."
3,With the capability of modeling bidirectional ...,"[KEYWORDS cascade ranking, pre-trained languag...","[Recently, some variants [4, 12] of BERT langu...","[False, False, False, False, False, False, Fal..."
4,Neural sequence-to-sequence models have provid...,"[KEYWORDS cascade ranking, pre-trained languag...",[The proposed model is based on the pointer-ge...,"[False, False, False, False, False, False, Fal..."


In [10]:
all(data.relevance.map(len) == data.sentences.map(len))

True

In [11]:
data.relevance.map(any).value_counts()

True     6292
False    2673
Name: relevance, dtype: int64

In [12]:
data.relevance.map(any).sort_values()

1816    False
2241    False
2242    False
2243    False
2244    False
        ...  
3630     True
3629     True
3628     True
3641     True
8964     True
Name: relevance, Length: 8965, dtype: bool

In [13]:
filt = data.relevance.map(any)

In [14]:
new = data.loc[filt]
show(new)

(6292, 4)


Unnamed: 0,query,sentences,summary,relevance
0,We introduce a new language representation mod...,"[KEYWORDS cascade ranking, pre-trained languag...",[Our approach is mainly based on the BERT lang...,"[False, False, False, False, False, False, Fal..."
1,The dominant sequence transduction models are ...,"[KEYWORDS cascade ranking, pre-trained languag...",[BERT [2] is a self-supervised approach for pr...,"[False, False, False, False, False, False, Fal..."
2,Language model pretraining has led to signific...,"[KEYWORDS cascade ranking, pre-trained languag...","[Recently, some variants [4, 12] of BERT langu...","[False, False, False, False, False, False, Fal..."
3,With the capability of modeling bidirectional ...,"[KEYWORDS cascade ranking, pre-trained languag...","[Recently, some variants [4, 12] of BERT langu...","[False, False, False, False, False, False, Fal..."
4,Neural sequence-to-sequence models have provid...,"[KEYWORDS cascade ranking, pre-trained languag...",[The proposed model is based on the pointer-ge...,"[False, False, False, False, False, False, Fal..."


In [15]:
train, test = train_test_split(new)
train.shape, test.shape

((4719, 4), (1573, 4))

In [16]:
rel = np.concatenate(test.relevance.tolist())
rel.shape

(557897,)

In [18]:
%%time

corpus = train.sentences.sum()
len(corpus)

CPU times: user 46.7 s, sys: 169 ms, total: 46.9 s
Wall time: 46.9 s


1754236

# words

In [19]:
vectorizer_words = TfidfVectorizer()
vectorizer_words.fit(corpus)

In [20]:
sentence_features_words = [vectorizer_words.transform(s) for s in tqdm(test.sentences)]
sentence_features_words[0].shape

  0%|          | 0/1573 [00:00<?, ?it/s]

(232, 65457)

In [None]:
# summary_features_words = [vectorizer_words.transform(s) for s in tqdm(test.summary)] 
# summary_features_words[0].shape

In [None]:
# scores_words = [
#     (sents @ summ.T).max(1).toarray().squeeze() 
#     for sents, summ in zip(tqdm(sentence_features_words), summary_features_words)
# ]
# scores_words[0].shape

In [22]:
query_features_words = [vectorizer_words.transform([s]) for s in tqdm(test["query"])]

len(query_features_words), query_features_words[0].shape 

  0%|          | 0/1573 [00:00<?, ?it/s]

(1573, (1, 65457))

In [24]:
scores_words = [
    (sents @ q.T).toarray().squeeze() 
    for sents, q in zip(tqdm(sentence_features_words), query_features_words)
]
len(scores_words), scores_words[0].shape

  0%|          | 0/1573 [00:00<?, ?it/s]

(1573, (232,))

In [25]:
test.sentences.map(len).sum()

557897

In [26]:
final_scores_words = np.concatenate(scores_words)
final_scores_words.shape

(557897,)

In [27]:
average_precision_score(rel, final_scores_words)

0.035980217775553235

In [28]:
roc_auc_score(rel, final_scores_words)

0.7299003529790372

# chars

In [30]:
vectorizer_chars = TfidfVectorizer(analyzer = "char", ngram_range = (3, 3))
vectorizer_chars.fit(corpus)

In [31]:
sentence_features_chars = [vectorizer_chars.transform(s) for s in tqdm(test.sentences)]
sentence_features_chars[0].shape

  0%|          | 0/1573 [00:00<?, ?it/s]

(232, 98972)

In [None]:
# summary_features_chars = [vectorizer_chars.transform(s) for s in tqdm(test.summary)] 
# summary_features_chars[0].shape

In [None]:
# scores_chars = [
#     (sents @ summ.T).max(1).toarray().squeeze() 
#     for sents, summ in zip(tqdm(sentence_features_chars), summary_features_chars)
# ]
# scores_chars[0].shape

In [32]:
query_features_chars = [vectorizer_chars.transform([s]) for s in tqdm(test["query"])] 
query_features_chars[0].shape

  0%|          | 0/1573 [00:00<?, ?it/s]

(1, 98972)

In [34]:
scores_chars = [
    (sents @ q.T).toarray().squeeze() 
    for sents, q in zip(tqdm(sentence_features_chars), query_features_chars)
]
scores_chars[0].shape

  0%|          | 0/1573 [00:00<?, ?it/s]

(232,)

In [35]:
test.sentences.map(len).sum()

557897

In [36]:
final_scores_chars = np.concatenate(scores_chars)
final_scores_chars.shape

(557897,)

In [37]:
average_precision_score(rel, final_scores_chars)

0.024109982269947507

In [38]:
roc_auc_score(rel, final_scores_chars)

0.7037490476348061

# queries

In [39]:
i = 10
list(zip(test.relevance.tolist()[i], scores_words[i].round(2)))

[(False, 0.19),
 (False, 0.06),
 (False, 0.0),
 (False, 0.03),
 (False, 0.12),
 (False, 0.09),
 (False, 0.13),
 (False, 0.05),
 (False, 0.07),
 (False, 0.02),
 (False, 0.01),
 (False, 0.02),
 (False, 0.03),
 (False, 0.05),
 (False, 0.08),
 (False, 0.05),
 (False, 0.02),
 (True, 0.17),
 (False, 0.15),
 (False, 0.15),
 (False, 0.07),
 (False, 0.13),
 (False, 0.08),
 (False, 0.07),
 (False, 0.18),
 (False, 0.22),
 (False, 0.12),
 (False, 0.26),
 (False, 0.17),
 (False, 0.17),
 (False, 0.12),
 (False, 0.04),
 (False, 0.01),
 (False, 0.02),
 (False, 0.13),
 (False, 0.03),
 (False, 0.04),
 (False, 0.14),
 (False, 0.04),
 (False, 0.04),
 (False, 0.08),
 (False, 0.03),
 (False, 0.09),
 (False, 0.05),
 (False, 0.13),
 (False, 0.06),
 (False, 0.13),
 (False, 0.04),
 (False, 0.2),
 (False, 0.04),
 (False, 0.05),
 (False, 0.12),
 (False, 0.23),
 (False, 0.11),
 (False, 0.13),
 (False, 0.12),
 (False, 0.15),
 (False, 0.04),
 (False, 0.04),
 (False, 0.1),
 (False, 0.11),
 (False, 0.05),
 (False, 0.1

In [40]:
i = 30
list(zip(test.relevance.tolist()[i], scores_chars[i].round(2)))

[(False, 0.22),
 (False, 0.08),
 (False, 0.15),
 (False, 0.13),
 (False, 0.2),
 (False, 0.09),
 (False, 0.08),
 (False, 0.15),
 (False, 0.06),
 (False, 0.03),
 (False, 0.0),
 (False, 0.05),
 (False, 0.09),
 (False, 0.17),
 (False, 0.11),
 (False, 0.12),
 (False, 0.27),
 (False, 0.0),
 (False, 0.13),
 (False, 0.27),
 (False, 0.14),
 (False, 0.16),
 (True, 0.18),
 (False, 0.19),
 (False, 0.31),
 (False, 0.12),
 (False, 0.02),
 (False, 0.13),
 (False, 0.03),
 (False, 0.15),
 (False, 0.13),
 (False, 0.15),
 (False, 0.06),
 (False, 0.15),
 (False, 0.21),
 (False, 0.05),
 (False, 0.09),
 (False, 0.13),
 (False, 0.17),
 (False, 0.25),
 (False, 0.31),
 (False, 0.13),
 (False, 0.22),
 (False, 0.19),
 (False, 0.04),
 (False, 0.12),
 (False, 0.12),
 (False, 0.09),
 (False, 0.11),
 (False, 0.15),
 (False, 0.17),
 (False, 0.12),
 (False, 0.08),
 (False, 0.02),
 (False, 0.17),
 (False, 0.21),
 (False, 0.16),
 (False, 0.13),
 (False, 0.14),
 (False, 0.13),
 (False, 0.11),
 (False, 0.17),
 (False, 0.1

In [41]:
any(relevance[30])

False