In [3]:
import pyterrier as pt

if not pt.started():
    pt.init(tqdm="notebook")

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/irtaza.hashmi@futurice.com/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

We'll use the `nfcorpus` dataset again, as before. In this notebook, we'll use a subset of the queries (`nontopic`). The only reason for this is that it makes the computations faster.


In [5]:
dataset = pt.get_dataset("irds:nfcorpus")

In [6]:
from gensim.models import Word2Vec
from pathlib import Path

#DATASET = pt.datasets.get_dataset("irds:antique/test/non-offensive")
DATASET = pt.get_dataset('irds:antique/train/split200-train')

IDX_PATH = Path("index").absolute()
if not (IDX_PATH / "data.properties").is_file():
    pt.index.IterDictIndexer(
        str(IDX_PATH),
        meta={
            "docno": 32,
            "text": 131072,
        },
    ).index(DATASET.get_corpus_iter())

## Word2Vec


In [7]:
# used for ranking - original corpus
tokenized_dict = {}
for doc in DATASET.get_corpus_iter():
  tokenized_dict[doc['docno']] = doc['text']

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

In [8]:
from gensim.models import KeyedVectors
import numpy as np
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    """
    Compute the cosine similarity between two vectors.
    """
    return dot(a, b) / (norm(a) * norm(b))

def pad_vectors(vec1, vec2):
    """
    Pad the shorter vector with zeros to match the length of the longer vector.
    """
    len_diff = len(vec1) - len(vec2)
    if len_diff > 0:
        vec2 = np.pad(vec2, (0, len_diff))
    elif len_diff < 0:
        vec1 = np.pad(vec1, (0, -len_diff))
    return vec1, vec2


def vectorize_document(document, model):
    """
    Convert a document into a vector by averaging the vectors of its words.
    """
    words = document.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)


def rank_documents(query, documents_dict, model):
    """
    Rank documents based on their similarity to the query.
    """
    query_vector = vectorize_document(query, model)
    # document_vectors = [vectorize_document(doc, model) for doc in documents]
    document_vectors = {}

    for key, value in documents_dict.items():
      document_vectors[key] = vectorize_document(value, model)
    #q, d = pad_vectors(query_vector, document_vectors)
    # similarities = [np.dot(pad_vectors(query_vector, doc_vector)) for doc_vector in document_vectors]
    sim = {}
    for key, value in document_vectors.items():

      q, d = pad_vectors(query_vector, value)
      sim[key] = cosine_similarity(q, d)

    # ranked_indices = np.argsort(sim)[::-1]
    return sorted(sim.items(), key=lambda item: item[1], reverse=True)

In [9]:
from nltk.tokenize import word_tokenize

# get text
corpus = [doc['text'] for doc in DATASET.get_corpus_iter()]
# tokenize
tokenized_corpus = [word_tokenize(doc) for doc in corpus]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# train the model
tfidf_model = TfidfVectorizer()
tfidf_vector = tfidf_model.fit_transform(corpus)

In [14]:
# calculate idf per word
idf_per_word = dict(zip(tfidf_model.get_feature_names_out(), tfidf_model.idf_))

# filter based on threshold
threshold = 0.5

filtered_corpus = []
for doc in tokenized_corpus:
  filtered_doc = []
  for word in doc:
    if word in idf_per_word:
      if idf_per_word[word] >= threshold:
        filtered_doc.append(word)
  filtered_corpus.append(filtered_doc)

In [15]:
# preprocessing
# remove stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
filtered_corpus = [[word for word in doc if word not in stop_words] for doc in filtered_corpus]

In [16]:
len(filtered_corpus)

403666

In [17]:
epochs = 25
# Dimensionality of the feature vectors
vector_size = 100 #300
# The maximum distance between the current and predicted word within a sentence
window = 5
# Ignores all words with total absolute frequency lower than this
mincount = 1

model = Word2Vec(filtered_corpus, vector_size=vector_size, window=window, min_count=mincount, sg=1)
model.train(filtered_corpus, total_examples=len(filtered_corpus), epochs=epochs)

(168273716, 173519175)

In [18]:
# save the model
model_name = "word2vec_0.5_stopwords.model"
model.save(model_name)

In [4]:
# load the model
folder = "models"
model = Word2Vec.load(f"{folder}/word2vec_0.2_stopwords.model")

NameError: name 'Word2Vec' is not defined

In [16]:
import pandas as pd

# Example usage
query = "what causes severe swelling and pain in the knees" # qid=3097310
N_TOP = 100

ranked_documents = rank_documents(query, tokenized_dict, model)
word2vec_df = pd.DataFrame(columns=['docno', 'text', 'rank', 'score', 'query'])


for i, (key, value) in enumerate(ranked_documents):
  if (i > N_TOP):
    break
  for doc in DATASET.get_corpus_iter():

    if (doc['docno'] == key):
      new_row = {'docno': doc['docno'], 'text': doc['text'], 'rank': i, 'score': value, 'query': query}
      new_row = pd.DataFrame([new_row])

      # word2vec_df = pd.concat([word2vec_df, d], ignore_index=True)
      word2vec_df = pd.concat([word2vec_df, new_row], ignore_index=True)
      break

word2vec_df

  return dot(a, b) / (norm(a) * norm(b))


antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

  word2vec_df = pd.concat([word2vec_df, new_row], ignore_index=True)


antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-train documents:   0%|          | 0/403666 [00:00<?, ?it/s]

Unnamed: 0,docno,text,rank,score,query
0,2308262_1,Experimentally induced central hyperalgesia: l...,0,0.761387,what causes severe swelling and pain in the knees
1,2308397_2,a fever can mean the difference between a cold...,1,0.750863,what causes severe swelling and pain in the knees
2,2308397_1,"Any illness with those symptoms, especially fl...",2,0.747790,what causes severe swelling and pain in the knees
3,2519839_0,Premenstrual syndrome (PMS) is a group of symp...,3,0.735604,what causes severe swelling and pain in the knees
4,4364967_0,Yawning squeezes the facial muscles around the...,4,0.727343,what causes severe swelling and pain in the knees
...,...,...,...,...,...
96,176362_2,"Look to your right, look to your left two of y...",96,0.516192,what causes severe swelling and pain in the knees
97,1904608_1,Eggnog has been the traditional Christmas drin...,97,0.514110,what causes severe swelling and pain in the knees
98,2142220_0,Knock on the watermelon (just like you would k...,98,0.510388,what causes severe swelling and pain in the knees
99,850514_1,"Beauty / cuteness is extremely subjective, but...",99,0.505584,what causes severe swelling and pain in the knees


### Get evaluation dataset

In [22]:
def pad_vectors(vec1, vec2):
    """
    Pad the shorter vector with zeros to match the length of the longer vector.
    """
    len_diff = len(vec1) - len(vec2)
    if len_diff > 0:
        vec2 = np.pad(vec2, (0, len_diff))
    elif len_diff < 0:
        vec1 = np.pad(vec1, (0, -len_diff))
    return vec1, vec2


def vectorize_document(document, model):
    """
    Convert a document into a vector by averaging the vectors of its words.
    """
    words = document.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

## updated rank documents that makes ranking faster
def rank_documents(query, document_vectors, model):
    """
    Rank documents based on their similarity to the query.
    """
    query_vector = vectorize_document(query, model)
    # document_vectors = [vectorize_document(doc, model) for doc in documents]
    # document_vectors = {}

    # for key, value in documents_dict.items():
    #   document_vectors[key] = vectorize_document(value, model)
    #q, d = pad_vectors(query_vector, document_vectors)
    # similarities = [np.dot(pad_vectors(query_vector, doc_vector)) for doc_vector in document_vectors]
    sim = {}
    for key, value in document_vectors.items():

      q, d = pad_vectors(query_vector, value)
      sim[key] = cosine_similarity(q, d)

    # ranked_indices = np.argsort(sim)[::-1]
    return sorted(sim.items(), key=lambda item: item[1], reverse=True)

In [31]:
# load the model
folder = "models"
model = Word2Vec.load(f"{folder}/word2vec_0.5_stopwords.model")

In [23]:
# load the model
#model = Word2Vec.load(f"word2vec_0.5_stopwords.model")

In [24]:
DATASET = pt.get_dataset('irds:antique/train/split200-valid')

f = list(DATASET.get_corpus_iter())

antique/train/split200-valid documents:   0%|          | 0/403666 [00:00<?, ?it/s]

In [25]:
tokenized_dict = {}
for doc in DATASET.get_corpus_iter():
  tokenized_dict[doc['docno']] = doc['text']

antique/train/split200-valid documents:   0%|          | 0/403666 [00:00<?, ?it/s]

In [26]:
data_dict = {} ####creates a dictionary of thew corpus making it faster to access

for L in f:
  data_dict[L['docno']] = L['text']

In [27]:
document_vectors_1 = {}
for key, value in tokenized_dict.items():
      document_vectors_1[key] = vectorize_document(value, model)

document_vectors_2 = {}
for key, value in tokenized_dict.items():
      document_vectors_2[key] = vectorize_document(value, model)

tokenized_dict = {}
for doc in DATASET.get_corpus_iter():
  tokenized_dict[doc['docno']] = doc['text']

data = list(DATASET.get_corpus_iter())

antique/train/split200-valid documents:   0%|          | 0/403666 [00:00<?, ?it/s]

antique/train/split200-valid documents:   0%|          | 0/403666 [00:00<?, ?it/s]

In [28]:
antique_test = DATASET.get_topics()
antique_test

Unnamed: 0,qid,query
0,1907320,how do i get college money
1,3884772,how about a bumper sticker thats says
2,348777,why would a child s psychologist recommend hos...
3,397709,how and what court should i take a driver who ...
4,1398838,what does crunching numbers mean
...,...,...
195,993601,why were black women abused sexually after the...
196,4069320,athiests how did you find the light
197,2897078,how to get rid of stript throat
198,1034050,why does the earth keep getting hotter and hot...


In [29]:
import pandas as pd

# Example usage
# query = "what causes severe swelling and pain in the knees" #qid=3097310
N_TOP = 499
word2vec_df = pd.DataFrame(columns=['docno', 'text', 'rank', 'score', 'query'])
for key, query in antique_test.iterrows():
  print(key)
  
  ranked_documents = rank_documents(query['query'], document_vectors_1, model)

  for i, (key, value) in enumerate(ranked_documents):
    if (i > N_TOP):
      break

    new_row = {'docno': key, 'text': data_dict[key], 'rank': i, 'score': value, 'query': query['query'], 'qid': query['qid']}
    new_row = pd.DataFrame([new_row])

    # word2vec_df = pd.concat([word2vec_df, d], ignore_index=True)
    word2vec_df = pd.concat([word2vec_df, new_row], ignore_index=True)


word2vec_df

0


  return dot(a, b) / (norm(a) * norm(b))
  word2vec_df = pd.concat([word2vec_df, new_row], ignore_index=True)
  return dot(a, b) / (norm(a) * norm(b))


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


Unnamed: 0,docno,text,rank,score,query,qid
0,176362_0,never heard of a college of kicking out a hard...,0,0.838133,how do i get college money,1907320
1,2175566_1,Most colleges look at GPA and class rank. Als...,1,0.786512,how do i get college money,1907320
2,638201_6,When we were looking to purchase our business....,2,0.776339,how do i get college money,1907320
3,2175566_0,I used to work in a college admissions office ...,3,0.772424,how do i get college money,1907320
4,3039702_12,"well as of right now, if you ""reenlist"" in Ira...",4,0.764197,how do i get college money,1907320
...,...,...,...,...,...,...
99995,1781052_1,"Furthermore, I'd like to talk about getting fa...",495,0.602334,how did african american women get the right t...,2573745
99996,760180_3,You probably need new brake shoes on the front...,496,0.597153,how did african american women get the right t...,2573745
99997,2847170_2,I'm trying to avoid sneaking a few candies too...,497,0.591593,how did african american women get the right t...,2573745
99998,812049_3,I used to be a Missionary for the Church. Ther...,498,0.590357,how did african american women get the right t...,2573745


In [30]:
# save as csv
word2vec_df.to_csv('word2vec_05_test_results.csv', index=False)