# Libraries and Data

In [1]:
# Import libraries
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Download some models from nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Sample documents about sailing in Croatia
documents = ['Sailing in Croatia often includes visiting UNESCO World Heritage sites.',
 'The Makarska Riviera is known for its stunning coastline and sailing opportunities.',
 'Sailing in Croatia offers stunning views of the Adriatic Sea.',
 "The city of Sibenik is home to the impressive St. James's Cathedral, a UNESCO World Heritage site.",
 'The island of Brač is known for its beautiful beaches and great sailing conditions.',
 'Sailing to the island of Rab, known for its medieval old town, is a great experience.',
 'The Pakleni Islands near Hvar are a popular spot for sailing yachts.',
 'The island of Cres is one of the largest in Croatia and a great destination for sailing.',
 'The island of Cres is one of the largest in Croatia and has a diverse wildlife.',
 'The city of Zagreb is the capital of Croatia and offers a mix of modern and historic attractions.',
 'Sailors can experience the traditional Dalmatian way of life in many coastal villages.',
 'The city of Knin is known for its historic fortress and beautiful scenery.',
 'The island of Krk is the largest island in the Adriatic Sea.',
 'Croatia has a rich history dating back to the Roman Empire.',
 'The island of Hvar is a popular destination for celebrities and high-end travelers.',
 'The city of Varazdin is known for its baroque buildings and vibrant cultural scene.',
 'Sailors can enjoy fresh seafood at many coastal restaurants in Croatia.',
 'The coastal town of Senj is known for its carnival and Nehaj Fortress.',
 'The Dalmatian Coast is a famous region for sailing in Croatia.',
 "The Diocletian's Palace in Split is one of the most famous Roman ruins in Croatia.",
 'The island of Dugi Otok is known for its dramatic cliffs and beautiful sailing waters.',
 'The island of Rab is famous for its medieval old town and stunning beaches.',
 'The island of Mljet is home to a national park and is ideal for nature lovers.',
 'Sailors in Croatia can visit ancient Roman ruins in Split.',
 'The city of Kotor, just across the border in Montenegro, is a popular extension for Croatian sailing trips.',
 'The island of Korcula is believed to be the birthplace of Marco Polo.',
 'The town of Motovun in Istria is famous for its film festival.',
 'The city of Nin is known for its ancient salt pans and historic church.',
 'The city of Vukovar is known for its role in the Croatian War of Independence.',
 'Sailing to the island of Lošinj, known for its health tourism, is a relaxing experience.',
 'The island of Hvar is known for its vibrant nightlife and sailing opportunities.',
 'The island of Korčula is believed to be the birthplace of Marco Polo and is a popular sailing destination.',
 'Sailing in Croatia often involves stopping at picturesque fishing villages.',
 'The city of Opatija is known for its grand villas and seaside promenade.',
 'The city of Porec is known for the Euphrasian Basilica, a UNESCO World Heritage site.',
 'The island of Brac is home to the famous Zlatni Rat beach, known for its changing shape.',
 'Sailing around the island of Mljet offers a peaceful and scenic experience.',
 'The Blue Cave on Biševo Island is a must-see for sailors.',
 'The island of Pag is famous for its cheese and nightlife, and is a fun stop for sailors.',
 "The island of Lastovo is one of Croatia's most remote and tranquil destinations.",
 'Sailors can enjoy snorkeling in the clear waters of the Adriatic Sea.',
 'The city of Rovinj is a charming starting point for a sailing trip in Croatia.',
 "Croatia's national parks, like Krka and Plitvice, are ideal for hiking and nature lovers.",
 'The city of Zadar is famous for its sunsets, which sailors can enjoy from the sea.',
 'Croatia has over a thousand islands, each with its unique charm.',
 "Croatia's Adriatic coast is dotted with charming fishing villages.",
 'Sailing to the island of Šolta offers a quiet escape from the more touristy areas.',
 'Sailors in Croatia can explore over 1,200 islands.',
 'Sailing in Croatia provides opportunities to visit ancient fortresses and castles along the coast.',
 'Croatia is famous for its beautiful coastline and crystal-clear waters.',
 'The island of Krk is accessible by bridge and is a popular starting point for sailing trips.',
 'Sailing in Croatia is best enjoyed during the summer months.',
 'The island of Pag is famous for its cheese, which is considered a delicacy.',
 'The medieval town of Rovinj is one of the most picturesque places in Croatia.',
 'The island of Losinj is famous for its health tourism and clean air.',
 'The city of Dubrovnik, with its famous city walls, is a top destination for sailors.',
 'Croatia has beautiful islands perfect for sailing.',
 'Sailing in Croatia allows you to explore hidden coves and bays.',
 'The city of Rijeka is an important cultural and economic center in Croatia.',
 'The island of Mljet has a national park that is perfect for exploring by sailboat.',
 'The Istrian Peninsula is famous for its truffles and gourmet food.',
 'The town of Cavtat is a quieter alternative to nearby Dubrovnik.',
 'The Peljesac Peninsula is known for its vineyards and wine production.',
 'The Dubrovnik Summer Festival is a major cultural event featuring theater, music, and dance performances.',
 'The city of Zadar is famous for its unique Sea Organ, an architectural sound art object.',
 'The beaches in Croatia are among the best in Europe, with many receiving Blue Flag status.',
 'Sailing around the island of Murter gives access to the Kornati Islands National Park.',
 'The city of Trogir, with its historic architecture, is a great place to dock.',
 'Sailing from Split to Dubrovnik offers breathtaking coastal scenery.',
 'The Plitvice Lakes National Park is a UNESCO World Heritage site known for its stunning waterfalls and lakes.',
 'The city of Hvar is one of the sunniest places in Europe and a popular sailing hub.',
 "The city of Dubrovnik is often called the 'Pearl of the Adriatic'.",
 'The city of Osijek is located in the eastern part of Croatia and is known for its Baroque style.',
 'The city of Pula, with its Roman amphitheater, is a unique sailing destination.',
 'The town of Trogir is a UNESCO World Heritage site known for its medieval architecture.',
 'Sailors in Croatia can enjoy local wines at many coastal vineyards.',
 'Croatia has a Mediterranean climate, making it a great destination year-round.',
 'Croatia has a diverse cultural heritage, with influences from Italy, Hungary, and Austria.',
 'The city of Rijeka is an important cultural and historical sailing destination.',
 'The best sailing routes in Croatia include Dubrovnik and Split.',
 'The Brijuni Islands are a national park and a former presidential retreat.',
 'Sailing to the Elaphiti Islands offers a mix of natural beauty and cultural sites.',
 'Croatia is known for its delicious seafood cuisine.',
 'The ancient city of Pula is known for its well-preserved Roman amphitheater.',
 'The city of Opatija is a historical seaside resort that welcomes sailors.',
 'The waters around Croatia are known for being calm and clear, ideal for sailing.',
 'The Kornati Islands National Park is a popular sailing destination in Croatia.',
 "Sailing around the Brijuni Islands offers a glimpse of Croatia's natural beauty and wildlife.",
 'The island of Vis was a military base and was closed to tourism until the 1990s.',
 'The city of Split is a major port and gateway to the Dalmatian islands.',
 'Sailing to the island of Vis provides access to the famous Blue Cave.',
 "The city of Buzet in Istria is known as the 'City of Truffles'.",
 'The city of Karlovac is known for its parks and the rivers that flow through it.',
 'Sailing to the island of Vis allows you to experience a more remote part of Croatia.',
 'The city of Šibenik is a UNESCO World Heritage site and a great stop for sailors.',
 "The town of Samobor is famous for its traditional cream cake called 'kremsnita'.",
 'Sailing around the island of Lastovo provides a more secluded experience.',
 "Croatia's wine regions produce some excellent wines, especially in Istria and Dalmatia.",
 'Many sailors start their Croatian adventure from the city of Zadar.',
 'The coastal town of Makarska is known for its beautiful beaches and lively nightlife.']

In [4]:
from google.colab import drive
drive.mount('/content/drive/')
%cd '/content/drive/MyDrive/RAGTraining/RagTrainingZM/'

Mounted at /content/drive/
/content/drive/MyDrive/RAGTraining/RagTrainingZM


# Tokenization

In [5]:
# Sample text
text = "Sailing in Croatia between Split and Zadar is fantastic?"
text_lower = text.lower()

In [6]:
# Tokenize into sentences
nltk.sent_tokenize(text_lower)

['sailing in croatia between split and zadar is fantastic?']

A sentence is defined when there is a '.' and an empty space after

In [7]:
# Tokenization into words
nltk.word_tokenize(text_lower)

['sailing',
 'in',
 'croatia',
 'between',
 'split',
 'and',
 'zadar',
 'is',
 'fantastic',
 '?']

# Pre-processing



*   We don't capitalize. I would query 'croatia sailing' and not "Croatia Sailing'
*   croatia is different than Croatia
*   We don't really add punctuation





In [8]:
def preprocess(text):

  text_lower = text.lower()

  # Tokenization into words
  tokens = nltk.word_tokenize(text_lower)

  return [word for word in tokens if word.isalnum()]

In [9]:
# Apply the pre-processing to the documents
preprocessed_docs = [' '.join(preprocess(doc)) for doc in documents]
print(preprocessed_docs)

['sailing in croatia often includes visiting unesco world heritage sites', 'the makarska riviera is known for its stunning coastline and sailing opportunities', 'sailing in croatia offers stunning views of the adriatic sea', 'the city of sibenik is home to the impressive james cathedral a unesco world heritage site', 'the island of brač is known for its beautiful beaches and great sailing conditions', 'sailing to the island of rab known for its medieval old town is a great experience', 'the pakleni islands near hvar are a popular spot for sailing yachts', 'the island of cres is one of the largest in croatia and a great destination for sailing', 'the island of cres is one of the largest in croatia and has a diverse wildlife', 'the city of zagreb is the capital of croatia and offers a mix of modern and historic attractions', 'sailors can experience the traditional dalmatian way of life in many coastal villages', 'the city of knin is known for its historic fortress and beautiful scenery',

# TF-IDF

In [10]:
# Creating an instance of the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

In [11]:
# Fit and transform the preprocessed docs
tfidf_matrix = vectorizer.fit_transform(preprocessed_docs)
print(f"The shape of the TF-IDF matrix is {tfidf_matrix.shape}")
print(f"The length of documents is {len(documents)}")

The shape of the TF-IDF matrix is (100, 360)
The length of documents is 100


In [12]:
# Query the index
query = "croatia sailing"
query_vec = vectorizer.transform([query])
cosine_similarity(tfidf_matrix, query_vec).flatten()

array([0.24200283, 0.13063328, 0.28789613, 0.        , 0.13091565,
       0.12744614, 0.11470109, 0.26443714, 0.1200554 , 0.10132357,
       0.        , 0.        , 0.        , 0.11117943, 0.        ,
       0.        , 0.11093168, 0.        , 0.30524835, 0.10791902,
       0.11541319, 0.        , 0.        , 0.12321974, 0.09688848,
       0.        , 0.        , 0.        , 0.        , 0.12483007,
       0.14262231, 0.10472003, 0.22863442, 0.        , 0.        ,
       0.        , 0.13059847, 0.        , 0.        , 0.1172783 ,
       0.        , 0.24809611, 0.09509067, 0.        , 0.11325454,
       0.12163093, 0.1050687 , 0.15612242, 0.20876142, 0.16323255,
       0.10501606, 0.25298311, 0.        , 0.11740617, 0.        ,
       0.        , 0.34633262, 0.22806771, 0.11765747, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.09094918, 0.11309818, 0.        , 0.12986737, 0.        ,
       0.11295693, 0.        , 0.10234293, 0.13411726, 0.     

In [13]:
# Sort the documents by similarity to the query
similarities = cosine_similarity(tfidf_matrix, query_vec).flatten()
sorted_similarities = list(enumerate(similarities))
sorted(sorted_similarities, key=lambda x: x[1], reverse=True)

[(56, np.float64(0.3463326177734656)),
 (18, np.float64(0.30524834901003683)),
 (86, np.float64(0.289477144161549)),
 (2, np.float64(0.2878961250809324)),
 (79, np.float64(0.2697663666536758)),
 (7, np.float64(0.26443714198277574)),
 (51, np.float64(0.2529831093224971)),
 (41, np.float64(0.24809611448223312)),
 (0, np.float64(0.24200282597100126)),
 (32, np.float64(0.22863442162734365)),
 (57, np.float64(0.22806770911560081)),
 (87, np.float64(0.22705354408637923)),
 (85, np.float64(0.22376084862387863)),
 (93, np.float64(0.21291946178423374)),
 (48, np.float64(0.20876142357900254)),
 (49, np.float64(0.1632325463490037)),
 (47, np.float64(0.1561224238877196)),
 (82, np.float64(0.15024216891927034)),
 (30, np.float64(0.14262231277766355)),
 (73, np.float64(0.1341172613847461)),
 (96, np.float64(0.13147866648060116)),
 (4, np.float64(0.13091564915907272)),
 (78, np.float64(0.13088904245703745)),
 (1, np.float64(0.13063327992289878)),
 (36, np.float64(0.13059846989505045)),
 (68, np.float

In [14]:
# Build a function to search with TF-IDF
def search_tfidf(query, vectorizer, tfidf_matrix):
  # Vectorizing the query
  query_vec = vectorizer.transform([query])

  # Compute the Cosine similarity
  similarities = cosine_similarity(tfidf_matrix, query_vec).flatten()

  # Pair each document index with its similarity score
  sorted_similarities = list(enumerate(similarities))

  # Sort the documents by similarity score
  results = sorted(sorted_similarities, key=lambda x: x[1], reverse=True)

  return results

In [15]:
# Apply the function to the query
search_similarities = search_tfidf(query, vectorizer, tfidf_matrix)

# Print out the top 10 documents by similariry score
print(f"Top 10 documents by similarity score for query {query}:")
for doc_index, score in search_similarities[:10]:
  print(f"Document {doc_index + 1}: {documents[doc_index]}")

Top 10 documents by similarity score for query croatia sailing:
Document 57: Croatia has beautiful islands perfect for sailing.
Document 19: The Dalmatian Coast is a famous region for sailing in Croatia.
Document 87: The Kornati Islands National Park is a popular sailing destination in Croatia.
Document 3: Sailing in Croatia offers stunning views of the Adriatic Sea.
Document 80: The best sailing routes in Croatia include Dubrovnik and Split.
Document 8: The island of Cres is one of the largest in Croatia and a great destination for sailing.
Document 52: Sailing in Croatia is best enjoyed during the summer months.
Document 42: The city of Rovinj is a charming starting point for a sailing trip in Croatia.
Document 1: Sailing in Croatia often includes visiting UNESCO World Heritage sites.
Document 33: Sailing in Croatia often involves stopping at picturesque fishing villages.


# Boolean

In [16]:
!pip install whoosh -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/468.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/468.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
# Import libraries
import os
import shutil
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser

In [18]:
# Preprocessing Text part 2
def preprocess2(text):
  # Lowering the capitalization
  text_lower = text.lower()

  # Tokenization into words
  tokens = nltk.word_tokenize(text_lower)

  # List the tokens per document
  tokens = [word for word in tokens if word.isalnum()]

  # Defining the English Stopwords
  stopwords = set(nltk.corpus.stopwords.words('english')) - {"and", "or", "not"}

  # Removing the stopwords
  tokens = [word for word in tokens if word not in stopwords]

  return tokens

# Apply the function and test it
print(text)
preprocess2(text)

Sailing in Croatia between Split and Zadar is fantastic?


['sailing', 'croatia', 'split', 'and', 'zadar', 'fantastic']

In [19]:
# Create a new folder, remove the old if available
if os.path.exists("index_dir"):
  shutil.rmtree("index_dir")
os.mkdir("index_dir")

In [20]:
# Define a Schema for the directory
schema = Schema(title = ID(stored = True, unique = True),
                content = TEXT(stored = True))

In [21]:
# Create the index in the folder
index = create_in("index_dir", schema)

In [22]:
# Open a writer to add documents to the index
writer = index.writer()
for i, doc in enumerate(documents):
  writer.add_document(title = str(i),
                      content = doc)
writer.commit()

In [23]:
# Query using the boolean
query = "croatia NOT sailing"

In [24]:
# Boolean search function
def boolean_search(query, index):
  # Create a QueryParser that targets the content field
  parser = QueryParser("content", schema = index.schema)

  # Parse the user's query
  parsed_query = parser.parse(query)

  # Open the directory and perform the query
  with index.searcher() as searcher:
    results = searcher.search(parsed_query)
    return [(hit["title"], hit['content']) for hit in results]

In [25]:
# Apply the funtion
boolean_search(query, index)

[('47', 'Sailors in Croatia can explore over 1,200 islands.'),
 ('82', 'Croatia is known for its delicious seafood cuisine.'),
 ('23', 'Sailors in Croatia can visit ancient Roman ruins in Split.'),
 ('45', "Croatia's Adriatic coast is dotted with charming fishing villages."),
 ('58',
  'The city of Rijeka is an important cultural and economic center in Croatia.'),
 ('8',
  'The island of Cres is one of the largest in Croatia and has a diverse wildlife.'),
 ('13', 'Croatia has a rich history dating back to the Roman Empire.'),
 ('16',
  'Sailors can enjoy fresh seafood at many coastal restaurants in Croatia.'),
 ('39',
  "The island of Lastovo is one of Croatia's most remote and tranquil destinations."),
 ('49',
  'Croatia is famous for its beautiful coastline and crystal-clear waters.')]

# Probabilistic Retrieval Model

In [26]:
!pip install rank_bm25 -q

In [27]:
# Import the BM25 class
from rank_bm25 import BM25Okapi

In [28]:
# Tokenize the documents
tokenized_docs = [preprocess(doc) for doc in documents]
tokenized_docs

[['sailing',
  'in',
  'croatia',
  'often',
  'includes',
  'visiting',
  'unesco',
  'world',
  'heritage',
  'sites'],
 ['the',
  'makarska',
  'riviera',
  'is',
  'known',
  'for',
  'its',
  'stunning',
  'coastline',
  'and',
  'sailing',
  'opportunities'],
 ['sailing',
  'in',
  'croatia',
  'offers',
  'stunning',
  'views',
  'of',
  'the',
  'adriatic',
  'sea'],
 ['the',
  'city',
  'of',
  'sibenik',
  'is',
  'home',
  'to',
  'the',
  'impressive',
  'james',
  'cathedral',
  'a',
  'unesco',
  'world',
  'heritage',
  'site'],
 ['the',
  'island',
  'of',
  'brač',
  'is',
  'known',
  'for',
  'its',
  'beautiful',
  'beaches',
  'and',
  'great',
  'sailing',
  'conditions'],
 ['sailing',
  'to',
  'the',
  'island',
  'of',
  'rab',
  'known',
  'for',
  'its',
  'medieval',
  'old',
  'town',
  'is',
  'a',
  'great',
  'experience'],
 ['the',
  'pakleni',
  'islands',
  'near',
  'hvar',
  'are',
  'a',
  'popular',
  'spot',
  'for',
  'sailing',
  'yachts'],
 ['

In [29]:
# Initialize the BM25 Model
bm25 = BM25Okapi(tokenized_docs)

In [30]:
# Start the Probabilist search
query = "croatia sailing"

# Build a function
def search_bm25(query, bm25):
  tokenized_query = preprocess(query)
  results = bm25.get_scores(tokenized_query)
  return results

In [31]:
# Perform the BM25 search
results = search_bm25(query, bm25)

# Sort the documents by relevance to the query
np.argsort(results)[::-1]

# Print the documents
for i in np.argsort(results)[::-1]:
  print(f"Document {i + 1}: {documents[i]}")

Document 57: Croatia has beautiful islands perfect for sailing.
Document 80: The best sailing routes in Croatia include Dubrovnik and Split.
Document 52: Sailing in Croatia is best enjoyed during the summer months.
Document 33: Sailing in Croatia often involves stopping at picturesque fishing villages.
Document 1: Sailing in Croatia often includes visiting UNESCO World Heritage sites.
Document 3: Sailing in Croatia offers stunning views of the Adriatic Sea.
Document 19: The Dalmatian Coast is a famous region for sailing in Croatia.
Document 58: Sailing in Croatia allows you to explore hidden coves and bays.
Document 87: The Kornati Islands National Park is a popular sailing destination in Croatia.
Document 49: Sailing in Croatia provides opportunities to visit ancient fortresses and castles along the coast.
Document 88: Sailing around the Brijuni Islands offers a glimpse of Croatia's natural beauty and wildlife.
Document 86: The waters around Croatia are known for being calm and clear,