# Set up our environment

In [None]:
%%capture

!pip install cassandra-driver
!pip install ipywidgets
!pip install ipyplot

!pip install cohere
!pip install tiktoken

## used for summarisation step so no need to run now as all modules have been summarised and stored
#!pip install transformers

##for the webpage
!pip install anvil-uplink

import anvil.server


import ipyplot
import os



## For connecting to the database
import pandas as pd #just using this for ipy output formating

!pip install cassandra-driver
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra import ConsistencyLevel
from cassandra.query import dict_factory
import ipywidgets as widgets
import requests
import IPython

## For mounting google drive
from google.colab import drive
import chardet

## For similarity search functions
from scipy.spatial import distance
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

## For training the model
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

## For attempting retries
import time

## For inserting documents with queries
from cassandra.query import BoundStatement
from cassandra.query import BatchStatement

## For evaluation metrics
from sklearn.metrics import accuracy_score, precision_score


## For barts version of summarisation step (don't need to run now as all modules have been summarised and stored)
#import re
#from transformers import BartForConditionalGeneration, BartTokenizer

## For evaluating the hyperparameter grid to find the best possible setup for the model
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import islice
import IPython.display as display
import shutil
import pickle


## Setting the random seed for the runtime to ensure reproducibility of results
import random
import tensorflow as tf

## Set the random seed for Python's random module
random.seed(42)

## Set the random seed for NumPy
np.random.seed(42)

## Set the random seed for TensorFlow
tf.random.set_seed(42)

# Setup AstraDB Connection

In [None]:
#@title Setup our AstraDB Credentials

# Astra DB credentials, entered through the Colab form fields below.
# ASTRA_CLIENT_ID / ASTRA_CLIENT_SECRET authenticate the Cassandra driver session;
# ASTRACS_TOKEN is sent as the bearer token when listing the account's databases.
# Keep these blank in any saved or shared copy of the notebook.
ASTRA_CLIENT_ID = ''#@param {type:"string"}
ASTRA_CLIENT_SECRET = '' #@param {type:"string"}
ASTRACS_TOKEN = '' #@param {type:"string"}




def list_databases(token, timeout=30):
    """Fetch the databases visible to an Astra token.

    Args:
        token: Astra token, sent as ``Authorization: Bearer <token>``.
        timeout: Seconds to wait for the HTTP request before ``requests`` raises
            a timeout error. Defaults to 30.

    Returns:
        The decoded JSON body when the endpoint answers with HTTP 200,
        otherwise ``None``.
    """
    url = "https://api.astra.datastax.com/v2/databases"
    headers = {
        "Authorization": "Bearer " + token,
        "Content-Type": "application/json"
    }
    # Without an explicit timeout a stalled connection would block the cell forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.json()

def get_database_names(databases_details):
    """Return the ``info -> name`` value of every database entry, in input order."""
    names = []
    for entry in databases_details:
        names.append(entry['info']['name'])
    return names


databases_details = list_databases(ASTRACS_TOKEN)

# list_databases returns None when the API call does not succeed (for example with a
# blank or invalid token). Stop here with a clear message instead of failing with a
# TypeError in the comprehension below.
if databases_details is None:
    raise RuntimeError("Could not list Astra databases - check ASTRACS_TOKEN and re-run this cell.")

# Create a mapping of database names to their secure bundle URLs
# (the URL of the first datacenter listed for each database is used)
db_map = {db['info']['name']: db['info']['datacenters'][0]['secureBundleUrl'] for db in databases_details}

# Create a dropdown menu with the database names
dropdown = widgets.Dropdown(
    options=list(db_map),
    description='Database:',
)

# Create a button that will trigger the download when clicked
button = widgets.Button(description="Download Secure Bundle")

# Define what the button will do when clicked
def on_button_clicked(b):
    """Download the secure connect bundle for the database selected in the dropdown.

    Reads the selection from the module-level ``dropdown``, looks its URL up in
    ``db_map``, writes the response body to ``<db_name>_secure_bundle.zip`` in the
    current working directory and stores that path in the global ``scb_path``.

    Args:
        b: The clicked button widget (unused).

    Raises:
        Whatever ``response.raise_for_status()`` raises when the download returns
        an HTTP error status; nothing is written to disk in that case.
    """
    global scb_path

    db_name = dropdown.value
    secure_bundle_url = db_map[db_name]

    # Download the file
    response = requests.get(secure_bundle_url, timeout=60)
    # Fail loudly on an HTTP error rather than saving the error body as a .zip,
    # which would only surface later as a confusing connection failure.
    response.raise_for_status()

    file_name = f'{db_name}_secure_bundle.zip'
    file_path = os.path.join(os.getcwd(), file_name)  # This will get the full path
    with open(file_path, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded secure bundle for {db_name} at {file_path}")

    # Save the file path for the connection cell
    scb_path = file_path

# Register the download handler on the button, then render the dropdown and the button.
# The handler only runs on a click, so `scb_path` does not exist until the button is pressed.
button.on_click(on_button_clicked)
IPython.display.display(dropdown, button)


Dropdown(description='Database:', options=('Semantic_Search_Using_Doc2Vec',), value='Semantic_Search_Using_Doc…

Button(description='Download Secure Bundle', style=ButtonStyle())

Downloaded secure bundle for Semantic_Search_Using_Doc2Vec at /content/Semantic_Search_Using_Doc2Vec_secure_bundle.zip


In [None]:
#@title Connect to our AstraDB VectorDB
# `scb_path` is only assigned inside the "Download Secure Bundle" click handler, so the
# button in the previous cell must be clicked before this cell is run; otherwise this
# line raises NameError.
SECURE_CONNECT_BUNDLE_PATH = scb_path

# Point the driver at the downloaded secure connect bundle
cloud_config = {
   'secure_connect_bundle': SECURE_CONNECT_BUNDLE_PATH
}
# Authenticate with the client id / secret entered in the credentials form
auth_provider = PlainTextAuthProvider(ASTRA_CLIENT_ID, ASTRA_CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider, protocol_version=4)
# `session` is the module-level handle used by every database cell below
session = cluster.connect()

#**Mounting the Google Drive and defining the function for preprocessing the documents**

In [None]:
# Mount the user's Google Drive into the Colab filesystem (prompts for authorisation
# unless it is already mounted)
drive.mount('/content/drive')

# Drive folder holding the module documents; every entry in it is read by read_and_tokenize
data_folder = '/content/drive/My Drive/Uni_Modules_Documents'


def read_and_tokenize(data_folder):
    """Read every file in ``data_folder`` into a tokenised ``TaggedDocument``.

    Each file is read once as bytes, its encoding is guessed with ``chardet`` and
    the bytes are decoded with that guess (UTF-8 when chardet gives no answer).
    The text is lower-cased and split on whitespace. Sub-directories are skipped.
    Files that cannot be decoded are reported on stdout and skipped.

    Args:
        data_folder: Path of the folder containing the documents.

    Returns:
        A list of ``TaggedDocument(words=tokens, tags=[filename])``, in the order
        ``os.listdir`` yields the files.
    """
    documents = []
    for filename in os.listdir(data_folder):
        file_path = os.path.join(data_folder, filename)
        # Only regular files can be read; a nested folder would otherwise abort the loop.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'rb') as file:
            raw_data = file.read()

        # chardet reports None when it cannot decide (e.g. for an empty file)
        encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

        try:
            # Decode the bytes already in memory instead of re-opening the file
            text = raw_data.decode(encoding)
        except UnicodeDecodeError as e:
            print(f"UnicodeDecodeError in file {file_path}: {e}")
            continue

        tokens = text.lower().split()
        documents.append(TaggedDocument(words=tokens, tags=[filename]))

    return documents

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#**Defining the functions for the semantic search**

In [None]:
# Keyspace that holds every table used by this notebook
KEYSPACE_NAME = 'tables_home'

# Table names: document vectors, full document text, and document summaries
# (each table is keyed by filename)
TABLE_NAME = 'Modules'
NORMAL_TABLE_NAME = 'ModulesNonVectorized'
TABLE_NAME_SUMMARISED = 'SummarisedDocuments'

# Function to get the summarised versions of the modules from Astra DB
def get_document_text_summarised(session, filename):
    """Return the stored summary for ``filename``, or ``None`` when no row exists.

    Args:
        session: Connected Cassandra driver session.
        filename: Primary-key value of the document to look up.

    Returns:
        The ``summary`` column of the matching row, or ``None`` if there is none.
    """
    # The keyspace and table come from notebook constants; the filename is passed as a
    # bound parameter so quotes in a file name cannot break or alter the statement.
    query = f"SELECT summary FROM {KEYSPACE_NAME}.{TABLE_NAME_SUMMARISED} WHERE filename = %s"
    row = session.execute(query, (filename,)).one()
    return row.summary if row is not None else None


# Function to get document text from the non-vectorised table
def get_document_text_non_vectorized(session, filename):
    """Return the stored document text for ``filename``, or ``None`` when no row exists.

    Args:
        session: Connected Cassandra driver session.
        filename: Primary-key value of the document to look up.

    Returns:
        The ``text`` column of the matching row, or ``None`` if there is none.
    """
    # The keyspace and table come from notebook constants; the filename is passed as a
    # bound parameter so quotes in a file name cannot break or alter the statement.
    query = f"SELECT text FROM {KEYSPACE_NAME}.{NORMAL_TABLE_NAME} WHERE filename = %s"
    row = session.execute(query, (filename,)).one()
    return row.text if row is not None else None

# Function to use the Doc2Vec model to vectorise the user's query
def vectorize_user_query(model, query_text):
    """Infer a vector for ``query_text`` with the given Doc2Vec model.

    The query is lower-cased and split on whitespace, mirroring how the documents
    are tokenised, and the tokens are handed to ``model.infer_vector``.

    Args:
        model: Object exposing ``infer_vector(tokens)``.
        query_text: Raw query string typed by the user.

    Returns:
        Whatever ``model.infer_vector`` returns for the query tokens.
    """
    # Reset both global RNGs right before inference; this is intended to make
    # repeated queries give repeatable vectors.
    for reseed in (random.seed, np.random.seed):
        reseed(42)

    return model.infer_vector(query_text.lower().split())


# Function to perform semantic search and display results
def semantic_search_and_display(session, query_vector, threshold, top_results):
    """Rank stored documents against a query vector and print the best matches.

    Every row of the vector table is fetched and compared with the query using
    cosine similarity on L2-normalised vectors. Rows scoring at least ``threshold``
    are sorted by descending similarity, and the first ``top_results`` of them are
    printed together with their full text and summary.

    Args:
        session: Connected Cassandra driver session.
        query_vector: Vector inferred for the user's query.
        threshold: Minimum cosine similarity a document needs to be considered.
        top_results: Maximum number of ranked documents to look up and display.

    Returns:
        The filenames that were displayed, best match first. Documents whose
        stored text is missing or empty are not displayed or returned.
    """
    rows = session.execute(f"SELECT filename, doc_vector FROM {KEYSPACE_NAME}.{TABLE_NAME}")

    # Unit-length query vector, computed once for all comparisons
    unit_query = normalize([query_vector], norm='l2')[0]

    # (filename, similarity) pairs for every document that clears the threshold
    matches = []
    for row in rows:
        unit_doc = normalize([row.doc_vector], norm='l2')[0]
        score = cosine_similarity([unit_query], [unit_doc])[0][0]
        if score >= threshold:
            matches.append((row.filename, score))

    # Best match first
    matches.sort(key=lambda match: match[1], reverse=True)

    top_filenames = []
    for filename, similarity in matches[:top_results]:
        document_text = get_document_text_non_vectorized(session, filename)
        summarised_text = get_document_text_summarised(session, filename)

        # Nothing to show for a document without stored text
        if document_text is None or len(document_text) == 0:
            continue

        print(f"Filename: {filename}, Cosine Similarity: {similarity}")
        print(f"Original Document Text:\n{document_text}")
        print(" ")
        print(f"Summarised Document Text:\n{summarised_text}")
        print(" ")
        print('='*50)

        top_filenames.append(filename)

    if not top_filenames:
        print("No matching documents found.")

    return top_filenames



#**Summarisation Step** (don't need to run now)

##Splitting up the document files into segments to process the summarise step (no need to run again since split will stay the same)

In [None]:
# The path to save the segments in Google Drive
drive_segments_path = '/content/drive/My Drive/document_segments.pkl'

# Function to save the segments into Google Drive
def save_segments_to_drive(segments, drive_path):
    """Pickle ``segments`` to ``drive_path``, replacing any existing file."""
    with open(drive_path, mode='wb') as out_file:
        pickle.dump(segments, out_file)

# Function to split the documents into segments
def split_documents(documents, num_segments=4):
    """Split ``documents`` into consecutive segments of near-equal size.

    The first ``len(documents) % num_segments`` segments receive one extra
    document, so segment sizes differ by at most one and the input order is kept.
    Progress is printed for every document placed.

    Args:
        documents: Sized iterable of documents to distribute.
        num_segments: Number of segments to produce. Defaults to 4.

    Returns:
        A list of ``num_segments`` lists; trailing segments are empty when there
        are fewer documents than segments.

    Raises:
        ValueError: If ``num_segments`` is smaller than 1.
    """
    if num_segments < 1:
        raise ValueError("num_segments must be at least 1")

    total_documents = len(documents)
    segment_size, remaining_documents = divmod(total_documents, num_segments)

    print("Total Documents:", total_documents)
    print("Segment Size:", segment_size)
    print("Remaining Documents:", remaining_documents)

    segments = [[] for _ in range(num_segments)]
    documents_remaining = total_documents
    doc_iter = iter(documents)

    for current_segment, segment in enumerate(segments):
        # The leftover documents are handed out one each to the leading segments
        target_size = segment_size + (1 if current_segment < remaining_documents else 0)
        for _ in range(target_size):
            segment.append(next(doc_iter))
            documents_remaining -= 1
            print("Segment:", current_segment, "Documents Remaining:", documents_remaining)

    return segments

# Read and tokenise documents
# (document order follows os.listdir inside read_and_tokenize, so a re-run may
# produce a different split than the one already saved)
documents = read_and_tokenize(data_folder)

# Split the documents into segments
document_segments = split_documents(documents)

# Save the segments to Google Drive (overwrites any previously saved split)
save_segments_to_drive(document_segments, drive_segments_path)

# Print segments to check if they have been split correctly
for i, segment in enumerate(document_segments):
    print(f"Segment {i+1} - Number of documents: {len(segment)}")


##Loading the split of documents back in from google drive

In [None]:
# Define the path to the saved segments file in Google Drive
drive_segments_path = '/content/drive/My Drive/document_segments.pkl'

# Function to load the segments
def load_segments_from_drive(drive_path):
    """Return the object unpickled from ``drive_path``."""
    # pickle can execute code while loading: only open files written by this notebook.
    with open(drive_path, mode='rb') as in_file:
        return pickle.load(in_file)

# Load the segments saved by the splitting step
document_segments = load_segments_from_drive(drive_segments_path)

# Print segments to check if they have been loaded correctly:
# the filename tag of every document, then the size of the segment
for i, segment in enumerate(document_segments):
    print(f"Segment {i+1}:")
    for doc in segment:
        print(doc.tags[0])
    print(f"Number of documents: {len(segment)}")

##Summarising a chosen segment

In [None]:
# Define the path to the saved segments and summaries files in Google Drive
drive_segments_path = '/content/drive/My Drive/document_segments.pkl'
drive_summaries_path = '/content/drive/My Drive/document_summaries.pkl'

# Load the pre-trained BART model and tokeniser
# NOTE: BartTokenizer and BartForConditionalGeneration come from `transformers`, whose
# install and import lines are commented out in the setup cell; re-enable them (and
# `import re`) before running this section.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Function to capitalise the first letter of each sentence
def capitalize_sentences(text):
    """Upper-case the first letter of each sentence in ``text``.

    Sentences are split at a single whitespace character that follows ``.`` or
    ``?``; the look-behinds skip splits after patterns such as ``e.g.`` or ``Mr.``.
    Only the first character of each sentence is changed, so acronyms and proper
    nouns later in the sentence keep their casing.

    Args:
        text: Text to process.

    Returns:
        The sentences re-joined with single spaces.
    """
    # Imported locally because the module-level `import re` in the setup cell is commented out.
    import re

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    # str.capitalize() would also lower-case the rest of the sentence; only touch the first character.
    return ' '.join(sentence[:1].upper() + sentence[1:] for sentence in sentences)

# Function to summarise text using BART
def summarize_text(document_text):
    """Summarise ``document_text`` with the module-level BART model and tokeniser.

    The input is tokenised with truncation at 1024 tokens, a summary is generated
    with 4-beam search (length bounds 100 to 1000), decoded without special
    tokens, and passed through ``capitalize_sentences``.

    Args:
        document_text: Text of one document.

    Returns:
        The capitalised summary string.
    """
    encoded = tokenizer([document_text], max_length=1024, return_tensors='pt', truncation=True)
    generated = model.generate(
        encoded['input_ids'],
        num_beams=4,
        min_length=100,
        max_length=1000,
        early_stopping=True,
    )
    # Only one document was encoded, so the first generated sequence is the summary
    decoded = tokenizer.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return capitalize_sentences(decoded)

# Function to load the segments
def load_segments_from_drive(drive_path):
    """Unpickle and return the segments stored at ``drive_path``."""
    # Unpickling runs arbitrary code from the file, so only load pickles this notebook produced.
    with open(drive_path, mode='rb') as source:
        loaded = pickle.load(source)
    return loaded

# Function to save the segments
def save_segments_to_drive(segments, drive_path):
    """Write ``segments`` to ``drive_path`` as a pickle, overwriting the file if present."""
    with open(drive_path, mode='wb') as sink:
        pickle.dump(segments, sink)

# Function to load the summaries
def load_summaries_from_drive(drive_path):
    """Return the summaries pickled at ``drive_path``, or ``[]`` if the file does not exist."""
    try:
        # Unpickling runs arbitrary code from the file, so only load pickles this notebook produced.
        with open(drive_path, mode='rb') as source:
            return pickle.load(source)
    except FileNotFoundError:
        # Nothing has been summarised yet
        return []

# Function to save the summaries
def save_summaries_to_drive(summaries, drive_path):
    """Pickle ``summaries`` to ``drive_path``, replacing any existing file."""
    with open(drive_path, mode='wb') as sink:
        pickle.dump(summaries, sink)

# Load the segments
document_segments = load_segments_from_drive(drive_segments_path)

# Function to summarise a segment
def summarize_segment(segment_number):
    """Summarise every document of one segment and append the results to the saved summaries.

    Reads the segment from the module-level ``document_segments``, loads the
    summaries already stored at ``drive_summaries_path``, summarises each document
    with ``summarize_text`` and writes the old and new summaries back as one list.
    Nothing is saved until the whole segment has been processed.

    Args:
        segment_number: Zero-based index into ``document_segments``.
    """
    segment_documents = document_segments[segment_number]

    # Summaries produced by earlier runs (empty list when none exist yet)
    previous_summaries = load_summaries_from_drive(drive_summaries_path)

    # (filename, summary) pairs for this segment, in document order
    segment_summaries = []
    for num_file, doc in enumerate(segment_documents):
        summary = summarize_text(' '.join(doc.words))
        segment_summaries.append((doc.tags[0], summary))
        print("Appending file", num_file)

    # Persist the earlier summaries followed by the new ones
    save_summaries_to_drive(previous_summaries + segment_summaries, drive_summaries_path)

# Summarise the segment at index 3 (segments are 0-indexed); change the index to process another segment
summarize_segment(3)

##Loading summaries back in to see how many files have been processed

In [None]:
# Define the path to the saved summaries file in Google Drive
drive_summaries_path = '/content/drive/My Drive/document_summaries.pkl'

# Function to load the summaries from the pickle file
def load_summaries_from_drive(drive_path):
    """Load the pickled summaries from ``drive_path``; a missing file yields an empty list."""
    try:
        # pickle can execute code while loading: only open files written by this notebook.
        with open(drive_path, mode='rb') as in_file:
            return pickle.load(in_file)
    except FileNotFoundError:
        return []

# Load the summaries from the pickle file (an empty list if nothing has been saved yet)
summarised_documents = load_summaries_from_drive(drive_summaries_path)
print ("Length of summarised files so far:", len(summarised_documents))

# Print the summarised documents; each entry is the (filename, summary) pair stored by summarize_segment
for i, summary in enumerate(summarised_documents):
    print(f"Document {i+1} summary:")
    print(summary)
    print()

###Create the table for the summarised documents

In [None]:
KEYSPACE_NAME = 'tables_home'
TABLE_NAME_SUMMARISED = 'SummarisedDocuments'

# Function for executing a query with retry attempts, so that an operation timeout error does not stop the code
def execute_query_with_retry(query, session, max_retries=10, retry_interval=5):
    """Execute ``query`` on ``session``, retrying when an attempt raises.

    Args:
        query: Statement to execute.
        session: Connected Cassandra driver session.
        max_retries: Maximum number of attempts.
        retry_interval: Seconds to sleep between attempts.

    Returns:
        ``True`` as soon as an attempt succeeds, ``False`` if every attempt failed.
        Errors are printed rather than raised, so callers that must not continue
        after a failure should check the return value.
    """
    for attempt in range(max_retries):
        try:
            session.execute(query)
        except Exception as e:
            print(f"Error: {e}")
            # No point sleeping after the final attempt
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_interval} seconds...")
                time.sleep(retry_interval)
        else:
            print(f"Successfully executed query: {query}")
            return True

    # Every attempt failed: say so explicitly instead of returning silently
    print(f"Giving up after {max_retries} failed attempts: {query}")
    return False

# Drop the 'SummarisedDocuments' table if it exists
# WARNING: destructive - every stored summary is removed and must be re-inserted afterwards
drop_table_query = f"DROP TABLE IF EXISTS {KEYSPACE_NAME}.{TABLE_NAME_SUMMARISED}"
execute_query_with_retry(drop_table_query, session)
# NOTE: this message is printed even when every retry failed, because
# execute_query_with_retry prints errors instead of raising them
print("Dropped existing table if it exists.")

# Create the 'SummarisedDocuments' table with retry attempts
# (one row per document: filename is the primary key, summary holds the summary text)
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {KEYSPACE_NAME}.{TABLE_NAME_SUMMARISED} (
    filename TEXT PRIMARY KEY,
    summary TEXT
)
"""
execute_query_with_retry(create_table_query, session)
# NOTE: printed regardless of whether the statement above succeeded
print("Created new 'SummarisedDocuments' table.")

###Defining the function for inserting the summarised documents into Astra DB

In [None]:
# Function to insert summarised documents into the 'SummarisedDocuments' table using batch statements
def insert_summarised_docs(session, summarised_documents, batch_size):
    """Insert ``(filename, summary)`` pairs into the summaries table in batches.

    Args:
        session: Connected Cassandra driver session.
        summarised_documents: Iterable of ``(filename, summary)`` pairs.
        batch_size: Number of inserts collected before a batch is executed.
    """
    prepared_insert = session.prepare(
        f"INSERT INTO {KEYSPACE_NAME}.{TABLE_NAME_SUMMARISED} (filename, summary) VALUES (?, ?)"
    )

    pending = BatchStatement()
    for record in summarised_documents:
        filename, summary = record
        pending.add(prepared_insert.bind((filename, summary)))

        # Keep collecting until the batch is full
        if len(pending) < batch_size:
            continue

        session.execute(pending)
        pending = BatchStatement()

    # Flush the final, partially filled batch
    if len(pending) > 0:
        session.execute(pending)

    print(f"Inserted summarised documents into the '{TABLE_NAME_SUMMARISED}' table.")

###Calling the function to insert the summarised documents

In [None]:
insert_summarised_docs(session, summarised_documents, batch_size = 15)

Inserted summarised documents into the 'SummarisedDocuments' table.


#**The Data Model**

##Defining the function for training a model (don't need to run now)

In [None]:
def train_doc2vec_model(documents, vector_size, window, min_count, epochs):
    """Train a Doc2Vec model on an 80% split of ``documents``.

    The vocabulary is built from all documents, while training runs only on the
    training split produced by ``train_test_split`` (``test_size=0.2``,
    ``random_state=42``). The held-out split is not used by this function.

    Args:
        documents: Tagged documents to learn from.
        vector_size: Dimensionality of the document vectors.
        window: Context window size.
        min_count: Minimum word frequency kept in the vocabulary.
        epochs: Number of training passes.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, workers=4)

    train_docs, _held_out_docs = train_test_split(documents, test_size=0.2, random_state=42)

    model.build_vocab(documents)
    # total_examples has to describe the corpus actually handed to train(); the model's
    # corpus_count is set from build_vocab(documents) and so includes the held-out split.
    model.train(train_docs, total_examples=len(train_docs), epochs=epochs)
    return model

## Load in a previous model with best parameters

In [None]:
# Re-read the module documents (also used later when storing the non-vectorised text)
data_folder = '/content/drive/My Drive/Uni_Modules_Documents'
documents = read_and_tokenize(data_folder)
# Load the previously trained model saved under this Drive path instead of retraining
final_model_path_segment2_test2 = '/content/drive/My Drive/module_info_doc2vec_model_segment2_test2'
loaded_model = Doc2Vec.load(final_model_path_segment2_test2)

# **Vectorising the data**

##Vectorising the documents and displaying them as file name and vector

In [None]:
# Vectorise existing documents in the specified folder
existing_data_folder = '/content/drive/My Drive/Uni_Modules_Documents'

def vectorize_existing_documents(model, folder_path):
    """Infer a vector for every file in ``folder_path``.

    Each file is read as UTF-8 (undecodable bytes are ignored), lower-cased and
    split on whitespace before being passed to ``model.infer_vector``.

    Args:
        model: Object exposing ``infer_vector(tokens)``.
        folder_path: Folder whose files are vectorised.

    Returns:
        A list of ``{'filename': ..., 'vector': ...}`` dicts in ``os.listdir`` order.
    """
    def _vectorize(filename):
        # Read, tokenise and infer the vector for a single file
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as handle:
            contents = handle.read()
        return {'filename': filename, 'vector': model.infer_vector(contents.lower().split())}

    return [_vectorize(filename) for filename in os.listdir(folder_path)]

# Vectorise existing documents
vectorized_docs = vectorize_existing_documents(loaded_model, existing_data_folder)

# Show a short preview only: printing every full vector floods the cell output and
# trips the notebook's IOPub data rate limit.
PREVIEW_DOCS = 5
PREVIEW_DIMS = 5
print(f"Vectorised {len(vectorized_docs)} documents.")
for doc in vectorized_docs[:PREVIEW_DOCS]:
    print(f"Filename: {doc['filename']}, Vector (first {PREVIEW_DIMS} values): {doc['vector'][:PREVIEW_DIMS]}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  4.99680750e-02 -3.52529883e-02  6.30023852e-02 -1.89701648e-04
  5.78982607e-02 -2.05712654e-02  2.75467075e-02  3.14038657e-02
 -2.67612375e-02 -3.30669209e-02 -1.23735676e-02 -2.67563444e-02
 -8.78095347e-03  4.86089848e-02  1.19996211e-02  2.52462067e-02
 -8.03793222e-03  3.76689248e-02 -2.34483695e-03 -9.19512361e-02
  1.79548971e-02 -2.01778393e-02  1.81467161e-02  1.79632641e-02
  2.81766467e-02  1.07558826e-02 -2.57923407e-03 -7.92980120e-02
  4.90173092e-03  5.54108731e-02  1.48030417e-02  1.07541129e-01
 -4.66028834e-03  3.54586244e-02  1.81862991e-02  4.01760451e-02
  3.07178497e-02 -2.41024680e-02 -4.29183505e-02 -5.56420051e-02
  1.99047551e-02  5.55295264e-03  2.25526039e-02  1.49425417e-02
  4.03589383e-02 -2.06501074e-02 -6.32433519e-02 -1.90774985e-02
  3.33316065e-02 -4.15087715e-02 -1.27603151e-02 -1.08757600e-01
 -4.36815433e-02 -4.29965034e-02 -1.70285180e-02  2.57915370e-02
 -3.65257338e-02 -3.13982

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -4.44874428e-02 -7.70544335e-02  1.80727586e-01  3.95646803e-02
  8.62855390e-02 -4.98604402e-02 -2.67687370e-03 -1.14240414e-02
 -1.09505929e-01  7.70824626e-02 -6.16659150e-02  1.80049147e-02
  1.48701891e-02  4.21346724e-03  1.43827023e-02 -2.08665114e-02
 -5.11911586e-02  6.01701438e-02  1.22601248e-01 -1.57760262e-01
  6.65420061e-03  3.32505926e-02  8.63781944e-02  6.91050738e-02
  5.26679792e-02  5.74931949e-02 -1.86334979e-02 -4.00398709e-02
  3.23420614e-02  1.34924337e-01  6.35918304e-02  2.20282480e-01
  4.74769063e-02 -1.63763519e-02  5.46034276e-02  5.44787236e-02
 -1.49558336e-01 -1.30128905e-01 -4.22246940e-02 -1.15944214e-01
  6.87377527e-02 -6.62371889e-02 -2.37175543e-03 -1.30051933e-02
  9.66821909e-02 -7.52312550e-03 -1.94259524e-01 -6.88145906e-02
  9.34068412e-02 -1.11568697e-01  6.10592067e-02 -1.46704614e-01
 -1.17571518e-01 -1.33219302e-01  1.38503030e-01  3.29632536e-02
 -1.35909721e-01  5.90133

# **Storing the Vectorised documents into Astra DB**



###Create the table for Vectorised documents

In [None]:
KEYSPACE_NAME = 'tables_home'
TABLE_NAME = 'Modules'

# Function for executing a query with retry attempts, so that an operation timeout error does not stop the code
def execute_query_with_retry_vec(query, max_retries=10, retry_interval=5):
    """Execute ``query`` on the module-level ``session``, retrying when an attempt raises.

    Args:
        query: Statement to execute.
        max_retries: Maximum number of attempts.
        retry_interval: Seconds to sleep between attempts.

    Returns:
        ``True`` as soon as an attempt succeeds, ``False`` if every attempt failed.
        Errors are printed rather than raised.
    """
    for attempt in range(max_retries):
        try:
            session.execute(query)
        except Exception as e:
            print(f"Error: {e}")
            # No point sleeping after the final attempt
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_interval} seconds...")
                time.sleep(retry_interval)
        else:
            print("Successfully executed query")
            return True

    # Every attempt failed: say so explicitly instead of returning silently
    print(f"Giving up after {max_retries} failed attempts")
    return False

# Drop the table if it exists with retry attempts
# WARNING: destructive - all stored document vectors are removed and must be re-inserted
drop_table_query = f"DROP TABLE IF EXISTS {KEYSPACE_NAME}.{TABLE_NAME}"
execute_query_with_retry_vec(drop_table_query)

# Create the 'Modules' table with filename as the primary key and doc_vector column with retry attempts
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {KEYSPACE_NAME}.{TABLE_NAME} (
    filename TEXT PRIMARY KEY,
    doc_vector LIST<FLOAT>
)
"""
execute_query_with_retry_vec(create_table_query)

# Create an index on the 'doc_vector' column for semantic search with retry attempts
# NOTE: the search function in this notebook selects every row and ranks the vectors in
# Python, so its query does not reference this index.
index_creation_query = f"""
CREATE CUSTOM INDEX IF NOT EXISTS semantic_search_index ON {KEYSPACE_NAME}.{TABLE_NAME} (doc_vector) USING 'StorageAttachedIndex'
"""
execute_query_with_retry_vec(index_creation_query)

Successfully executed query
Successfully executed query
Successfully executed query


###Defining the function for inserting the vectorised documents into Astra DB

In [None]:
def insert_vectorized_docs(session, vectorized_docs, batch_size):
    """Insert document vectors into the 'Modules' table in batches.

    Args:
        session: Connected Cassandra driver session.
        vectorized_docs: Iterable of dicts with ``'filename'`` and ``'vector'`` keys.
        batch_size: Number of inserts collected before a batch is executed.
    """
    keyspace = 'tables_home'
    table = 'Modules'

    prepared_insert = session.prepare(
        f"INSERT INTO {keyspace}.{table} (filename, doc_vector) VALUES (?, ?)"
    )

    pending = BatchStatement()
    for doc in vectorized_docs:
        pending.add(prepared_insert.bind((doc['filename'], doc['vector'])))

        # Send the batch as soon as it is full and start a fresh one
        if len(pending) >= batch_size:
            session.execute(pending)
            pending = BatchStatement()

    # Flush the final, partially filled batch
    if len(pending) > 0:
        session.execute(pending)

    print("Inserted documents into the 'Modules' table.")

###Calling the function to insert the vectorised documents

In [None]:
insert_vectorized_docs(session, vectorized_docs, batch_size=15)



Inserted documents into the 'Modules' table.


#**Storing the Non Vectorised documents into Astra DB** (don't need to run again unless more modules are added or existing modules need to be changed)

###Create the table for non-vectorised documents

In [None]:
KEYSPACE_NAME = 'tables_home'
TABLE_NAME_NON_VECTORIZED = 'ModulesNonVectorized'

# Function for executing a query with retry attempts, so that an operation timeout error does not stop the code
def execute_query_with_retry_non_vec(query, max_retries=10, retry_interval=5):
    """Execute ``query`` on the module-level ``session``, retrying when an attempt raises.

    Args:
        query: Statement to execute.
        max_retries: Maximum number of attempts.
        retry_interval: Seconds to sleep between attempts.

    Returns:
        ``True`` as soon as an attempt succeeds, ``False`` if every attempt failed.
        Errors are printed rather than raised.
    """
    for attempt in range(max_retries):
        try:
            session.execute(query)
        except Exception as e:
            print(f"Error: {e}")
            # No point sleeping after the final attempt
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_interval} seconds...")
                time.sleep(retry_interval)
        else:
            print("Successfully executed query")
            return True

    # Every attempt failed: say so explicitly instead of returning silently
    print(f"Giving up after {max_retries} failed attempts")
    return False

# Drop the table if it exists with retry attempts
# WARNING: destructive - all stored document text is removed and must be re-inserted
drop_table_query = f"DROP TABLE IF EXISTS {KEYSPACE_NAME}.{TABLE_NAME_NON_VECTORIZED}"
execute_query_with_retry_non_vec(drop_table_query)

# Create the 'ModulesNonVectorized' table with filename as the primary key and text column with retry attempts
# (this is the table the search step reads the document text from)
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {KEYSPACE_NAME}.{TABLE_NAME_NON_VECTORIZED} (
    filename TEXT PRIMARY KEY,
    text TEXT
)
"""
execute_query_with_retry_non_vec(create_table_query)

Successfully executed query
Successfully executed query


###Defining the function for inserting the non-vectorised documents into Astra DB

In [None]:
def insert_non_vectorized_docs(session, documents):
    """Insert the text of each tagged document into the non-vectorised table.

    The stored text is the document's tokens re-joined with single spaces, and the
    row key is the document's first tag (the filename).

    Args:
        session: Connected Cassandra driver session.
        documents: Iterable of objects with ``tags`` and ``words`` attributes.
    """
    insert_query = f"INSERT INTO {KEYSPACE_NAME}.{TABLE_NAME_NON_VECTORIZED} (filename, text) VALUES (?, ?)"
    # Prepare once: the statement is identical for every document, so preparing it
    # inside the loop only adds a round trip per row.
    prepared_insert = session.prepare(insert_query)

    for doc in documents:
        filename = doc.tags[0]  # Assuming tags contain the filename
        text = ' '.join(doc.words)

        session.execute(prepared_insert.bind((filename, text)))

        print(f"Inserted document {filename} into the 'ModulesNonVectorized' table.")

###Calling the function to insert the non-vectorised documents

In [None]:
insert_non_vectorized_docs(session, documents)

# Finding the best hyperparameters (don't need to run now)

## Splitting up the combinations of hyperparameters into 6 segments to use them individually to prevent memory issues

In [None]:
# The path to save the segments in Google Drive
drive_segments_path = '/content/drive/My Drive/segments.pkl'

# Function to save the segments into google drive
def save_segments_to_drive(segments, drive_path):
    """Store ``segments`` as a pickle at ``drive_path``, overwriting any existing file."""
    with open(drive_path, mode='wb') as out_file:
        pickle.dump(segments, out_file)

# Function to split the permutations of hyperparameters into 6 segments
def split_hyperparams_grid(hyperparams_grid, num_segments=6):
    """Split every combination of a hyperparameter grid into consecutive segments.

    Combinations are taken in ``ParameterGrid`` order. When the total is not a
    multiple of ``num_segments``, the first ``total % num_segments`` segments
    receive one extra combination, so no combination is dropped.

    Args:
        hyperparams_grid: Mapping of parameter name to the list of values to try.
        num_segments: Number of segments to produce. Defaults to 6.

    Returns:
        A list of ``num_segments`` dicts, each mapping a 0-based position within
        the segment to one parameter combination.

    Raises:
        ValueError: If ``num_segments`` is smaller than 1.
    """
    if num_segments < 1:
        raise ValueError("num_segments must be at least 1")

    # Calculate the total number of combinations
    total_combinations = 1
    for param_list in hyperparams_grid.values():
        total_combinations *= len(param_list)

    base_size, extra = divmod(total_combinations, num_segments)

    combinations = iter(ParameterGrid(hyperparams_grid))
    segments = []
    for segment_index in range(num_segments):
        # Leading segments absorb the remainder, one combination each
        size = base_size + (1 if segment_index < extra else 0)
        segments.append(dict(enumerate(islice(combinations, size))))

    return segments

# Define the hyperparameters grid
# 9 vector sizes x 6 windows x 5 min counts x 6 epoch settings = 1620 combinations
hyperparams_grid = {
    'vector_size': [50, 100, 125, 150, 175, 200, 250, 275, 300],
    'window': [1, 2, 3, 4, 5, 6],
    'min_count': [1, 2, 3, 4, 5],
    'epochs': [3, 5, 8, 10, 15, 20]
}

# Get the segments (1620 combinations over 6 segments gives 270 per segment)
segments = split_hyperparams_grid(hyperparams_grid)

# Saves the segments to google drive
save_segments_to_drive(segments, drive_segments_path)

# Prints each segment with the permutations of hyperparameters
for i, segment in enumerate(segments):
    print(f"Segment {i+1} - Combinations: {len(segment)}")
    print(segment)

## Loads the segments split into colabs from google drive

In [None]:
# Define the path to the saved segments file in Google Drive
drive_segments_path = '/content/drive/My Drive/segments.pkl'

# Function to load the segments
def load_segments_from_drive(drive_path):
    """Return the hyperparameter segments unpickled from ``drive_path``."""
    # pickle can execute code while loading: only open files written by this notebook.
    with open(drive_path, mode='rb') as in_file:
        return pickle.load(in_file)

# Load the hyperparameter segments saved by the splitting step
segments = load_segments_from_drive(drive_segments_path)


# Print segments to check if it has loaded back in correctly
for i, segment in enumerate(segments):
    print(f"Segment {i+1}")
    print(segment)

## Test 1 for hyperparameters (only expected filename)

In [None]:
# Not all of these paths are used for the final model (they are kept so that models trained in earlier runs stay stored and are not lost)

###################
#TEST 1 RUNTHROUGHS
###################
# Google Drive save paths for each segment's model, runthrough 1 (65% threshold) of test 1.
# The trailing comment on each line records the best result and parameters found for that segment.
final_model_path_segment1 = '/content/drive/My Drive/module_info_doc2vec_model_segment1' ## Best of segment: 33% success, parameters {'epochs': 3, 'min_count': 2, 'vector_size': 200, 'window': 5}
final_model_path_segment2 = '/content/drive/My Drive/module_info_doc2vec_model_segment2' ## Best of segment: 53% success, parameters {'epochs': 5, 'min_count': 5, 'vector_size': 100, 'window': 6}
final_model_path_segment3 = '/content/drive/My Drive/module_info_doc2vec_model_segment3' ## Best of segment: 53% success, parameters {'epochs': 8, 'min_count': 4, 'vector_size': 200, 'window': 6}
final_model_path_segment4 = '/content/drive/My Drive/module_info_doc2vec_model_segment4' ## Best of segment: 53% success, parameters {'epochs': 10, 'min_count': 1, 'vector_size': 100, 'window': 5}
final_model_path_segment5 = '/content/drive/My Drive/module_info_doc2vec_model_segment5' ## Best of segment: 40% success, parameters {'epochs': 15, 'min_count': 2, 'vector_size': 150, 'window': 1}
final_model_path_segment6 = '/content/drive/My Drive/module_info_doc2vec_model_segment6' ## Best of segment: 40% success, parameters {'epochs': 20, 'min_count': 1, 'vector_size': 175, 'window': 3}

# Google Drive save paths for each segment's model, runthrough 2 (75% threshold) of test 1
final_model_path_segment1_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment1_run2' ## Best of segment: 40% success, parameters {'epochs': 3, 'min_count': 5, 'vector_size': 300, 'window': 5}
final_model_path_segment2_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment2_run2' ## Best of segment: 47% success, parameters {'epochs': 5, 'min_count': 4, 'vector_size': 150, 'window': 3}
final_model_path_segment3_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment3_run2' ## Best of segment: 47% success, parameters {'epochs': 8, 'min_count': 2, 'vector_size': 125, 'window': 2}
final_model_path_segment4_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment4_run2' ## Best of segment: 47% success, parameters {'epochs': 10, 'min_count': 3, 'vector_size': 125, 'window': 4}
final_model_path_segment5_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment5_run2' ## Best of segment: 40% success, parameters {'epochs': 15, 'min_count': 4, 'vector_size': 150, 'window': 3}
final_model_path_segment6_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment6_run2' ## Best of segment: 40% success, parameters {'epochs': 20, 'min_count': 1, 'vector_size': 300, 'window': 4}

# Google Drive save paths for each segment's model, runthrough 3 (85% threshold) of test 1
final_model_path_segment1_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment1_run3' ## Best of segment: 27% success, parameters {'epochs': 3, 'min_count': 1, 'vector_size': 150, 'window': 5}
final_model_path_segment2_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment2_run3' ## Best of segment: 47% success, parameters {'epochs': 5, 'min_count': 4, 'vector_size': 275, 'window': 3}
final_model_path_segment3_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment3_run3' ## Best of segment: 47% success, parameters {'epochs': 8, 'min_count': 1, 'vector_size': 150, 'window': 6}
final_model_path_segment4_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment4_run3' ## Best of segment: 47% success, parameters {'epochs': 10, 'min_count': 4, 'vector_size': 275, 'window': 5}
final_model_path_segment5_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment5_run3' ## Best of segment: 40% success, parameters {'epochs': 15, 'min_count': 5, 'vector_size': 275, 'window': 6}
final_model_path_segment6_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment6_run3' ## Best of segment: 33% success, parameters {'epochs': 20, 'min_count': 4, 'vector_size': 275, 'window': 5}




def run_segment(segment, threshold=0.85, model_path=None):
    """Grid-search one segment of hyperparameters with the Test 1 metric.

    For every parameter set in ``segment`` a Doc2Vec model is trained, the
    ``tables_home.Modules`` table is dropped, recreated and refilled with the
    document vectors from that model, and each benchmark query is run through
    ``semantic_search_and_display``. The first two characters of every returned
    filename form the predicted label set for the query, which is compared with
    the single expected prefix using ``accuracy_score`` on binarised labels.

    Args:
        segment: dict whose values are hyperparameter dicts with the keys
            ``vector_size``, ``window``, ``min_count`` and ``epochs``.
        threshold: similarity threshold handed to ``semantic_search_and_display``.
            Defaults to 0.85, the value this cell was last run with.
        model_path: where the best model of the segment is saved. ``None``
            (the default) means ``final_model_path_segment6_run3``, the path
            this cell was last run with.

    Returns:
        ``(best_params, best_accuracy)``. ``best_params`` is ``None`` and nothing
        is saved when no parameter set scores above 0.

    NOTE(review): ``MultiLabelBinarizer.transform`` ignores labels it was not
    fitted on, so predicted prefixes outside the 15 expected ones do not count
    against a query. Confirm that this is the intended metric.
    """
    if model_path is None:
        # Looked up at call time so the default stays the path the original cell saved to.
        model_path = final_model_path_segment6_run3

    # Define user queries and expected filename letters
    user_queries = ["Mathematics", "Physics", "Show me law modules", "Spanish modules", "Art and Design", "Geography", "Computer Science", "I want to see mechanical engineering modules", "Politics modules", "English course modules", "Anatomy course modules", "Nursing", "Architecture modules", "Biology", "Psychology"]
    expected_filename_letters = [["MA"], ["PH"], ["LW"], ["PS"], ["DJ"], ["GE"], ["CS"], ["ME"], ["PO"], ["EN"], ["CA"], ["NU"], ["AR"], ["BS"], ["PY"]]

    # The expected labels are identical for every parameter set, so fit and encode them once.
    mlb = MultiLabelBinarizer()
    expected_labels = mlb.fit_transform(expected_filename_letters)

    best_accuracy = 0
    best_params = None

    for number_of_iterations, params in enumerate(segment.values(), start=1):

        # Clears the output every iteration to avoid memory issues
        display.clear_output(wait=True)

        print("=" * 50)
        print("Iteration:", number_of_iterations)
        print("Parameters:", params )
        print("")

        # Initialise and train Doc2Vec model with the current hyperparameters
        model = train_doc2vec_model(
            documents,
            vector_size=params['vector_size'],
            window=params['window'],
            min_count=params['min_count'],
            epochs=params['epochs'],
        )

        # Call all functions to reinsert vectorised documents into tables.
        # These two names are local to this function; helpers such as insert_vectorized_docs
        # presumably read module-level constants of the same name - confirm they agree.
        KEYSPACE_NAME = 'tables_home'
        TABLE_NAME = 'Modules'

        vectorized_docs = vectorize_existing_documents(model, existing_data_folder)

        # Drops previous table to avoid duplication issues
        drop_table_query = f"DROP TABLE IF EXISTS {KEYSPACE_NAME}.{TABLE_NAME}"
        execute_query_with_retry_vec(drop_table_query)

        create_table_query = f"""
        CREATE TABLE IF NOT EXISTS {KEYSPACE_NAME}.{TABLE_NAME} (
            filename TEXT PRIMARY KEY,
            doc_vector LIST<FLOAT>
        )
        """
        execute_query_with_retry_vec(create_table_query)

        index_creation_query = f"""
        CREATE CUSTOM INDEX IF NOT EXISTS semantic_search_index ON {KEYSPACE_NAME}.{TABLE_NAME} (doc_vector) USING 'StorageAttachedIndex'
        """
        execute_query_with_retry_vec(index_creation_query)

        insert_vectorized_docs(session, vectorized_docs, batch_size=15)

        # Retrieve relevant documents using semantic search for each user query
        all_predictions = []
        for query in user_queries:
            query_vector = vectorize_user_query(model, query)
            relevant_filenames = semantic_search_and_display(session, query_vector, threshold=threshold, top_results=5)
            # The two-letter filename prefix is the label compared against the expected prefix.
            all_predictions.append([filename[:2] for filename in relevant_filenames])

        # Encode the predictions with the binarizer fitted on the expected labels
        predicted_labels = mlb.transform(all_predictions)

        # Evaluate model based on accuracy score for all user queries
        # (for binarised label sets this is exact-match accuracy per query)
        avg_accuracy = accuracy_score(expected_labels, predicted_labels)

        print("average accuracy", avg_accuracy)
        print("")

        # Update best parameters if necessary and save the current best model for that segment
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_params = params
            model.save(model_path)

        del model

    return best_params, best_accuracy


# Run the chosen segment
# segments[5] is the sixth segment (0-based index), matching the "segment6" path that run_segment saves to.
# Evaluating a different segment means changing this index; the save path should change with it
# so that another segment's stored model is not overwritten.
best_params_segment, best_accuracy_segment = run_segment(segments[5])

print("Best hyperparameters found in the segment:")
print(best_params_segment)
print("Accuracy:", best_accuracy_segment)


## Test 2 for hyperparameters (with keywords included along with expected filenames)



In [None]:

# Not all of these paths are used for the final model (they are kept so that models trained in earlier runs stay stored and are not lost)

###################
#TEST 2 RUNTHROUGHS
###################
# Google Drive save paths for each segment's model, runthrough 1 (65% threshold) of test 2.
# The trailing comment on each line records the best result and parameters found for that segment.
final_model_path_segment1_test2 = '/content/drive/My Drive/module_info_doc2vec_model_segment1_test2' ## Best of segment: 67% success, parameters {'epochs': 3, 'min_count': 2, 'vector_size': 100, 'window': 6}
final_model_path_segment2_test2 = '/content/drive/My Drive/module_info_doc2vec_model_segment2_test2' ## Best of segment: 83% success, parameters {'epochs': 5, 'min_count': 3, 'vector_size': 300, 'window': 4} ## Best model tested
final_model_path_segment3_test2 = '/content/drive/My Drive/module_info_doc2vec_model_segment3_test2' ## Best of segment: 81% success, parameters {'epochs': 8, 'min_count': 1, 'vector_size': 250, 'window': 6}
final_model_path_segment4_test2 = '/content/drive/My Drive/module_info_doc2vec_model_segment4_test2' ## Best of segment: 77% success, parameters {'epochs': 10, 'min_count': 5, 'vector_size': 200, 'window': 2}
final_model_path_segment5_test2 = '/content/drive/My Drive/module_info_doc2vec_model_segment5_test2' ## Best of segment: 65% success, parameters {'epochs': 15, 'min_count': 5, 'vector_size': 50, 'window': 1}
final_model_path_segment6_test2 = '/content/drive/My Drive/module_info_doc2vec_model_segment6_test2' ## Best of segment: 59% success, parameters {'epochs': 20, 'min_count': 1, 'vector_size': 100, 'window': 2}

# Google Drive save paths for each segment's model, runthrough 2 (75% threshold) of test 2
final_model_path_segment1_test2_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment1_test2_run2' ## Best of segment: 68% success, parameters {'epochs': 3, 'min_count': 2, 'vector_size': 100, 'window': 6}
final_model_path_segment2_test2_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment2_test2_run2' ## Best of segment: 76% success, parameters {'epochs': 5, 'min_count': 3, 'vector_size': 50, 'window': 4}
final_model_path_segment3_test2_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment3_test2_run2' ## Best of segment: 69% success, parameters {'epochs': 8, 'min_count': 1, 'vector_size': 275, 'window': 5}
final_model_path_segment4_test2_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment4_test2_run2' ## Best of segment: 71% success, parameters {'epochs': 10, 'min_count': 5, 'vector_size': 200, 'window': 2}
final_model_path_segment5_test2_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment5_test2_run2' ## Best of segment: 60% success, parameters {'epochs': 15, 'min_count': 5, 'vector_size': 50, 'window': 2}
final_model_path_segment6_test2_run2 = '/content/drive/My Drive/module_info_doc2vec_model_segment6_test2_run2' ## Best of segment: 48% success, parameters {'epochs': 20, 'min_count': 4, 'vector_size': 50, 'window': 2}

# Google Drive save paths for each segment's model, runthrough 3 (85% threshold) of test 2
final_model_path_segment1_test2_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment1_test2_run3' ## Best of segment: 57% success, parameters {'epochs': 3, 'min_count': 2, 'vector_size': 275, 'window': 6}
final_model_path_segment2_test2_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment2_test2_run3' ## Best of segment: 69% success, parameters {'epochs': 5, 'min_count': 3, 'vector_size': 300, 'window': 4}
final_model_path_segment3_test2_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment3_test2_run3' ## Best of segment: 57% success, parameters {'epochs': 8, 'min_count': 5, 'vector_size': 300, 'window': 3}
final_model_path_segment4_test2_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment4_test2_run3' ## Best of segment: 57% success, parameters {'epochs': 10, 'min_count': 5, 'vector_size': 300, 'window': 2}
final_model_path_segment5_test2_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment5_test2_run3' ## Best of segment: 31% success, parameters {'epochs': 15, 'min_count': 1, 'vector_size': 200, 'window': 4}
final_model_path_segment6_test2_run3 = '/content/drive/My Drive/module_info_doc2vec_model_segment6_test2_run3' ## Best of segment: 20% success, parameters {'epochs': 20, 'min_count': 3, 'vector_size': 175, 'window': 5}


def _label_for_result(filename, expected_label, query_words, docs_by_tag):
    """Return the label scored for one search hit.

    The hit earns ``expected_label`` when its document text contains any of
    ``query_words`` as a substring; otherwise, or when no document is tagged
    with ``filename``, the two-letter filename prefix is used.
    """
    document = docs_by_tag.get(filename)
    if document is not None:
        document_text = ' '.join(document.words).lower()
        if any(word in document_text for word in query_words):
            return expected_label
    return filename[:2]


def run_segment(segment, threshold=0.85, model_path=None):
    """Grid-search one segment of hyperparameters with the Test 2 metric.

    For every parameter set in ``segment`` a Doc2Vec model is trained, the
    ``tables_home.Modules`` table is dropped, recreated and refilled with the
    document vectors from that model, and each benchmark query is run through
    ``semantic_search_and_display``. Every query contributes exactly five
    predictions: missing results are recorded as ``"None"``, and a returned
    document is credited with the expected prefix when its text contains a
    query keyword, otherwise with its own filename prefix.

    Args:
        segment: dict whose values are hyperparameter dicts with the keys
            ``vector_size``, ``window``, ``min_count`` and ``epochs``.
        threshold: similarity threshold handed to ``semantic_search_and_display``.
            Defaults to 0.85, the value this cell was last run with.
        model_path: where the best model of the segment is saved. ``None``
            (the default) means ``final_model_path_segment6_test2_run3``, the
            path this cell was last run with.

    Returns:
        ``(best_params, best_accuracy)``. ``best_params`` is ``None`` and nothing
        is saved when no parameter set scores above 0.
    """
    if model_path is None:
        # Looked up at call time so the default stays the path the original cell saved to.
        model_path = final_model_path_segment6_test2_run3

    results_per_query = 5

    # Define user queries and expected filename letters
    user_queries = ["Mathematics", "Physics", "Show me law modules", "Spanish modules", "Art and Design", "Geography", "Computer Science", "I want to see mechanical engineering modules", "Politics modules", "English course modules", "Anatomy course modules", "Nursing", "Architecture modules", "Biology", "Psychology"]
    expected_filename_letters_positions = ["MA", "PH", "LW", "PS", "DJ", "GE", "CS", "ME", "PO", "EN", "CA", "NU", "AR", "BS", "PY"]
    # One expected label per result slot: five consecutive copies of each query's prefix.
    expected_filename_letters = [
        label
        for label in expected_filename_letters_positions
        for _ in range(results_per_query)
    ]

    # NOTE(review): "module" is excluded but several queries say "modules", which passes this
    # filter and is then substring-matched, so any document containing "modules" is credited.
    # Short keywords such as "art" also match inside longer words. Confirm this is intended.
    excluded_words = ["module", "show", "me", "and", "i", "want", "to", "see"]

    # Index the training documents by their first tag once (first occurrence wins)
    # rather than scanning the whole list for every search hit.
    docs_by_tag = {}
    for doc in documents:
        docs_by_tag.setdefault(doc.tags[0], doc)

    best_accuracy = 0
    best_params = None

    for number_of_iterations, params in enumerate(segment.values(), start=1):

        # Clears the output every iteration to avoid memory issues
        display.clear_output(wait=True)

        print("=" * 50)
        print("Iteration:", number_of_iterations)
        print("Parameters:", params )
        print("Best Accuracy so far", best_accuracy)
        print("")

        # Initialise and train Doc2Vec model with the current hyperparameters
        model = train_doc2vec_model(
            documents,
            vector_size=params['vector_size'],
            window=params['window'],
            min_count=params['min_count'],
            epochs=params['epochs'],
        )

        # Call all functions to reinsert vectorised documents into tables.
        # These two names are local to this function; helpers such as insert_vectorized_docs
        # presumably read module-level constants of the same name - confirm they agree.
        KEYSPACE_NAME = 'tables_home'
        TABLE_NAME = 'Modules'

        vectorized_docs = vectorize_existing_documents(model, existing_data_folder)

        # Drops previous table to avoid duplication issues
        drop_table_query = f"DROP TABLE IF EXISTS {KEYSPACE_NAME}.{TABLE_NAME}"
        execute_query_with_retry_vec(drop_table_query)

        create_table_query = f"""
        CREATE TABLE IF NOT EXISTS {KEYSPACE_NAME}.{TABLE_NAME} (
            filename TEXT PRIMARY KEY,
            doc_vector LIST<FLOAT>
        )
        """
        execute_query_with_retry_vec(create_table_query)

        index_creation_query = f"""
        CREATE CUSTOM INDEX IF NOT EXISTS semantic_search_index ON {KEYSPACE_NAME}.{TABLE_NAME} (doc_vector) USING 'StorageAttachedIndex'
        """
        execute_query_with_retry_vec(index_creation_query)

        insert_vectorized_docs(session, vectorized_docs, batch_size=15)

        # Retrieve relevant documents using semantic search for each user query
        all_predictions = []
        for query, expected_label in zip(user_queries, expected_filename_letters_positions):
            query_vector = vectorize_user_query(model, query)
            relevant_filenames = semantic_search_and_display(session, query_vector, threshold=threshold, top_results=results_per_query)
            hits = list(relevant_filenames or [])[:results_per_query]

            query_words = [word.lower() for word in query.split() if word.lower() not in excluded_words]

            # Pad first, then score the hits, so every query fills exactly five slots and the
            # predictions stay aligned with expected_filename_letters even when a hit has no
            # matching document (previously such a hit added nothing and accuracy_score raised).
            all_predictions.extend(["None"] * (results_per_query - len(hits)))
            all_predictions.extend(
                _label_for_result(filename, expected_label, query_words, docs_by_tag)
                for filename in hits
            )

        print(all_predictions)
        print(expected_filename_letters)

        # Evaluate model based on accuracy score for all user queries
        avg_accuracy = accuracy_score(expected_filename_letters, all_predictions)

        print("average accuracy", avg_accuracy)
        print("")

        # Update best parameters if necessary and save the current best model for that segment
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_params = params
            model.save(model_path)

        del model

    return best_params, best_accuracy

# Run the chosen segment
# segments[5] is the sixth segment (0-based index), matching the "segment6" path that run_segment saves to.
# Evaluating a different segment means changing this index; the save path should change with it
# so that another segment's stored model is not overwritten.
best_params_segment, best_accuracy_segment = run_segment(segments[5])

print("Best hyperparameters found in the segment:")
print(best_params_segment)
print("Accuracy:", best_accuracy_segment)


# Semantic search function call (shows the model works as intended within Colab, without Anvil interference)

In [None]:
# Load the trained model
# (segment 2 of test 2, runthrough 1 - the path whose comment in the Test 2 cell marks it "Best model tested")
final_model_path_segment2_test2 = '/content/drive/My Drive/module_info_doc2vec_model_segment2_test2'
loaded_model = Doc2Vec.load(final_model_path_segment2_test2)

# Keyspace and table names interpolated into the CQL text by semantic_search_and_display_web
KEYSPACE_NAME = 'tables_home'
TABLE_NAME = 'Modules'

# presumably the table holding the original (non-vectorised) module text - not referenced in this cell; verify against its readers
NORMAL_TABLE_NAME = 'ModulesNonVectorized'

# Example usage:
query_text = "physics"
query_vector = vectorize_user_query(loaded_model, query_text)

# For the retry method
# (execute_query_with_retry reads these two globals: number of attempts, and seconds slept between attempts)
max_retries=10
retry_interval=5

def execute_query_with_retry():
    """Run the demo semantic search, retrying when it raises.

    Reads the globals ``session``, ``query_vector``, ``max_retries`` and
    ``retry_interval``. The search is attempted up to ``max_retries`` times with
    ``retry_interval`` seconds of sleep between attempts. Each failure is printed
    together with the exception that caused it, and a final message is printed if
    every attempt fails. This stays best-effort: nothing is raised and nothing is
    returned.
    """
    for attempt in range(max_retries):
        try:
            semantic_search_and_display(session, query_vector, threshold=0.65, top_results=5)
            print("Successfully executed query")
            break  # Successful, exit loop
        except Exception as e:
            # Show what went wrong instead of hiding it behind a bare "Retrying..." message.
            print(f"Query attempt {attempt + 1}/{max_retries} failed: {e!r}")
            # Only announce and wait for a retry when another attempt will actually follow.
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_interval} seconds...")
                time.sleep(retry_interval)
    else:
        # Reached only when the loop ends without a successful attempt.
        print(f"Giving up after {max_retries} attempts.")

# Run the demo search for the example query defined earlier in this cell
execute_query_with_retry()

Filename: PH31008.txt, Cosine Similarity: 0.9269089669960086
Original Document Text:
mathematical methods for physics module (ph31008) credits 15 module code ph31008 mathematical methods are essential for physics because they provide a powerful language for describing and analysing physical phenomena. in this module, you will cover the core mathematics which underpins much of the honourâ€™s physics programme. the module takes a more physical and less abstract approach to mathematics wherever possible and highlights real-world examples to help with your understanding of the topics. you can transfer the skills acquired from this module to several others. topics include: * vector calculus: line and surface integrals involving scalar or vector fields, gaussâ€™s divergence theorem, stokesâ€™ theorem, and greenâ€™s theorem * systems of linear equations in n dimensions * fourier transforms and the convolution theorem * the heat/diffusion equation * solutions of partial differential equations 

# Defining the semantic search function for the *webpage*

In [None]:
def semantic_search_and_display_web(session, query_vector, threshold, top_results):
    """Rank the stored module vectors against ``query_vector`` for the webpage.

    Every row of ``{KEYSPACE_NAME}.{TABLE_NAME}`` is fetched and scored by cosine
    similarity against the query. Rows scoring at least ``threshold`` are sorted
    from most to least similar and the first ``top_results`` are looked up.

    Returns:
        A list of dicts with the keys ``filename``, ``similarity``,
        ``original_text`` and ``summarised_text``. A hit is left out when either
        text lookup comes back empty; because that happens after the
        ``top_results`` cut, the list can be shorter than ``top_results``.
    """
    rows = session.execute(f"SELECT filename, doc_vector FROM {KEYSPACE_NAME}.{TABLE_NAME}")

    # L2-normalise the query once; each stored vector is normalised the same way below
    unit_query = normalize([query_vector], norm='l2')[0]

    # Score every stored document, keeping those that reach the threshold
    matches = []
    for row in rows:
        unit_doc = normalize([row.doc_vector], norm='l2')[0]
        score = cosine_similarity([unit_query], [unit_doc])[0][0]
        if score >= threshold:
            matches.append({'filename': row.filename, 'similarity': score})

    # Most similar first
    matches.sort(key=lambda match: match['similarity'], reverse=True)

    # Attach the original and summarised text to the best matches
    top_results_list = []
    for match in matches[:top_results]:
        name = match['filename']
        document_text = get_document_text_non_vectorized(session, name)
        summarised_text = get_document_text_summarised(session, name)

        if not (document_text and summarised_text):
            continue

        top_results_list.append({
            'filename': name,
            'similarity': match['similarity'],
            'original_text': document_text,
            'summarised_text': summarised_text
        })

    return top_results_list

## Cell to interact with the Anvil webpage

In [None]:
# Connect to Anvil with the Uplink key.
# The key is a credential, so it is read from the ANVIL_UPLINK_KEY environment variable
# (or typed at a hidden prompt when the variable is unset) instead of being stored in the notebook.
# NOTE(review): the key that used to be hard-coded on this line has been exposed and should be rotated in Anvil.
from getpass import getpass

anvil.server.connect(os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil Uplink key: "))

@anvil.server.callable
def semantic_search(query):
    """Anvil-callable entry point: search the modules for ``query``.

    The query is vectorised with the global ``loaded_model`` and passed to
    ``semantic_search_and_display_web`` with a 0.65 threshold and at most five
    results. The query and the raw results are printed in the notebook.

    Returns:
        One formatted string per result, holding the filename, the similarity,
        the original text and the summarised text.
    """
    print(f"Received query: '{query}'")

    vector = vectorize_user_query(loaded_model, query)

    # Execute the search and retrieve results
    hits = semantic_search_and_display_web(session, vector, threshold=0.65, top_results=5)
    print(f"Search results: {hits}")

    # One display string per hit, in ranking order
    return [
        f"{hit['filename']} - Similarity: {hit['similarity']}\nOriginal Text: {hit['original_text']}\n\nSummarised Text: {hit['summarised_text']}\n\n"
        for hit in hits
    ]

# Keep the notebook connected to Anvil indefinitely
# NOTE(review): presumably this call does not return, so cells placed after this one will not run until the kernel is interrupted - confirm.
anvil.server.wait_forever()

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default Environment" as SERVER
Received query: 'mathematics'
Search results: [{'filename': 'PH12004.txt', 'similarity': 0.9097860079487334, 'original_text': 'light and matter module (ph12004) credits 10 module code ph12004 in this module, you will cover topics which describe different models of light and its interactions with the material world. you will cover an introduction to modern physics, including optics, relativity, and quantum physics. you will be introduced to a wide selection of fundamental physics and real-world applications. the module combines workshops, laboratories, and problem classes and is designed for students who wish to specialise in physics at higher levels of study. however, the module is accessible to students with appropriate high school-level mathematics and physics knowledge. topics include: * refraction, reflection, and interference of light and simple optical instrumentation, all wit

## Evaluation step (not used anymore as this version of testing the hyperparameters was scrapped)

In [None]:
# Load your pre-trained model
# NOTE(review): this rebinds the global `loaded_model`, which the semantic-search demo cell assigns and the
# Anvil `semantic_search` callable reads - confirm the v1 model is the intended one if those are used after this cell.
model_path = '/content/drive/My Drive/module_info_doc2vec_model_v1'
loaded_model = Doc2Vec.load(model_path)

# Hold out 20% of the documents with a fixed random_state; only test_docs is used in the rest of this cell.
train_docs, test_docs = train_test_split(documents, test_size=0.2, random_state=42)

def top_k_accuracy(true_indices, predicted_indices):
    """Fraction of queries whose true item appears among its predictions.

    Args:
        true_indices: one ground-truth item per query.
        predicted_indices: one collection of predicted items per query, paired
            with ``true_indices`` by position.

    Returns:
        Number of hits divided by ``len(true_indices)``, or 0.0 when
        ``true_indices`` is empty (matching how the sibling metrics treat empty
        input instead of raising ZeroDivisionError).
    """
    total = len(true_indices)
    if total == 0:
        return 0.0
    hits = sum(
        1
        for true_index, top_predictions in zip(true_indices, predicted_indices)
        if true_index in top_predictions
    )
    return hits / total

def mean_reciprocal_rank(true_indices, predicted_indices):
    """Average of 1/rank of the true item within each query's predictions.

    A query whose true item is not among its predictions contributes 0.
    Returns 0.0 when there are no queries at all.
    """
    ranks = [
        1 / (top_predictions.index(true_index) + 1) if true_index in top_predictions else 0
        for true_index, top_predictions in zip(true_indices, predicted_indices)
    ]
    return sum(ranks) / len(ranks) if ranks else 0.0

def precision_at_k(true_indices, predicted_indices):
    """Hits divided by the number of queries that returned any prediction.

    A hit is a query whose true item is among its predictions. Queries with an
    empty prediction list are left out of the denominator. Returns 0.0 when no
    query returned anything.
    """
    hits = 0
    answered = 0
    for true_index, top_predictions in zip(true_indices, predicted_indices):
        hits += 1 if true_index in top_predictions else 0
        answered += 1 if top_predictions else 0
    return hits / answered if answered != 0 else 0.0


def get_ground_truth_indices(test_docs):
    """Collect the first entry of each test document's second field.

    Each item is presumably a TaggedDocument ``(words, tags)`` pair, in which
    case this returns every document's first tag - verify against the caller.
    """
    first_tags = []
    for doc in test_docs:
        tags = doc[1]
        first_tags.append(tags[0])
    return first_tags

def get_predicted_indices(model, test_docs):
    """Run the semantic search once per test document and collect the results.

    Args:
        model: Doc2Vec model used to vectorise each query. The previous version
            ignored this argument and always used the global ``loaded_model``.
        test_docs: iterable of ``(tokens, tags)`` pairs; the tokens are joined
            with spaces to form the query text.

    Returns:
        One entry per test document: whatever ``semantic_search_and_display``
        returned for it (threshold 0.8, at most 5 results).
    """
    predicted_indices = []

    for query, _tags in test_docs:
        # Always treat the query as a list of tokens
        query_vector = vectorize_user_query(model, ' '.join(query))

        # Perform semantic search to get similar documents
        similar_documents = semantic_search_and_display(session, query_vector, threshold=0.8, top_results=5)

        print("Query:", query)
        print("Filenames in similar_documents:", similar_documents)

        # Append all filenames obtained from the semantic search
        predicted_indices.append(similar_documents)

    print("Filenames in similar_documents:", predicted_indices)
    return predicted_indices

def evaluate_model(model, test_docs):
    """Score ``model`` on ``test_docs`` and print the three retrieval metrics.

    The ground truth comes from ``get_ground_truth_indices`` and the predictions
    from ``get_predicted_indices``. Top-k accuracy, mean reciprocal rank and
    precision are then computed and printed. Nothing is returned.
    """
    # Ground truth first, then one semantic search per test document
    truth = get_ground_truth_indices(test_docs)
    predictions = get_predicted_indices(model, test_docs)

    # Compute every metric before printing anything
    report = [
        ("Top-k Accuracy: {}", top_k_accuracy(truth, predictions)),
        ("Mean Reciprocal Rank: {}", mean_reciprocal_rank(truth, predictions)),
        ("Precision: {}", precision_at_k(truth, predictions)),
    ]

    # Print evaluation results
    divider = "=" * 50
    print("")
    print("Evaluation Results:")
    print(divider)
    for template, value in report:
        print(template.format(value))
    print(divider)

# Evaluate the pre-trained model
# on the 20% hold-out split created by train_test_split earlier in this cell
evaluate_model(loaded_model, test_docs)
