In [84]:
import os
import tensorflow as tf

# Install the requests module
# !pip install requests

import requests
import xml.etree.ElementTree as ET
import pandas as pd

In [85]:
# device selection logic
if tf.config.list_physical_devices('GPU'):
    print("Using GPU")
    device = '/GPU:0'
else:
    print("Using CPU")
    device = '/CPU:0'

Using GPU


In [3]:
os.environ["OPENAI_API_KEY"] = ""
# os.environ["HUGGINGFACEHUB_API_TEKEN"] = ""

In [4]:
def get_sitemap_urls(sitemap_url):
    response = requests.get(sitemap_url)
    sitemap = response.content
    root = ET.fromstring(sitemap)
    urls = [url.text for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc')]
    return urls

In [5]:
# Get URLs from sitemap
sitemap_url = 'https://www.hull.ac.uk/sitemap.xml'
urls = get_sitemap_urls(sitemap_url)

In [16]:
from langchain.document_loaders import UnstructuredURLLoader
while tf.device(device):
    loaders = UnstructuredURLLoader(urls=urls)
    data = loaders.load()

### save the data

In [18]:
# data_str = '\n'.join(doc.page_content for doc in data)

# # Save the string to a text file
# with open('data.txt', 'w') as file:
#     file.write(data_str)

import json
# Prepare data for saving
data_to_save = [{'page_content': doc.page_content, 'metadata': doc.metadata} for doc in data]

# Save data to a JSON file
with open('data.json', 'w') as file:
    json.dump(data_to_save, file)

### Load data

In [20]:
# Load data from the JSON file
with open('data.json', 'r') as file:
    data = json.load(file)

In [30]:
from langchain.text_splitter import CharacterTextSplitter

# Define the Document class
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

# Initialize the text splitter
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=10000, chunk_overlap=2000)

with open('data.json', 'r') as file:
    data = json.load(file)

# Convert dictionaries to Document objects
docs = [Document(doc['page_content'], doc['metadata']) for doc in data]

# Use the text splitter to split documents
split_docs = text_splitter.split_documents(docs)

# Load embeddings
# embeddings_array = np.load('embeddings.npy')

In [46]:
split_docs[1660]

Document(page_content='500\nThe page you were visiting has generated an error. You could go straight to our home page?', metadata={'source': 'https://www.hull.ac.uk/work-with-us/research/groups/positron-emission-tomography-research-centre.aspx'})

In [33]:
import pickle
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

In [46]:
# embeddings = OpenAIEmbeddings(model="text-embeddings-ada-002", 
#                              openai_api_type=None,
#                               chunk_size=1000,
#                               max_retries=6)

In [48]:
# import os
# print(os.getenv('OPENAI_API_KEY'))

In [None]:
# vectorStore_openAI = FAISS.from_documents(docs, embeddings)
# with open("faiss_store_openai.pkl", "wb") as f:
#     pickle.dump(vectorStore_openAI, f)

In [None]:
# with open("faiss_store_openai.pkl", "rb") as f:
#     VectorStore = pickle.load(f)

In [49]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

with tf.device(device):
    # Function to handle texts longer than 512 tokens
    def generate_embeddings(text):
        stride = 128  # Overlap size
        max_len = 512  # Max token length for BERT
        tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_len)
        token_chunks = [tokens[i:i+max_len] for i in range(0, len(tokens), max_len-stride)]
        
        # Initialize a list to hold embeddings of each chunk
        chunk_embeddings = []
        
        for chunk in token_chunks:
            # Generate embeddings for each chunk
            with torch.no_grad():  # Disable gradient calculation for inference
                outputs = model(**chunk)
                chunk_embeddings.append(outputs.last_hidden_state.mean(1))
        
        # Combine chunk embeddings by averaging
        embeddings = torch.mean(torch.stack(chunk_embeddings), dim=0)
        return embeddings
    
    # Generate embeddings for each document
    embeddings = [generate_embeddings(doc.page_content) for doc in split_docs]
    
    # Convert embeddings to numpy array and save to disk
    embeddings_array = torch.stack(embeddings).numpy()
    np.save('bert_embeddings.npy', embeddings_array)

### Save Metadata

In [50]:
# Save metadata to a JSON file
metadata_list = [doc.metadata for doc in split_docs]
with open('metadata.json', 'w') as f:
    json.dump(metadata_list, f)

### Loading Embeddings and Metadata from Disk

In [51]:
# Load embeddings from the NumPy binary file
loaded_embeddings = np.load('bert_embeddings.npy')

# Load metadata from the JSON file
with open('metadata.json', 'r') as f:
    loaded_metadata = json.load(f)

### Convert embeddings to 2D numpy array

In [66]:
print(loaded_embeddings.shape)

(3460, 1, 768)


In [67]:
import faiss

# Reshape loaded_embeddings from (3460, 1, 768) to (3460, 768)
loaded_embeddings = loaded_embeddings.reshape(loaded_embeddings.shape[0], loaded_embeddings.shape[2])

# Ensure the embeddings are of type float32
if loaded_embeddings.dtype != np.float32:
    loaded_embeddings = loaded_embeddings.astype(np.float32)

# Initialize the FAISS index
dimension = loaded_embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for similarity

# Add embeddings to the index
index.add(loaded_embeddings)  # Now the index is ready for searching

In [77]:
# Convert user query to embedding
query_text = "How do i apply for phd?"  # Replace with the actual user query
query_embedding = generate_embeddings(query_text)

# Use FAISS to find top k nearest document embeddings to the query embedding
k = 5  # Number of nearest neighbors
D, I = index.search(query_embedding, k)

In [None]:
openai.api_key = ''

In [80]:
from openai import OpenAI
client = OpenAI(max_tokens=50)

for idx in I[0]:
    document_content = split_docs[idx].page_content  # Retrieve the content of the relevant document
    source_url = loaded_metadata[idx]['source']  # Retrieve the source URL of the relevant document
    
    # Construct the conversation
    response = client.chat.completions.create(
      model="gpt-3.5-turbo",
      messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": query_text},
        {"role": "assistant", "content": document_content},
        {"role": "user", "content": "Can you give me more information about this?"}
      ]
    )

    # Extract the generated answer from the response
    # Check if 'choices' is in the response and extract the content safely
    if 'choices' in response and response['choices']:
        answer = response['choices'][0]['message']['content']
    else:
        answer = 'No response generated'

    # Print or process the answer and the source URL
    print("Answer:", answer)
    print("Source:", source_url)

Answer: No response generated
Source: https://www.hull.ac.uk/faculties/subjects/features/computer-science-industry-placements.aspx
Answer: No response generated
Source: https://www.hull.ac.uk/work-with-us/more/media-centre/news/2022/studying-in-the-uk-as-an-international-student.aspx
Answer: No response generated
Source: https://www.hull.ac.uk/work-with-us/more/media-centre/news/2021/the-time-had-come-for-me-to-get-the-education-i-wanted.aspx
Answer: No response generated
Source: https://www.hull.ac.uk/faculties/subjects/features/graduate-q-and-a-grace-marner-bsc-biology.aspx
Answer: No response generated
Source: https://www.hull.ac.uk/clearing/student-finance.aspx


In [81]:
print(response)

ChatCompletion(id='chatcmpl-8mgjlsCKC6YQsleYoEQ1Joax2baoa', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Of course! Applying for a PhD (Doctor of Philosophy) typically involves several steps. Here is a general overview:\n\n1. Research Your Field: Start by exploring different research areas and topics within your field of interest. Look for potential supervisors or research groups that align with your research interests.\n\n2. Contact Potential Supervisors: Reach out to professors or researchers who specialize in your area of interest. Introduce yourself, explain your research interests, and inquire about potential PhD opportunities in their research group. Building relationships with potential supervisors is crucial, as they can guide you through the application process.\n\n3. Prepare Your Application Materials: Once you have found a potential supervisor and research topic, you will need to prepare your application materials, whic

In [82]:
print("Answer:", answer)
print("Source:", source_url)  # This should be the URL of the document that was used to generate the context for the answer.


Answer: No response generated
Source: https://www.hull.ac.uk/clearing/student-finance.aspx
