In [1]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.schema import Document
from langchain.vectorstores.pgvector import DistanceStrategy
from langchain.vectorstores.pgvector import PGVector
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
# Postgres/pgvector connection string. Set PGVECTOR_CONNECTION_STRING to point
# the notebook at a different database without editing this cell; the fallback
# is the local development instance the notebook was written against.
CONNECTION_STRING = os.environ.get(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql://postgres:123456@localhost:5433/wateraid",
)

# Load the listings and replace every missing value with the literal string
# "NA". Done as one chained expression (no inplace mutation) so re-running the
# cell always rebuilds `df` from the file.
df = pd.read_csv('Listings_Details.csv').fillna("NA")
df.head()

Unnamed: 0,Listing URL,Name of Activity,Date,Location,Event Synopsis,Event Description,Registration Link,Activity Category
0,https://www.wateraid.org//uk/get-involved/givi...,Hire our Handwashing Hubs,,,Hire our contactless handwashing facilities to...,Frequent handwashing is one of the most effect...,,giving
1,https://www.wateraid.org//uk/get-involved/even...,Swim Serpentine,14 September 2024,"Hyde Park, London",Enjoy a late summer swim in the beautiful surr...,Take in the views with a leisurely half mile o...,https://forms.office.com/Pages/ResponsePage.as...,events
2,https://www.wateraid.org//uk/get-involved/teac...,World Water Day activities,,,Use World Water Day on 22 March to introduce y...,703 million people in the world – that's almos...,,teaching resources
3,https://www.wateraid.org//uk/get-involved/teac...,Educational resources from your local water co...,,,Discover the education opportunities your loca...,"In 1981, WaterAid was set up by members of the...",,teaching
4,https://www.wateraid.org//uk/get-involved/givi...,Muslim Faith Giving,,,"Around the world, many Muslim communities do n...",Clean water and good hygiene means families ca...,,giving


In [3]:
# Build the text that is embedded for each listing: activity name, location,
# synopsis and full description joined into a single string.
# Columns are addressed by name rather than by integer key: an integer key on
# a label-indexed row (``row[1]``) relies on a positional fallback that pandas
# has deprecated, and it silently breaks if the column order ever changes.
combined = (
    df["Name of Activity"]
    + ". Location is "
    + df["Location"]
    + ". "
    + df["Event Synopsis"]
    + " "
    + df["Event Description"]
).tolist()

df['combined'] = combined

In [4]:
# Preview the first five rows to check the new 'combined' column.
df.head(n=5)


Unnamed: 0,Listing URL,Name of Activity,Date,Location,Event Synopsis,Event Description,Registration Link,Activity Category,combined
0,https://www.wateraid.org//uk/get-involved/givi...,Hire our Handwashing Hubs,,,Hire our contactless handwashing facilities to...,Frequent handwashing is one of the most effect...,,giving,Hire our Handwashing Hubs. Location is NA. Hir...
1,https://www.wateraid.org//uk/get-involved/even...,Swim Serpentine,14 September 2024,"Hyde Park, London",Enjoy a late summer swim in the beautiful surr...,Take in the views with a leisurely half mile o...,https://forms.office.com/Pages/ResponsePage.as...,events,"Swim Serpentine. Location is Hyde Park, London..."
2,https://www.wateraid.org//uk/get-involved/teac...,World Water Day activities,,,Use World Water Day on 22 March to introduce y...,703 million people in the world – that's almos...,,teaching resources,World Water Day activities. Location is NA. Us...
3,https://www.wateraid.org//uk/get-involved/teac...,Educational resources from your local water co...,,,Discover the education opportunities your loca...,"In 1981, WaterAid was set up by members of the...",,teaching,Educational resources from your local water co...
4,https://www.wateraid.org//uk/get-involved/givi...,Muslim Faith Giving,,,"Around the world, many Muslim communities do n...",Clean water and good hygiene means families ca...,,giving,Muslim Faith Giving. Location is NA. Around th...


In [5]:
# Wrap the DataFrame in a LangChain loader. The 'combined' column is used as
# each document's page content, i.e. the text embeddings are created for.
loader = DataFrameLoader(data_frame=df, page_content_column="combined")
docs = loader.load()

In [6]:
# Embedding model used to index the listings here; the same object is reused
# later to embed queries against the stored vectors.
embeddings = HuggingFaceEmbeddings()

# Embed every listing document and store it in the "listings_documents"
# collection using cosine distance.
# pre_delete_collection=True drops an existing collection of that name first,
# so re-running this cell rebuilds the index instead of appending a second
# copy of every listing (which would make similarity search return duplicates).
db = PGVector.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name="listings_documents",
    distance_strategy=DistanceStrategy.COSINE,
    connection_string=CONNECTION_STRING,
    pre_delete_collection=True,
)

  from tqdm.autonotebook import tqdm, trange


In [8]:
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM

# watsonx.ai connection settings.
# The API key is a secret and must not live in the notebook: it is read from
# the WATSONX_APIKEY environment variable, falling back to an interactive
# (non-echoing) prompt so it never ends up in the file or its outputs.
# NOTE(review): any key previously committed in this notebook should be
# revoked and rotated.
wxa_url = os.environ.get("WATSONX_URL", "https://eu-gb.ml.cloud.ibm.com")
wxa_api_key = os.environ.get("WATSONX_APIKEY") or getpass("watsonx.ai API key: ")
wxa_project_id = os.environ.get(
    "WATSONX_PROJECT_ID", "f65d106f-b186-418b-8c00-f67cd14f95cf"
)

# Set up Watsonx Granite LLM Model

def LLM_set_up():
    """Create the watsonx.ai Granite model and wrap it for LangChain.

    Generation uses greedy decoding and produces between 1 and 500 new
    tokens. The endpoint, API key and project id are taken from the
    module-level ``wxa_url``, ``wxa_api_key`` and ``wxa_project_id``.

    Returns:
        A ``WatsonxLLM`` instance wrapping ``ibm/granite-13b-instruct-v2``.
    """
    generation_params = {
        GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
        GenParams.MIN_NEW_TOKENS: 1,
        GenParams.MAX_NEW_TOKENS: 500,
    }
    service_credentials = {"url": wxa_url, "apikey": wxa_api_key}

    granite_model = Model(
        model_id="ibm/granite-13b-instruct-v2",
        params=generation_params,
        credentials=service_credentials,
        project_id=wxa_project_id,
    )

    return WatsonxLLM(model=granite_model)

In [14]:
# TESTING USAGE / QUERY / SEARCH OF VECTOR DATABASE
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Open the "listings_documents" collection with the same embedding function
# and distance strategy used when it was populated.
store = PGVector(
    connection_string=CONNECTION_STRING,
    embedding_function=embeddings,
    collection_name="listings_documents",
    distance_strategy=DistanceStrategy.COSINE,
)

query = "I am based in Newcastle. I like to hike and run in my free time. I have experiences as a teacher. What activities are recommended for me based on the activities in the context provided - give the specific name of the activity? Give a reason why each activity is recommended for me."

# The six nearest listings are retrieved and passed to the LLM as context.
retriever = store.as_retriever(search_kwargs={"k": 6})

# NOTE(review): the prompt text is left byte-for-byte unchanged so model output
# stays comparable; "recommnedations" is misspelled and is worth correcting
# when the prompt is next tuned.
prompt = PromptTemplate(template="""

    Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
    {context}

    ##Question:{question} \n\

    ##Top 3 recommnedations of activities:\n""",input_variables=["context","question"])

chain_type_kwargs = {"prompt": prompt}

# return_source_documents=True makes the chain return the documents it
# retrieved, so the links printed below come from the very retrieval that
# produced the answer rather than from a second, separate search.
qa = RetrievalQA.from_chain_type(
    llm=LLM_set_up(),
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
    verbose=True,
)

# With two output keys the chain cannot be called through .run(); invoke()
# returns a dict containing both "result" and "source_documents".
response = qa.invoke({"query": query})
res = response["result"]
retrieved_docs = response["source_documents"]

# One listing URL per retrieved document, each followed by a newline.
links = "".join(doc.metadata['Listing URL'] + "\n" for doc in retrieved_docs)

print(res)
print(links)




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
- Hike - because I like to hike and run in my free time and I have experiences as a teacher.
- Run - because I like to hike and run in my free time and I have experiences as a teacher.
- Swim Marathon - because I like to hike and run in my free time and I have experiences as a teacher.
https://www.wateraid.org//uk/get-involved/events/the-severn-trent-mountain-challenge
https://www.wateraid.org//uk/get-involved/events/ridelondon-100
https://www.wateraid.org//uk/get-involved/events/great-north-run
https://www.wateraid.org//uk/get-involved/volunteering/community-events-team-volunteer
https://www.wateraid.org//uk/get-involved/events/swim-marathon
https://www.wateraid.org//uk/get-involved/volunteering/volunteer-at-an-event

