In [1]:
!pip install transformers torch faiss-cpu



In [2]:
!pip install pipeline



In [3]:
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

In [4]:
# Load the model and tokenizer for encoding documents and queries
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
import transformers

# Initialize the text generation pipeline
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    tokenizer=tokenizer,
    device_map="auto",
)

2024-07-19 10:04:14.920826: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-19 10:04:14.961893: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Define the terminators for generation
terminators = [
    pipeline.tokenizer.eos_token_id,
    #pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [9]:
# Function to generate a response
def generate_response(query, SubjectEntityID, relation, Subject, prompt2):
    # Retrieve relevant documents

    wiki, threshold_cosine = load_wiki(SubjectEntityID, relation, prompt2)
    relevant_docs = "Using this context to answer the question: "+ wiki
    
    # Combine the query with the retrieved documents
    context = query

    # Create the input messages for the generation model
    messages = [
        {"role": "system", "content": relevant_docs},
        {"role": "system", "content": "You are a chatbot who always responds an answer in english with comma and no explanation. If u don't know the answer, answer None"},
        {"role": "user", "content": context},
    ]

    # Generate the response
    outputs = pipeline(
        messages,
        max_new_tokens=3000,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
        pad_token_id = pipeline.tokenizer.eos_token_id
    )
    return outputs[0]["generated_text"], wiki, threshold_cosine

In [11]:
def info_valid(text, prompt):

    query = '''Is this information "''' + text + '''" can give an answer to question: ''' + prompt + " ?"
    # Create the input messages for the generation model
    messages = [
        {"role": "system", "content": "You are a chatbot who always responds an answer with 1 if Yes or 0 if No only"},
        {"role": "user", "content": query}
    ]

    # Generate the response
    outputs = pipeline(
        messages,
        max_new_tokens=1,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
        pad_token_id = pipeline.tokenizer.eos_token_id
    )
    return outputs[0]["generated_text"]

In [12]:
import json
with open('test.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

In [14]:
from SPARQLWrapper import SPARQLWrapper, JSON

def load_url(QID):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(f"""
        SELECT ?finalLink 
        WHERE {{
            BIND(wd:{QID} AS ?item)
            OPTIONAL {{
                ?enwiki schema:about ?item ;
                        schema:isPartOf <https://en.wikipedia.org/> .
                BIND(?enwiki AS ?finalLink)
            }}
            OPTIONAL {{
                ?otherwiki schema:about ?item ;
                        schema:isPartOf ?site .
                FILTER (?site != <https://en.wikipedia.org/>)
                BIND(?otherwiki AS ?finalLink)
            }}
            FILTER(BOUND(?finalLink))
        }}
        ORDER BY ?site
        LIMIT 1
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    if results["results"]["bindings"]:
        return results["results"]["bindings"][0]["finalLink"]["value"]
    else:
        return None

In [15]:
from sentence_transformers import SentenceTransformer, util
model2 = model = SentenceTransformer('all-MiniLM-L6-v2')



In [16]:
# Web Scraping

import pandas as pd
from bs4 import BeautifulSoup
import requests
from sentence_transformers import SentenceTransformer, util
import time

def load_wiki(QID, relation, prompt):
    resource = load_url(QID)
    if resource is None:
        return "Please use your own llm knowledge.", [0,0,0]
    
    res = requests.get(resource)

    if res.status_code != 200:
        return "Please use your own llm knowledge.", [0,0,0]
        
    soup = BeautifulSoup(res.content, 'html.parser')
    heading = soup.find('h1', {'id': 'firstHeading'})
    if heading != None:
        h = heading.get_text()

    documents = []

    global model2
    prompt_embedding = model2.encode(prompt, convert_to_tensor=True)
    
    urls = find_link(resource)
    url_name = []
    for i in range(len(urls)):
        name = urls[i].replace("https://en.wikipedia.org/wiki/", "").replace("_", " ")
        url_name.append(name)
    url_name_embedding = model2.encode(url_name, convert_to_tensor=True)
    url_cosine_similarities = util.pytorch_cos_sim(prompt_embedding, url_name_embedding)
    url_cosine_similarities_list = url_cosine_similarities.flatten().tolist()
    
    paragraphs = scrap_para(resource, prompt)
    chunk_size = 4500
    if paragraphs != None:
        for j in range(len(paragraphs)):
            if len(paragraphs[j]) > chunk_size:
                for k in range(0, len(paragraphs[j]), chunk_size):
                    documents.append(paragraphs[j][k:k+chunk_size])
            else:
                documents.append(paragraphs[j])

    infobox = scrap_infobox(resource, prompt)
    if infobox != None:
        for j in range(len(infobox)):
            documents.append(infobox[j])
        
    table = scrap_table(resource, prompt)
    if table != None:
        for j in range(len(table)):
            documents.append(table[j])


    
    for i in range(len(urls)):
        if url_cosine_similarities_list[i] >= 0.5:
            time.sleep(0.01)
            resq = requests.get(urls[i])
            soup = BeautifulSoup(resq.content, 'html.parser')
        
            head = soup.find('h1', {'id': 'firstHeading'})
            if head != None:
                headings = head.get_text()
                
            para = scrap_para(urls[i], prompt)
            if para != None:
                for j in range(len(para)):
                    if len(para[j]) > chunk_size:
                        for k in range(0, len(para[j]), chunk_size):
                            #print(para[j][k:k+chunk_size])
                            documents.append(para[j][k:k+chunk_size])
                    else:
                        #print(para[j])
                        documents.append(para[j])

            infbox = scrap_infobox(urls[i], prompt)
            if infbox != None:
                for j in range(len(infbox)):
                    documents.append(infbox[j])

            tabl = scrap_table(urls[i], prompt)
            if tabl != None:
                for j in range(len(tabl)):
                    documents.append(tabl[j])
    
    document_embeddings = model2.encode(documents, convert_to_tensor=True)
    
    cosine_similarities = util.pytorch_cos_sim(prompt_embedding, document_embeddings)

    cosine_similarities_list = cosine_similarities.flatten().tolist()

    sorted_cosine = sorted(cosine_similarities_list, reverse=True)

    
    maxlen = 20
    if len(sorted_cosine) < maxlen:
        maxlen = len(sorted_cosine)

    threshold_cosine = []
    threshold_cosine.append(sorted_cosine[0])
    threshold_cosine.append(sorted_cosine[maxlen-1])
    threshold_cosine.append(maxlen)
    
    index_list = []
    for i in range(maxlen):
        index = cosine_similarities_list.index(sorted_cosine[i])
        index_list.append(index)

    final_doc = ''
    for i in range(maxlen):
        final_doc += documents[index_list[i]] + "\n"
        
    return final_doc, threshold_cosine

In [17]:
def scrap_para(url, prompt):

    response = requests.get(url)

    if response.status_code == 200:
        
        soup = BeautifulSoup(response.content, "html.parser")

        paragraphs = soup.find_all('p')

        chunk_size = 4500
        valid_list = []
        for i, paragraph in enumerate(paragraphs):
            if len(paragraph.get_text()) > chunk_size:
                temp = remove_hyperlinks(paragraph.get_text())
                for k in range(0, len(temp), chunk_size):
                    if info_valid(temp[k:k+chunk_size], prompt):
                        valid_list.append(temp[k:k+chunk_size])
            else:
                if info_valid(paragraph.get_text(), prompt):
                    valid_list.append(remove_hyperlinks(paragraph.get_text()))
                
    else:
        return None


    return valid_list

In [18]:
def scrap_infobox(url, prompt):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    contents = ''
    # Wiki infobox
    datatable = soup.find("table", {"class": "infobox"})
    if datatable is not None:
        for tdcol in datatable.select('th[colspan], td[colspan]'):
            if tdcol.parent:
                tdcol.parent.decompose()

        try:
            tables = pd.read_html(str(datatable))
            if tables:  # Check if any tables were found
                df = tables[0]
                if not df.empty:
                    if isinstance(df.columns, pd.MultiIndex):
                        df.columns = df.columns.get_level_values(0)
                    df.columns = df.columns.astype(str)  # Convert column names to string
                    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
                    datatable_string = ' '.join(df.astype(str).values.flatten())
                    contents += ' ' + datatable_string
        except ValueError:
            # No tables found
            pass
            
    chunk_size = 4500
    valid_list = []
    contents = remove_hyperlinks(contents)
    if len(contents) > chunk_size:
        for k in range(0, len(contents), chunk_size):
            if info_valid(contents[k:k+chunk_size], prompt):
                valid_list.append(contents[k:k+chunk_size])
    else:
        if info_valid(contents, prompt):
            valid_list.append(contents)
            
    if contents != "":
        return valid_list
    else:
        return None

In [19]:
def scrap_table(url, prompt):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    contents = ''
    # Wiki table
    wikitable = soup.find("table", {"class": "wikitable"})
    if wikitable is not None:
        for tdcol in wikitable.select('th[colspan], td[colspan]'):
            if tdcol.parent:
                tdcol.parent.decompose()
        try:
            tables = pd.read_html(str(wikitable))
            if tables:  # Check if any tables were found
                df = tables[0]
                if not df.empty:
                    if isinstance(df.columns, pd.MultiIndex):
                        df.columns = df.columns.get_level_values(0)
                    df.columns = df.columns.astype(str)  # Convert column names to string
                    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
                    
                    # Concatenate column names with each row's values
                    rows = []
                    for index, row in df.iterrows():
                        row_string = ' '.join([f"{col}: {row[col]}" for col in df.columns])
                        rows.append(row_string)
                    wikitable_string = ' '.join(rows)
                    
                    contents += ' ' + wikitable_string
        except ValueError:
            # No tables found
            pass
            
    chunk_size = 4500
    valid_list = []
    contents = remove_hyperlinks(contents)
    if len(contents) > chunk_size:
        for k in range(0, len(contents), chunk_size):
            if info_valid(contents[k:k+chunk_size], prompt):
                valid_list.append(contents[k:k+chunk_size])
    else:
        if contents != "" and info_valid(contents, prompt):
            valid_list.append(contents)
            
    if contents != "":
        return valid_list
    else:
        return None

In [20]:
def find_link(url):
    skip_url_list = ['https://', ':']
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    urls = []
    for link in soup.find_all("a"):
      url = link.get("href", "")
      if "/wiki/" in url and "https://" not in url:
        if not any(skip_url in url for skip_url in skip_url_list):
          url = "https://en.wikipedia.org" + url
          urls.append(url)

    urls = list(dict.fromkeys(list(set(urls))))
    return urls

In [21]:
import re

def remove_hyperlinks(text):
    # remove_hyperlinks
    output_string = re.sub(r'\[\d+\]', '', text)
    output_string = re.sub(r'\[.*?\]', '', output_string)
    output_string = re.sub(r'\(\d{4}–\d{4}\)', '', output_string)
    return output_string

In [22]:
# Prompt for Knowledge Extraction
def load_prompt(SubjectEntity, Relation):
    question = ''
    if Relation == 'countryLandBordersCountry':
        question += "Which countries share land borders with " + SubjectEntity + " with country name only with comma? If None, answer None."
        
    if Relation == 'personHasCityOfDeath':
        question += "What is the city of death of " + SubjectEntity + "? answering with one city name only with no explanation. If there is no place of death mentioned, answer None"

    if Relation == 'seriesHasNumberOfEpisodes':
        question += "How many total episodes of series " + SubjectEntity + " ? answering with only one number ?"

    if Relation == 'awardWonBy':
        question += "Provide a name only list of all award winners in " + SubjectEntity + " with no explanation and name with comma ?"

    if Relation == 'companyTradesAtStockExchange':
        question += "Which stock exchange does " + SubjectEntity + " trade on ? answering with no explanation and name with comma? If None, answer None."
    
    return question

In [23]:
# Prompt for Web Scraping
def load_prompt2(SubjectEntity, Relation):
    question = ''
    if Relation == 'countryLandBordersCountry':
        question += "Which country share land border with " + SubjectEntity + " ?"
        
    if Relation == 'personHasCityOfDeath':
        question += "What is the city of death of " + SubjectEntity + " ?"

    if Relation == 'seriesHasNumberOfEpisodes':
        question += "How many episodes does series " + SubjectEntity + " has ?" 

    if Relation == 'awardWonBy':
        question += "Who has won " + SubjectEntity + " ?"

    if Relation == 'companyTradesAtStockExchange':
        question += "Which stock exchange does " + SubjectEntity + " trade on ?"
    
    return question

In [26]:
# Finding answers
answer = {}
web_scrap = []
threshold_list = []
for i in range(len(data)):
    prompt = load_prompt(data[i]['SubjectEntity'], data[i]['Relation'])
    prompt2 = load_prompt2(data[i]['SubjectEntity'], data[i]['Relation'])

    response, docs, threshold_cosine = generate_response(prompt, data[i]['SubjectEntityID'], data[i]['Relation'], data[i]['SubjectEntity'], prompt2)
    answer[prompt] = response[3]['content']
    #print(response)
    web_scrap.append(docs)
    threshold_list.append(threshold_cosine)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  tables = pd.read_html(str(datatable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(datatable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(datatable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(datatable))
  tables = pd.read_html(str(datatable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(datatable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(datatable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(datatable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(wikitable))
  tables = pd.read_html(str(datatable))
  tables = 

In [27]:
# Saving Results
df = pd.DataFrame(list(answer.items()), columns=['Key', 'Value'])

df.to_excel('answer.xlsx', index=False)

In [28]:
df = pd.DataFrame(web_scrap, columns=['Data'])

df.to_excel('web_scrap.xlsx', index=False)

In [29]:
df = pd.DataFrame(threshold_list, columns=['Max Cosine similarity', 'Min Cosine similarity', 'Maxlen'])

df.to_excel('cosine.xlsx', index=False)