In [2]:
#!pip install sentence-transformers
#!pip install beautifulsoup4 requests sentence-transformers
#!pip install datasets
#!pip install --upgrade huggingface_hub
#!pip install fastparquet
#!pip install transformers[torch]
#!pip install --upgrade torch torchvision
#!pip install --upgrade pytorch
#!pip install --upgrade transformers
#!pip install transformers -U
#!pip install openai
#!pip install flask

In [1]:
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, InputExample, evaluation, SentencesDataset, losses, SentenceTransformerTrainingArguments
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import cosine_similarity
from sentence_transformers import SentencesDataset, losses
import faiss
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, f1_score
from dotenv import load_dotenv

## Part 1 Setup Vector Functions

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embeddings(texts):
    return model.encode(texts)



In [3]:
def fetch_wikipedia_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find(id='mw-content-text')
    paragraphs = [p.get_text() for p in content.find_all('p')]
    return paragraphs

In [4]:
def read_texts_from_directory(directory):
    texts = []
    filenames = []
    # Iterate through the files in the directory
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
                filenames.append(filename)
    return texts, filenames

In [5]:
def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

In [6]:
"""def main(directory):
    texts, filenames = read_texts_from_directory(directory)
    embeddings = get_embeddings(texts)
    index = create_faiss_index(np.array(embeddings))
    return index, texts, filenames"""

'def main(directory):\n    texts, filenames = read_texts_from_directory(directory)\n    embeddings = get_embeddings(texts)\n    index = create_faiss_index(np.array(embeddings))\n    return index, texts, filenames'

In [7]:
def main(url):
    paragraphs = fetch_wikipedia_page(url)
    embeddings = get_embeddings(paragraphs)
    index = create_faiss_index(np.array(embeddings))
    return index, paragraphs

In [8]:
def update_index_with_new_page(url, index, paragraphs):
    new_paragraphs = fetch_wikipedia_page(url)
    new_embeddings = get_embeddings(new_paragraphs)
    if new_embeddings.size > 0:
        index.add(new_embeddings)
    paragraphs.extend(new_paragraphs)
    return index, paragraphs

In [9]:
def search_index(query, index, texts, k=5):
    query_embedding = model.encode([query])[0]
    distances, indices = index.search(np.array([query_embedding]), k)
    return [(texts[i], distances[0][j]) for j, i in enumerate(indices[0])]

## Part 2 - Testing Functionalities, import data from Wikipedia URL and perform search

In [10]:
#directory = 'text/'
#index, texts, filenames = main(directory)

url = 'https://en.wikipedia.org/wiki/National_Basketball_Association'
index, paragraphs = main(url)
url2 = 'https://en.wikipedia.org/wiki/LeBron_James'
index, paragraphs = update_index_with_new_page(url2, index, paragraphs)
url3= 'https://en.wikipedia.org/wiki/2024_NBA_playoffs'
index, paragraphs = update_index_with_new_page(url3, index, paragraphs)

#url = 'https://en.wikipedia.org/wiki/2024_NBA_playoffs'
#index, paragraphs = main(url)

In [11]:
query = "In what year was the NBA founded?"
results = search_index(query, index, paragraphs, 3)
for text, distance in results:
    print(f"Text: {text}, Distance: {distance}")

Text: The league was founded in New York City on June 6, 1946, as the Basketball Association of America (BAA).[1] It changed its name to the National Basketball Association on August 3, 1949, after merging with the competing National Basketball League (NBL).[4] In 1976, the NBA and the American Basketball Association (ABA) merged, adding four franchises to the NBA. The NBA's regular season runs from October to April, with each team playing 82 games. The league's playoff tournament extends into June, culminating with the NBA Finals championship series. As of 2020[update], NBA players are the world's best paid athletes by average annual salary per player.[5][6][7]
, Distance: 0.5894463062286377
Text: The Basketball Association of America was founded in 1946 by owners of the major ice hockey arenas in the Northeastern and Midwestern United States and Canada. On November 1, 1946, in Toronto, Ontario, Canada, the Toronto Huskies hosted the New York Knickerbockers at Maple Leaf Gardens, in a

In [12]:
query = "Tell me about NBA Playoff Eastern Conference Finals between Celtics and Pacers"
results = search_index(query, index, paragraphs)
for text, distance in results:
    print(f"Text: {text}\nDistance: {distance}\n")

Text: This was the seventh playoff meeting between these two teams, with the Celtics winning four of the first six meetings.[71]

Distance: 0.722591757774353

Text:  The top-seeded Celtics responded emphatically to their home loss with a wire-to-wire 104–84 victory over the Heat, retaking the series lead. Led by Jayson Tatum and Jaylen Brown's 22 points each, the Celtics improved upon their defense and held Miami to a season-low 84 points. Tatum also contributed 11 rebounds and six assists, while Kristaps Porziņģis and Derrick White added 18 and 16 points respectively. The Heat, who trailed by as much as 29, struggled to find an offensive rhythm, with Bam Adebayo and Tyler Herro combining on 13-of-34 (38%) from the field. With the win, the Celtics improved to 15–4 immediately following a loss this season and improved to 6–1 in their last seven road playoff games against Miami.

Distance: 0.7382923364639282

Text:  The Celtics' 7–0 run in the final 3:30 secured their first sweep in the 

In [13]:
def interact_with_user(index, paragraphs):
    while True:
        query = input("Enter your query (type 'exit' to quit): ")
        if query.lower() == 'exit':
            break
        k = int(input("How many results would you like to see? "))
        results = search_index(query, index, paragraphs, k)
        print("Top {} results:".format(k))
        for text, distance in results:
            print(f"Text: {text}\nDistance: {distance}\n")

In [14]:
interact_with_user(index, paragraphs)

Enter your query (type 'exit' to quit): exit


## Part 3 - Fine-Tuning Sentence BERT

In [14]:
dataset = pd.read_parquet('train-00000-of-00002.parquet', engine='fastparquet')

In [15]:
dataset = dataset.iloc[:10000]
dataset

Unnamed: 0,question,answer
0,is toprol xl the same as metoprolol?,Metoprolol succinate is also known by the bran...
1,are you experienced cd steve hoffman?,The Are You Experienced album was apparently m...
2,how are babushka dolls made?,"Matryoshka dolls are made of wood from lime, b..."
3,are eyes always the same size?,The eyes are always the same size from birth t...
4,how long do you have to wait to apply for cerb?,Re-apply for the CERB If you continue to meet ...
...,...,...
9995,is it against the law to drive without shoes?,"Despite conflicting information, it's not tech..."
9996,how auxins could cause hydrotropism?,What causes hydrotropism in plants? A class of...
9997,how many calories are in a jif peanut butter t...,"Jif To Go Creamy Peanut Butter, 12 oz: 250 cal..."
9998,what does vqa mean in bc?,BC VQA (Vintners Quality Alliance) is the appe...


In [17]:
dataset['label'] = 1
df_shuffled = dataset.sample(frac=1).reset_index(drop=True)

negative_pairs = pd.DataFrame({
    'question': dataset['question'],
    'answer': df_shuffled['answer'],
    'label': 0  # Label these pairs as negative
})

negative_pairs = negative_pairs[negative_pairs['answer'] != dataset['answer']]

In [19]:
combined_df = pd.concat([dataset, negative_pairs]).reset_index(drop=True)

In [20]:
train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=42)

In [25]:
train_examples = [InputExample(texts=[row['question'], row['answer']], label=row['label']) for index, row in train_df.iterrows()]
val_examples = [InputExample(texts=[row['question'], row['answer']], label=row['label']) for index, row in val_df.iterrows()]

In [26]:
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)

val_dataset = SentencesDataset(val_examples, model)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=16)

In [23]:
train_loss = losses.ContrastiveLoss(model=model)

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device: {device.type}")

Training on device: cpu


In [36]:
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name='val')

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)

Step,Training Loss
500,0.0617
1000,0.0451


### Validation

In [37]:
val_embeddings1 = model.encode([example.texts[0] for example in val_examples], convert_to_tensor=True)
val_embeddings2 = model.encode([example.texts[1] for example in val_examples], convert_to_tensor=True)

In [38]:
similarities = cosine_similarity(val_embeddings1, val_embeddings2)

In [39]:
similarity_scores = similarities.cpu().numpy()
true_labels = np.array([example.label for example in val_examples])

In [40]:
correlation, _ = pearsonr(similarity_scores, true_labels)
print(f"Pearson Correlation: {correlation}")

Pearson Correlation: 0.9629695601855343


In [42]:
predictions = (similarity_scores > 0.5).astype(int)

accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.99025
F1 Score: 0.9900383141762452


## Part 4 - Implement LLM

In [14]:
load_dotenv()
access_token = os.getenv('HUGGING_FACE_KEY')
access_token = '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'
model_name = "google/gemma-2b"

In [15]:
# # Load tokenizer and model from Hugging Face using access token
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=access_token)
# model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=access_token)

# # Save tokenizer and model to cache
# tokenizer.save_pretrained(f"cache/tokenizer/{model_name}")
# model.save_pretrained(f"cache/model/{model_name}")

# Reload tokenizer and model from cache
tokenizer = AutoTokenizer.from_pretrained(f"cache/tokenizer/{model_name}")
llm_model = AutoModelForCausalLM.from_pretrained(f"cache/model/{model_name}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
def generate_response(query, index, texts, model, tokenizer, k=5):
    search_results = search_index(query, index, texts, k)
    context = " ".join([text for text, _ in search_results])
    
    # Combine the query and the context with a clear separator
    input_text = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    
    outputs = model.generate(input_ids, max_new_tokens=150, temperature=0.7, top_p=0.9)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    answer_start = response.find("Answer:")
    if answer_start != -1:
        generated_answer = response[answer_start + len("Answer:"):].strip()
    else:
        generated_answer = response.strip()
    
    generated_answer = generated_answer.split("Question:")[0].strip()
    
    return generated_answer

In [24]:
query = "Tell me about NBA Playoff Eastern Conference Finals between Celtics and Pacers in 2024"
print(generate_response(query, index, paragraphs, llm_model, tokenizer))

The 2024 NBA Eastern Conference Finals will be a best-of-seven series between the Boston Celtics and the Indiana Pacers. The Celtics are the top seed in the Eastern Conference, while the Pacers are the sixth seed. The series will be played from April 20 to May 1, with the winner advancing to the 2024 NBA Finals.


## Part 5 - WebUI

In [66]:
from flask import Flask, render_template, escape, request, jsonify, render_template_string
import requests
import openai
import threading

app = Flask(__name__)

In [67]:
@app.route('/')
def home():
    return render_template_string('index.html')
    #return render_template_string(HTML_TEMPLATE)

@app.route('/result', methods=['POST'])
def result():
    user_query = request.form['query']
    print(user_query)
    response = generate_response(user_query, index, paragraphs, llm_model, tokenizer)
    print(response)
    return jsonify({'response': response})

def run_app():
    app.run(port=5000, use_reloader=False, debug=True)

In [68]:
def shutdown_server():
    func = request.environ.get('werkzeug.server.shutdown')
    if func is None:
        raise RuntimeError('Not running with the Werkzeug Server')
    func()

@app.route('/shutdown', methods=['POST'])
def shutdown():
    shutdown_server()
    return 'Server shutting down...'

def stop_flask():
    requests.post('http://localhost:5000/shutdown')

In [69]:
flask_thread = threading.Thread(target=run_app)
flask_thread.start()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [10/Jun/2024 22:43:07] "GET / HTTP/1.1" 200 -


Tell me about NBA Playoff Eastern Conference Finals between Celtics and Pacers in 2024


127.0.0.1 - - [10/Jun/2024 22:44:12] "POST /result HTTP/1.1" 200 -


The 2024 NBA Eastern Conference Finals will be a best-of-seven series between the Boston Celtics and the Indiana Pacers. The Celtics are the top seed in the Eastern Conference, while the Pacers are the sixth seed. The series will be played from April 20 to May 1, with the winner advancing to the 2024 NBA Finals.


In [65]:
# To stop the thread, if necessary
stop_flask()

  func()
127.0.0.1 - - [10/Jun/2024 22:42:19] "POST /shutdown HTTP/1.1" 200 -
