In [1]:
import requests
import os
from importlib.machinery import SourceFileLoader
import pandas as pd
import logging
import torch
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)


try:
    config = SourceFileLoader("config", "config.py").load_module()
    os.environ['OPENAI_API_KEY'] = config.OPENAI_API_KEY

    bungie_api_key = config.BUNGIE_API_KEY
    endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
    page_token = "0"
    include_body = True
    headers = {
        "X-API-Key": bungie_api_key
    }
    params = {
        "includebody": include_body
    }

    results = []

    while page_token is not None:
        response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

        if response.status_code == 200:
            json_response = response.json()
            if 'NewsArticles' in json_response['Response']:
                results.extend(json_response['Response']['NewsArticles'])
            else:
                print("No NewsArticles found in the response.")
                break
            page_token = json_response['Response']['NextPaginationToken']
        else:
            print("Error:", response.status_code)
            print("Response:", response.text)
            break
except Exception as e:
    print("An error occurred:", str(e))


An error occurred: 'NextPaginationToken'


In [2]:
results[0]

{'Title': 'Destiny 2 Update 7.1.0.2',
 'Link': '/7/en/News/Article/update_7_1_0_2',
 'PubDate': '2023-06-13T15:00:00Z',
 'UniqueIdentifier': 'blteaf9327f612fadd7',
 'Description': 'The one about Armor fixes',
 'HtmlContent': '<h2>Activities  </h2><h3>Crucible </h3><ul><li>Fixed an issue where the Dazzling Iridescence Trials emblem could be awarded from the Flawless chest without completing the necessary requirements. <ul><li>Players who have already acquired the emblem will need to complete the requirements after this patch goes live in order to equip the emblem. </li></ul></li><li>Fixed an issue with spawn trapping that could occur on meltdown. </li></ul><h3>Raids and Dungeons </h3><ul><li>Fixed an issue where players can damage bosses in Ghosts of the Deep through immunity shields. </li></ul><h2>Gameplay and Investment  </h2><h3>Armor </h3><ul><li>Gyrfalcon\'s Hauberk\'s reserve overshield now deploys when a player uses Ensnaring Slam. </li><li>Fixed an issue where Khepri\'s Horn cou

In [37]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)


In [38]:
df = df[df['Title'].str.startswith(('Destiny 2'))]
df = df.iloc[0:25].reset_index()
# Print the filtered DataFrame
df['Title']

0                        Destiny 2 Update 7.1.0.2
1                        Destiny 2 Hotfix 6/02/23
2                        Destiny 2 Update 7.1.0.1
3                        Destiny 2 Hotfix 5/26/23
4     Destiny 2 Update 7.1.0 - Season of the Deep
5                        Destiny 2 Hotfix 7.0.5.3
6                        Destiny 2 Hotfix 7.0.5.2
7           Destiny 2 Artist Reference Collection
8                        Destiny 2 Hotfix 7.0.5.1
9                        Destiny 2 Update 7.0.5.0
10                       Destiny 2 Hotfix 7.0.0.7
11                       Destiny 2 Hotfix 7.0.0.6
12                       Destiny 2 Hotfix 7.0.0.5
13                       Destiny 2 Hotfix 7.0.0.3
14      Destiny 2: New and Returning Player Guide
15                       Destiny 2 Update 7.0.0.1
16                       Destiny 2 Hotfix 6.3.0.7
17                       Destiny 2 Hotfix 6.3.0.5
18                       Destiny 2 Hotfix 6.3.0.4
19                       Destiny 2 Hotfix 6.3.0.3


In [39]:
df["HtmlContent"].iloc[0]

'<h2>Activities  </h2><h3>Crucible </h3><ul><li>Fixed an issue where the Dazzling Iridescence Trials emblem could be awarded from the Flawless chest without completing the necessary requirements. <ul><li>Players who have already acquired the emblem will need to complete the requirements after this patch goes live in order to equip the emblem. </li></ul></li><li>Fixed an issue with spawn trapping that could occur on meltdown. </li></ul><h3>Raids and Dungeons </h3><ul><li>Fixed an issue where players can damage bosses in Ghosts of the Deep through immunity shields. </li></ul><h2>Gameplay and Investment  </h2><h3>Armor </h3><ul><li>Gyrfalcon\'s Hauberk\'s reserve overshield now deploys when a player uses Ensnaring Slam. </li><li>Fixed an issue where Khepri\'s Horn could disable various non-enemy objects. </li><li>Fixed an issue where Vesper of Radius\'s effects were triggering on things other than casting a rift. </li><li>Fixed an issue where Point-Contact Cannon Brace would create lightn

In [40]:

def extract_list_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    list_items = soup.find_all('li')
    result = [li.get_text(strip=True) for li in list_items]
    return result

def clean_html(text):
    cleantext = re.sub(r'([.,?!;:])((?!\s)|$)', r'\1 ', text)
    return cleantext

def parse_html_content(df):
    df['parsed_content'] = df['HtmlContent'].apply(extract_list_from_html)
    df['joined_content'] = df['parsed_content'].apply(lambda x: ' '.join(x))
    df['joined_content'] = df['joined_content'].apply(clean_html)
    return df

In [41]:
# Parse HTML content and create new column
df = parse_html_content(df)

In [42]:
df = df[['Title', 'Link', 'PubDate', 'UniqueIdentifier', 'Description',
       'joined_content']]

In [45]:
print(df.iloc[0]["joined_content"])

Fixed an issue where the Dazzling Iridescence Trials emblem could be awarded from the Flawless chest without completing the necessary requirements. Players who have already acquired the emblem will need to complete the requirements after this patch goes live in order to equip the emblem. Players who have already acquired the emblem will need to complete the requirements after this patch goes live in order to equip the emblem. Fixed an issue with spawn trapping that could occur on meltdown. Fixed an issue where players can damage bosses in Ghosts of the Deep through immunity shields. Gyrfalcon's Hauberk's reserve overshield now deploys when a player uses Ensnaring Slam. Fixed an issue where Khepri's Horn could disable various non-enemy objects. Fixed an issue where Vesper of Radius's effects were triggering on things other than casting a rift. Fixed an issue where Point-Contact Cannon Brace would create lightning strikes on melees other than Thunderclap. Fixed an issue where becoming fr

In [8]:
docs = []
## convert to haystack format
for index, row in df.iterrows():
    doc = {
        'content': row['joined_content'],  
        'meta': {
            'name': row['Title'],
            'link': row['Link'],
            'pub_date': row['PubDate'],
            'unique_id': row['UniqueIdentifier'],
            'description': row['Description']
        }
    }
    docs.append(doc)


In [9]:
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(use_bm25=True)


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [10]:
from haystack.nodes import PreProcessor

processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=50
)

docs = processor.process(docs)

Preprocessing:   0%|          | 0/25 [00:00<?, ?docs/s]



In [11]:
document_store.write_documents(docs)


INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '4745c8710fd69e307f36d22f8a893c58' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'a88448568c8c5281df70b881f18ef4a3' already exists in index 'document'


Updating BM25 representation...:   0%|          | 0/772 [00:00<?, ? docs/s]

In [12]:
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store=document_store)


In [29]:
from haystack.nodes import PromptNode, PromptTemplate

lfqa_prompt = PromptTemplate(
    name="lfqa",
    prompt_text="""Synthesize a comprehensive answer from the following text for the given question. 
                    Provide a clear and concise response that summarizes the key points and information presented in the text. 
                             Your answer should directly pull from the source but be organized. 
                             \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:""",
)

prompt_node = PromptNode(model_name_or_path="google/flan-t5-large", default_prompt_template=lfqa_prompt,  model_kwargs={"stream":True})


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [14]:
from haystack.nodes import TransformersReader

reader = TransformersReader(model_name_or_path="ahotrod/albert_xxlargev1_squad2_512", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [25]:
from haystack.pipelines import Pipeline

pipe = Pipeline()
pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
pipe.add_node(component=reader, name="reader", inputs=["retriever"])
pipe.add_node(component=prompt_node, name="prompt_node", inputs=["reader"])


In [32]:
from haystack.utils import print_answers

results = pipe.run(
    query="who won guardian games?",
    params={
        "retriever": {"top_k": 2},
        "reader": {"top_k": 10}
    }
)

print_answers(results, details="all", max_text_len=1000)




<pad> This week we’re back with some Season 21.</s>
'Query: who won guardian games?'
'Answers:'
[   <Answer {'answer': ' emotes', 'type': 'extractive', 'score': 3.2196183497035236e-07, 'context': 'your clips, running alongside some of you in Supremacy, and seeing the emotes in the Tower after you bank your Medallions. ', 'offsets_in_document': [{'start': 772, 'end': 779}], 'offsets_in_context': [{'start': 70, 'end': 77}], 'document_ids': ['36e502e455b6415363f4dddda144b3ed'], 'meta': {'name': 'This Week At Bungie – 5/11/2023', 'link': '/7/en/News/Article/05-011-2023-twab', 'pub_date': '2023-05-11T18:00:00Z', 'unique_id': 'blt8dcb8e715bde5605', 'description': 'This week at Bungie, we’ve got an armor preview and a dungeon date. ', '_split_id': 2, '_split_overlap': [{'doc_id': '282ee7f586356079e3c94cc66b05cee', 'range': (0, 660)}, {'doc_id': '84d02b1ac8d2dd7cb1660cd0af85e504', 'range': (661, 824)}]}}>,
    <Answer {'answer': ' the emotes', 'type': 'extractive', 'score': 1.6923993939599313e

In [19]:
from pprint import pprint

pprint(results)


{'answers': [<Answer {'answer': ' Season of the Deep,', 'type': 'extractive', 'score': 0.8737624883651733, 'context': 'Alright, that does it for us this week. We’re less than two weeks from Season of the Deep, so be sure you let us know over on Twitter what you’re most excited f', 'offsets_in_document': [{'start': 303, 'end': 323}], 'offsets_in_context': [{'start': 70, 'end': 90}], 'document_ids': ['b9b293933fa3f84ecbbc9aa95f7ce1d5'], 'meta': {'name': 'This Week At Bungie – 5/11/2023', 'link': '/7/en/News/Article/05-011-2023-twab', 'pub_date': '2023-05-11T18:00:00Z', 'unique_id': 'blt8dcb8e715bde5605', 'description': 'This week at Bungie, we’ve got an armor preview and a dungeon date. ', '_split_id': 33, '_split_overlap': [{'doc_id': 'ae33fa9b51a4a427ecfec5ae6aafb95f', 'range': (0, 461)}, {'doc_id': 'b0bfdf309865b1df62511f8d41ff393d', 'range': (273, 617)}]}}>,
             <Answer {'answer': ' Season of the Deep,', 'type': 'extractive', 'score': 0.8365026712417603, 'context': 'We’re le