In [1]:
import requests
import os
from importlib.machinery import SourceFileLoader
import pandas as pd
import logging
import torch
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)


try:
    config = SourceFileLoader("config", "config.py").load_module()
    os.environ['OPENAI_API_KEY'] = config.OPENAI_API_KEY

    bungie_api_key = config.BUNGIE_API_KEY
    endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
    page_token = "0"
    include_body = True
    headers = {
        "X-API-Key": bungie_api_key
    }
    params = {
        "includebody": include_body
    }

    results = []

    while page_token is not None:
        response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

        if response.status_code == 200:
            json_response = response.json()
            if 'NewsArticles' in json_response['Response']:
                results.extend(json_response['Response']['NewsArticles'])
            else:
                print("No NewsArticles found in the response.")
                break
            page_token = json_response['Response']['NextPaginationToken']
        else:
            print("Error:", response.status_code)
            print("Response:", response.text)
            break
except Exception as e:
    print("An error occurred:", str(e))


An error occurred: 'NextPaginationToken'


In [2]:
results[0]

{'Title': 'Destiny 2 Hotfix 6/02/23',
 'Link': '/7/en/News/Article/destiny-2-hotfix-6-02-23',
 'PubDate': '2023-06-03T04:52:00Z',
 'UniqueIdentifier': 'blt612fd6dfe9162c9d',
 'Description': 'The one about a horn.',
 'HtmlContent': "<h4>GENERAL</h4><ul><li>Due to an issue, the Khepri's Horn exotic Titan helmet has been disabled in all activities.</li><li>Iron Banner's gametype has changed from Control to Eruption.</li></ul>",
 'ImagePath': 'https://images.contentstack.io/v3/assets/blte410e3b15535c144/blt09415b4dd64c163d/6389b9be07dd0d0d5a3c8e15/Update_Hotfix_Header.jpg',
 'OptionalMobileImagePath': 'https://images.contentstack.io/v3/assets/blte410e3b15535c144/bltb22972426ba11c0a/6389b9a4ebfc405e2908ae61/Update_Hotfix_Blog.jpg'}

In [3]:
print(f"Total Results: {len(results)}")

Total Results: 1014


In [4]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)

# Display the DataFrame
df.iloc[0]


Title                                               Destiny 2 Hotfix 6/02/23
Link                             /7/en/News/Article/destiny-2-hotfix-6-02-23
PubDate                                                 2023-06-03T04:52:00Z
UniqueIdentifier                                         blt612fd6dfe9162c9d
Description                                            The one about a horn.
HtmlContent                <h4>GENERAL</h4><ul><li>Due to an issue, the K...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
Name: 0, dtype: object

In [5]:
df = df[df['Title'].str.startswith(('This Week At Bungie', 'Destiny 2'))]
df = df.iloc[0:25].reset_index()
# Print the filtered DataFrame
df['Title']

0                        Destiny 2 Hotfix 6/02/23
1                This Week At Bungie - 06/01/2023
2                        Destiny 2 Update 7.1.0.1
3                        Destiny 2 Hotfix 5/26/23
4     Destiny 2 Update 7.1.0 - Season of the Deep
5                This Week At Bungie - 05/18/2023
6                 This Week At Bungie – 5/11/2023
7                        Destiny 2 Hotfix 7.0.5.3
8                        Destiny 2 Hotfix 7.0.5.2
9                This Week At Bungie – 04/27/2023
10          Destiny 2 Artist Reference Collection
11                       Destiny 2 Hotfix 7.0.5.1
12                This Week At Bungie - 4/20/2023
13                       Destiny 2 Update 7.0.5.0
14               This Week At Bungie - 04/06/2023
15                       Destiny 2 Hotfix 7.0.0.7
16                       Destiny 2 Hotfix 7.0.0.6
17               This Week At Bungie – 03/23/2023
18               This Week At Bungie - 03/16/2023
19                       Destiny 2 Hotfix 7.0.0.5


In [6]:
from bs4 import BeautifulSoup

# Create a new column 'clean_text' in the DataFrame
df['clean_text'] = ''

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    html_content = row['HtmlContent']
    
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Retrieve the text content using the .get_text() method
    clean_text = soup.get_text().strip()  # Apply strip() to remove leading/trailing whitespaces
    
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in clean_text.splitlines())
    
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    
    # drop blank lines
    clean_text = '\n'.join(chunk for chunk in chunks if chunk)

    # Assign the clean text to the 'clean_text' column of the current row
    df.at[index, 'clean_text'] = clean_text

In [25]:
df = df[['Title', 'Link', 'PubDate', 'UniqueIdentifier', 'Description',
       'clean_text']]

In [27]:
docs = []
## convert to haystack format
for index, row in df.iterrows():
    doc = {
        'content': row['clean_text'],  
        'meta': {
            'name': row['Title'],
            'link': row['Link'],
            'pub_date': row['PubDate'],
            'unique_id': row['UniqueIdentifier'],
            'description': row['Description']
        }
    }
    docs.append(doc)


In [28]:
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(use_bm25=True)


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [29]:
from haystack.nodes import PreProcessor

processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=50
)

docs = processor.process(docs)

Preprocessing:   0%|          | 0/25 [00:00<?, ?docs/s]



In [30]:
document_store.write_documents(docs)


INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '4745c8710fd69e307f36d22f8a893c58' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'a88448568c8c5281df70b881f18ef4a3' already exists in index 'document'


Updating BM25 representation...:   0%|          | 0/772 [00:00<?, ? docs/s]

In [31]:
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store=document_store)


In [32]:
from haystack.nodes import PromptNode, PromptTemplate

lfqa_prompt = PromptTemplate(
    name="lfqa",
    prompt_text="""Synthesize a comprehensive answer from the following text for the given question. 
                    Provide a clear and concise response that summarizes the key points and information presented in the text. 
                             Your answer should be in your own words and be no longer than 50 words. 
                             \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:""",
)

prompt_node = PromptNode(model_name_or_path="google/flan-t5-large", default_prompt_template=lfqa_prompt)


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [36]:
from haystack.nodes import TransformersReader

reader = TransformersReader(model_name_or_path="ahotrod/albert_xxlargev1_squad2_512", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [42]:
from haystack.pipelines import Pipeline

pipe2 = Pipeline()
pipe2.add_node(component=retriever, name="retriever", inputs=["Query"])
pipe2.add_node(component=reader, name="reader", inputs=["retriever"])
pipe2.add_node(component=prompt_node, name="prompt_node", inputs=["reader"])


In [47]:
pipe2.run(
    query="what are the changes to stasis titan",
    params={
        "retriever": {"top_k": 2},
        "reader": {"top_k": 10}
    }
)



{'results': ["Fixed an issue causing Verglas Curve's Stasis crystals to fail to spawn if shot at a titan bubble."],
 'invocation_context': {'query': 'what are the changes to stasis titan',
  'documents': [<Document: {'content': "Fixed an issue that caused Winterbite's melee to mistakenly count as an Energy weapon slot kill for the purposes of progression.\xa0Vexcalibur\xa0Vexcalibur's upgraded intrinsic traits at the crafting table now provide slight stat increases.\xa0Veriglas Curve\xa0Fixed an issue causing Verglas Curve's Stasis crystals to fail to spawn if shot at a titan bubble.\xa0Tractor Cannon\xa0Fixed an issue where Tractor Cannon was impacted by the recent non-lethal collision damage changes.Now causes hit targets to be able to suffer lethal collision damage for a brief duration.\xa0", 'content_type': 'text', 'score': 0.8403645018059493, 'meta': {'name': 'Destiny 2 Update 7.1.0 - Season of the Deep', 'link': '/7/en/News/Article/season-deep-update-7-1-0', 'pub_date': '2023-05-

In [15]:
from haystack.pipelines import ExtractiveQAPipeline

pipe = ExtractiveQAPipeline(reader, retriever)

In [23]:
prediction = pipe.run(
    query="what was the most recent season?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)


In [24]:
from pprint import pprint

pprint(prediction)


{'answers': [<Answer {'answer': ' Season of the Deep,', 'type': 'extractive', 'score': 0.8737624883651733, 'context': 'Alright, that does it for us this week. We’re less than two weeks from Season of the Deep, so be sure you let us know over on Twitter what you’re most excited f', 'offsets_in_document': [{'start': 303, 'end': 323}], 'offsets_in_context': [{'start': 70, 'end': 90}], 'document_ids': ['b9b293933fa3f84ecbbc9aa95f7ce1d5'], 'meta': {'name': 'This Week At Bungie – 5/11/2023', 'link': '/7/en/News/Article/05-011-2023-twab', 'pub_date': '2023-05-11T18:00:00Z', 'unique_id': 'blt8dcb8e715bde5605', 'description': 'This week at Bungie, we’ve got an armor preview and a dungeon date. ', '_split_id': 33, '_split_overlap': [{'doc_id': 'ae33fa9b51a4a427ecfec5ae6aafb95f', 'range': (0, 461)}, {'doc_id': 'b0bfdf309865b1df62511f8d41ff393d', 'range': (273, 617)}]}}>,
             <Answer {'answer': ' Season of the Deep,', 'type': 'extractive', 'score': 0.8365026712417603, 'context': 'We’re le

In [18]:
from haystack.utils import print_answers

print_answers(
    prediction,
    details="all" ## Choose from `minimum`, `medium`, and `all`
)


'Query: what was the change to the immortal?'
'Answers:'
[   <Answer {'answer': '\xa0Reduced base range value by 10.', 'type': 'extractive', 'score': 0.46526047587394714, 'context': 'lier from 1.45 to 1.5 (crit damage goes from 21.8 to 21).\xa0The Immortal\xa0Reduced base range value by 10.\xa0BowsFixed an issue where the Arsenic Bite Bow would display an incorr', 'offsets_in_document': [{'start': 100, 'end': 132}], 'offsets_in_context': [{'start': 70, 'end': 102}], 'document_ids': ['a5f4b49fddc9545d12242b22cbf0e15a'], 'meta': {'name': 'Destiny 2 Update 7.1.0 - Season of the Deep', 'link': '/7/en/News/Article/season-deep-update-7-1-0', 'pub_date': '2023-05-23T16:30:00Z', 'unique_id': 'blt56002d0e3c869908', 'description': 'The one about Season of the Deep.', '_split_id': 88, '_split_overlap': [{'doc_id': '2a97494f63ca60fc95f0f5750e965e55', 'range': (0, 346)}, {'doc_id': '1b5d6ec3289dc9cbd30d740f55e6fd36', 'range': (234, 590)}]}}>,
    <Answer {'answer': '\xa0Reduced base range value by 