In [2]:
import requests
import os
from importlib.machinery import SourceFileLoader
import pandas as pd
try:
    config = SourceFileLoader("config", "config.py").load_module()
    os.environ['OPENAI_API_KEY'] = config.OPENAI_API_KEY

    bungie_api_key = config.BUNGIE_API_KEY
    endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
    page_token = "0"
    include_body = True
    headers = {
        "X-API-Key": bungie_api_key
    }
    params = {
        "includebody": include_body
    }

    results = []

    while page_token is not None:
        response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

        if response.status_code == 200:
            json_response = response.json()
            if 'NewsArticles' in json_response['Response']:
                results.extend(json_response['Response']['NewsArticles'])
            else:
                print("No NewsArticles found in the response.")
                break
            page_token = json_response['Response']['NextPaginationToken']
        else:
            print("Error:", response.status_code)
            print("Response:", response.text)
            break
except Exception as e:
    print("An error occurred:", str(e))


An error occurred: 'NextPaginationToken'


In [3]:
results[0]

{'Title': 'Destiny 2 Hotfix 6/02/23',
 'Link': '/7/en/News/Article/destiny-2-hotfix-6-02-23',
 'PubDate': '2023-06-03T04:52:00Z',
 'UniqueIdentifier': 'blt612fd6dfe9162c9d',
 'Description': 'The one about a horn.',
 'HtmlContent': "<h4>GENERAL</h4><ul><li>Due to an issue, the Khepri's Horn exotic Titan helmet has been disabled in all activities.</li><li>Iron Banner's gametype has changed from Control to Eruption.</li></ul>",
 'ImagePath': 'https://images.contentstack.io/v3/assets/blte410e3b15535c144/blt09415b4dd64c163d/6389b9be07dd0d0d5a3c8e15/Update_Hotfix_Header.jpg',
 'OptionalMobileImagePath': 'https://images.contentstack.io/v3/assets/blte410e3b15535c144/bltb22972426ba11c0a/6389b9a4ebfc405e2908ae61/Update_Hotfix_Blog.jpg'}

In [4]:
print(f"Total Results: {len(results)}")

Total Results: 1014


In [6]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)

# Display the DataFrame
df.iloc[0]


Title                                               Destiny 2 Hotfix 6/02/23
Link                             /7/en/News/Article/destiny-2-hotfix-6-02-23
PubDate                                                 2023-06-03T04:52:00Z
UniqueIdentifier                                         blt612fd6dfe9162c9d
Description                                            The one about a horn.
HtmlContent                <h4>GENERAL</h4><ul><li>Due to an issue, the K...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
Name: 0, dtype: object

In [9]:
df = df[df['Title'].str.startswith(('This Week At Bungie', 'Destiny 2'))]
df = df.reset_index()
# Print the filtered DataFrame
df['Title']

In [24]:
from bs4 import BeautifulSoup

# Create a new column 'clean_text' in the DataFrame
df['clean_text'] = ''

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    html_content = row['HtmlContent']
    
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Retrieve the text content using the .get_text() method
    clean_text = soup.get_text().strip()  # Apply strip() to remove leading/trailing whitespaces
    
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in clean_text.splitlines())
    
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    
    # drop blank lines
    clean_text = '\n'.join(chunk for chunk in chunks if chunk)

    # Assign the clean text to the 'clean_text' column of the current row
    df.at[index, 'clean_text'] = clean_text

In [41]:
df = df[['Title', 'Link', 'PubDate', 'UniqueIdentifier', 'Description',
       'clean_text']].iloc[0:25]

In [50]:
docs = []

for index, row in df.iterrows():
    doc = {
        'content': row['clean_text'],  # Assuming 'HtmlContent' contains the document text
        'meta': {
            'name': row['Title'],
            'link': row['Link'],
            'pub_date': row['PubDate'],
            'unique_id': row['UniqueIdentifier'],
            'description': row['Description']
        }
    }
    docs.append(doc)


In [44]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [45]:
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(use_bm25=True)


INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0


In [51]:
from haystack.nodes import PreProcessor

processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=300,
    split_respect_sentence_boundary=True,
    split_overlap=10
)

docs = processor.process(docs)

Preprocessing:   0%|          | 0/25 [00:00<?, ?docs/s]



In [59]:
document_store.write_documents(docs)


Updating BM25 representation...:   0%|          | 0/190 [00:00<?, ? docs/s]

In [61]:
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store=document_store)


In [66]:
from haystack.nodes import FARMReader

reader = FARMReader(model_name_or_path="ahotrod/albert_xxlargev1_squad2_512")


INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0


Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'ahotrod/albert_xxlargev1_squad2_512' (Albert)


Downloading pytorch_model.bin:   0%|          | 0.00/890M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'ahotrod/albert_xxlargev1_squad2_512' (Albert model) from model hub.


Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0


In [67]:
from haystack.pipelines import ExtractiveQAPipeline

pipe = ExtractiveQAPipeline(reader, retriever)

In [71]:
prediction = pipe.run(
    query="what was the change to the immortal?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)


Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [72]:
from pprint import pprint

pprint(prediction)


{'answers': [<Answer {'answer': 'Reduced base range value by 10', 'type': 'extractive', 'score': 0.4572427272796631, 'context': '.45 to 1.5 (crit damage goes from 21.8 to 21).\xa0The Immortal\xa0Reduced base range value by 10.\xa0BowsFixed an issue where the Arsenic Bite Bow would displa', 'offsets_in_document': [{'start': 367, 'end': 397}], 'offsets_in_context': [{'start': 60, 'end': 90}], 'document_ids': ['cff4415fe2e69d3178c6c550321662f1'], 'meta': {'name': 'Destiny 2 Update 7.1.0 - Season of the Deep', 'link': '/7/en/News/Article/season-deep-update-7-1-0', 'pub_date': '2023-05-23T16:30:00Z', 'unique_id': 'blt56002d0e3c869908', 'description': 'The one about Season of the Deep.', '_split_id': 12, '_split_overlap': [{'doc_id': 'f80e490fa3813d2a187ebdddc7ffe19b', 'range': (0, 188)}, {'doc_id': 'b78ae1022598db992d26c83d1c67e4a7', 'range': (1636, 1774)}]}}>,
             <Answer {'answer': '(SMG)', 'type': 'extractive', 'score': 0.004935511387884617, 'context': ' to improve the process e

In [74]:
from haystack.utils import print_answers

print_answers(
    prediction,
    details="all" ## Choose from `minimum`, `medium`, and `all`
)


'Query: what was the change to the immortal?'
'Answers:'
[   <Answer {'answer': 'Reduced base range value by 10', 'type': 'extractive', 'score': 0.4572427272796631, 'context': '.45 to 1.5 (crit damage goes from 21.8 to 21).\xa0The Immortal\xa0Reduced base range value by 10.\xa0BowsFixed an issue where the Arsenic Bite Bow would displa', 'offsets_in_document': [{'start': 367, 'end': 397}], 'offsets_in_context': [{'start': 60, 'end': 90}], 'document_ids': ['cff4415fe2e69d3178c6c550321662f1'], 'meta': {'name': 'Destiny 2 Update 7.1.0 - Season of the Deep', 'link': '/7/en/News/Article/season-deep-update-7-1-0', 'pub_date': '2023-05-23T16:30:00Z', 'unique_id': 'blt56002d0e3c869908', 'description': 'The one about Season of the Deep.', '_split_id': 12, '_split_overlap': [{'doc_id': 'f80e490fa3813d2a187ebdddc7ffe19b', 'range': (0, 188)}, {'doc_id': 'b78ae1022598db992d26c83d1c67e4a7', 'range': (1636, 1774)}]}}>,
    <Answer {'answer': '(SMG)', 'type': 'extractive', 'score': 0.004935511387884617