In [1]:
import requests
import os
from importlib.machinery import SourceFileLoader
import pandas as pd
import logging
import torch
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)


try:
    config = SourceFileLoader("config", "config.py").load_module()
    os.environ['OPENAI_API_KEY'] = config.OPENAI_API_KEY

    bungie_api_key = config.BUNGIE_API_KEY
    endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
    page_token = "0"
    include_body = True
    headers = {
        "X-API-Key": bungie_api_key
    }
    params = {
        "includebody": include_body
    }

    results = []

    while page_token is not None:
        response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

        if response.status_code == 200:
            json_response = response.json()
            if 'NewsArticles' in json_response['Response']:
                results.extend(json_response['Response']['NewsArticles'])
            else:
                print("No NewsArticles found in the response.")
                break
            page_token = json_response['Response']['NextPaginationToken']
        else:
            print("Error:", response.status_code)
            print("Response:", response.text)
            break
except Exception as e:
    print("An error occurred:", str(e))


An error occurred: 'NextPaginationToken'


In [2]:
results[0]

{'Title': 'Community Focus - CBGray',
 'Link': '/7/en/News/Article/community_focus_cbgray',
 'PubDate': '2023-06-09T18:00:00Z',
 'UniqueIdentifier': 'blt72c4f395173afe36',
 'Description': 'Just a self-proclaimed potato grateful to be given a chance.',
 'HtmlContent': '<p><span></span>Happy time zones Guardians, and welcome to this week\'s Community Focus. Today we are hanging out with CBgrey, a content creator, self-proclaimed potato, and a PvE shepherd. So, let’s get into it.&nbsp;&nbsp;</p><p><strong>Hi CB! First up, let’s talk about who you are, what your pronouns are, and what got you into gaming in the first place?</strong>&nbsp;</p><p>My name is Christian or CBgray but most just call me CB for short! My pronouns are he/him.&nbsp;&nbsp;</p><p>The name CBgray was a name that some of my friends called me growing up, that was just my first and middle initial and my last name! I am from Virginia originally, just outside of Washington, D.C., which will always be home to me. About a yea

In [4]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)


In [5]:
df = df[df['Title'].str.startswith(('This Week', 'Destiny 2'))]
df = df.iloc[0:25].reset_index()
# Print the filtered DataFrame
df['Title']

0               This Week In Destiny - 06/08/2023
1                        Destiny 2 Hotfix 6/02/23
2                This Week At Bungie - 06/01/2023
3                        Destiny 2 Update 7.1.0.1
4                        Destiny 2 Hotfix 5/26/23
5                This Week at Bungie - 05/25/2023
6     Destiny 2 Update 7.1.0 - Season of the Deep
7                This Week At Bungie - 05/18/2023
8                 This Week At Bungie – 5/11/2023
9                        Destiny 2 Hotfix 7.0.5.3
10               This Week at Bungie - 05/04/2023
11                       Destiny 2 Hotfix 7.0.5.2
12               This Week At Bungie – 04/27/2023
13          Destiny 2 Artist Reference Collection
14                       Destiny 2 Hotfix 7.0.5.1
15                This Week At Bungie - 4/20/2023
16                       Destiny 2 Update 7.0.5.0
17                This Week at Bungie - 4/13/2023
18               This Week At Bungie - 04/06/2023
19               This Week at Bungie - 03/30/2023


In [6]:
df["HtmlContent"].iloc[0]

'<p>This Week in Destiny... yeah, you read that right. As we mentioned <a href="https://www.bungie.net/7/en/News/article/twab-may-25-2023" target="_self">a couple of weeks ago</a>, we are fully leaning into this becoming a Destiny-centric space going forward. More on that in a bit! Before we jump into it, let\'s recap the contents of <a href="https://www.bungie.net/7/en/News/article/06_01_2023_twab" target="_self">the previous week</a>:</p><ul><li>Ghosts of the Deep rundown.</li><li>Bungie Rewards got new dungeon-themed merch.</li><li>Did you get your Marathon emblem?</li><li>You got new Prime Gaming loot.</li></ul><p>And here\'s what we have for the first TWID ever:</p><ul><li>A new name for the TWAB.</li><li>Ready for a new Community Fashion Contest?</li><li>Grandmasters will be back on June 13.</li><li>And Supremacy, too.</li><li>You all owe Kalli an apology.</li><li>Preview of some 7.1.0.2 fixes.</li><li>Collaboration art!</li><li>The usual Player Support Report, and our AOTW and M

In [21]:
from bs4 import BeautifulSoup
import re

def clean_html(raw_html):
    soup = BeautifulSoup(raw_html, "html.parser")
    cleantext = soup.get_text().lower()
    
    replacements = {
        r'([^a-zA-Z0-9.])([A-Z])': r'\1 \2',
        r'(\.)([a-zA-Z])': r'\1 \2',
        r'(\?)([a-zA-Z])': r'\1 \2',
        r'(!)([a-zA-Z])': r'\1 \2',
        r'(:)([a-zA-Z])': r'\1 \2',
        r'\.{3}': r'',
        r'@([a-zA-Z]+)': r''
    }
    
    for pattern, replacement in replacements.items():
        cleantext = re.sub(pattern, replacement, cleantext)
    
    return cleantext

raw_html = df["HtmlContent"].iloc[0]

cleaned_text = clean_html(raw_html)
print(cleaned_text)




this week in destiny yeah, you read that right. as we mentioned a couple of weeks ago, we are fully leaning into this becoming a destiny-centric space going forward. more on that in a bit! before we jump into it, let's recap the contents of the previous week: ghosts of the deep rundown. bungie rewards got new dungeon-themed merch. did you get your marathon emblem? you got new prime gaming loot. and here's what we have for the first twid ever: a new name for the twab. ready for a new community fashion contest? grandmasters will be back on june 13. and supremacy, too. you all owe kalli an apology. preview of some 7.1.0.2 fixes. collaboration art! the usual player support report, and our aotw and motw picks. rebranding the twabfor as long as anyone can remember, there has been a weekly check-in between bungie and our community. back in february of 2016, we rebranded the “bungie weekly update” to “this week at bungie.” this was done primarily because, at the time, there was a lot of confus

In [6]:
from bs4 import BeautifulSoup

# Create a new column 'clean_text' in the DataFrame
df['clean_text'] = ''

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    html_content = row['HtmlContent']
    
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Retrieve the text content using the .get_text() method
    clean_text = soup.get_text().strip()  # Apply strip() to remove leading/trailing whitespaces
    
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in clean_text.splitlines())
    
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    
    # drop blank lines
    clean_text = '\n'.join(chunk for chunk in chunks if chunk)

    # Assign the clean text to the 'clean_text' column of the current row
    df.at[index, 'clean_text'] = clean_text

In [7]:
df = df[['Title', 'Link', 'PubDate', 'UniqueIdentifier', 'Description',
       'clean_text']]

In [8]:
docs = []
## convert to haystack format
for index, row in df.iterrows():
    doc = {
        'content': row['clean_text'],  
        'meta': {
            'name': row['Title'],
            'link': row['Link'],
            'pub_date': row['PubDate'],
            'unique_id': row['UniqueIdentifier'],
            'description': row['Description']
        }
    }
    docs.append(doc)


In [9]:
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(use_bm25=True)


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [10]:
from haystack.nodes import PreProcessor

processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=50
)

docs = processor.process(docs)

Preprocessing:   0%|          | 0/25 [00:00<?, ?docs/s]



In [11]:
document_store.write_documents(docs)


INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '4745c8710fd69e307f36d22f8a893c58' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'a88448568c8c5281df70b881f18ef4a3' already exists in index 'document'


Updating BM25 representation...:   0%|          | 0/772 [00:00<?, ? docs/s]

In [12]:
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store=document_store)


In [29]:
from haystack.nodes import PromptNode, PromptTemplate

lfqa_prompt = PromptTemplate(
    name="lfqa",
    prompt_text="""Synthesize a comprehensive answer from the following text for the given question. 
                    Provide a clear and concise response that summarizes the key points and information presented in the text. 
                             Your answer should directly pull from the source but be organized. 
                             \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:""",
)

prompt_node = PromptNode(model_name_or_path="google/flan-t5-large", default_prompt_template=lfqa_prompt,  model_kwargs={"stream":True})


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [14]:
from haystack.nodes import TransformersReader

reader = TransformersReader(model_name_or_path="ahotrod/albert_xxlargev1_squad2_512", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [25]:
from haystack.pipelines import Pipeline

pipe = Pipeline()
pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
pipe.add_node(component=reader, name="reader", inputs=["retriever"])
pipe.add_node(component=prompt_node, name="prompt_node", inputs=["reader"])


In [32]:
from haystack.utils import print_answers

results = pipe.run(
    query="who won guardian games?",
    params={
        "retriever": {"top_k": 2},
        "reader": {"top_k": 10}
    }
)

print_answers(results, details="all", max_text_len=1000)




<pad> This week we’re back with some Season 21.</s>
'Query: who won guardian games?'
'Answers:'
[   <Answer {'answer': ' emotes', 'type': 'extractive', 'score': 3.2196183497035236e-07, 'context': 'your clips, running alongside some of you in Supremacy, and seeing the emotes in the Tower after you bank your Medallions. ', 'offsets_in_document': [{'start': 772, 'end': 779}], 'offsets_in_context': [{'start': 70, 'end': 77}], 'document_ids': ['36e502e455b6415363f4dddda144b3ed'], 'meta': {'name': 'This Week At Bungie – 5/11/2023', 'link': '/7/en/News/Article/05-011-2023-twab', 'pub_date': '2023-05-11T18:00:00Z', 'unique_id': 'blt8dcb8e715bde5605', 'description': 'This week at Bungie, we’ve got an armor preview and a dungeon date. ', '_split_id': 2, '_split_overlap': [{'doc_id': '282ee7f586356079e3c94cc66b05cee', 'range': (0, 660)}, {'doc_id': '84d02b1ac8d2dd7cb1660cd0af85e504', 'range': (661, 824)}]}}>,
    <Answer {'answer': ' the emotes', 'type': 'extractive', 'score': 1.6923993939599313e

In [19]:
from pprint import pprint

pprint(results)


{'answers': [<Answer {'answer': ' Season of the Deep,', 'type': 'extractive', 'score': 0.8737624883651733, 'context': 'Alright, that does it for us this week. We’re less than two weeks from Season of the Deep, so be sure you let us know over on Twitter what you’re most excited f', 'offsets_in_document': [{'start': 303, 'end': 323}], 'offsets_in_context': [{'start': 70, 'end': 90}], 'document_ids': ['b9b293933fa3f84ecbbc9aa95f7ce1d5'], 'meta': {'name': 'This Week At Bungie – 5/11/2023', 'link': '/7/en/News/Article/05-011-2023-twab', 'pub_date': '2023-05-11T18:00:00Z', 'unique_id': 'blt8dcb8e715bde5605', 'description': 'This week at Bungie, we’ve got an armor preview and a dungeon date. ', '_split_id': 33, '_split_overlap': [{'doc_id': 'ae33fa9b51a4a427ecfec5ae6aafb95f', 'range': (0, 461)}, {'doc_id': 'b0bfdf309865b1df62511f8d41ff393d', 'range': (273, 617)}]}}>,
             <Answer {'answer': ' Season of the Deep,', 'type': 'extractive', 'score': 0.8365026712417603, 'context': 'We’re le