In [15]:
import requests
import json
from pprint import pprint
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import os
from importlib.machinery import SourceFileLoader
import pinecone

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

config = SourceFileLoader("config", "config.py").load_module()
os.environ['OPENAI_API_KEY'] = config.OPENAI_API_KEY

bungie_api_key = config.BUNGIE_API_KEY
endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
page_token = "0"
include_body = True
headers = {
    "X-API-Key": bungie_api_key
}
params = {
    "includebody": include_body
}

results = []

while page_token is not None:
    response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

    if response.status_code == 200:
        json_response = response.json()
        if 'NewsArticles' in json_response['Response']:
            results.extend(json_response['Response']['NewsArticles'])
        else:
            print("No NewsArticles found in the response.")
            break
        page_token = json_response['Response']['NextPaginationToken']
    else:
        print("Error:", response.status_code)
        print("Response:", response.text)
        break


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyError: 'NextPaginationToken'

In [17]:
print(f"Total Results: {len(results)}")

Total Results: 1014


In [18]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)

# Display the DataFrame
df.iloc[0]


Title                                               Destiny 2 Hotfix 6/02/23
Link                             /7/en/News/Article/destiny-2-hotfix-6-02-23
PubDate                                                 2023-06-03T04:52:00Z
UniqueIdentifier                                         blt612fd6dfe9162c9d
Description                                            The one about a horn.
HtmlContent                <h4>GENERAL</h4><ul><li>Due to an issue, the K...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
Name: 0, dtype: object

In [19]:
df["Title"]

0                 Destiny 2 Hotfix 6/02/23
1         This Week At Bungie - 06/01/2023
2                 Destiny 2 Update 7.1.0.1
3                 Community Focus - Plumli
4                 Destiny 2 Hotfix 5/26/23
                       ...                
1009           Potentially Asked Questions
1010             Inside the new Bungie.net
1011                        This is a Test
1012    Happy 10th Anniversary, Xbox LIVE!
1013              Breaking In - Adam Brown
Name: Title, Length: 1014, dtype: object

In [20]:
df = df[df['Title'].str.startswith(('This Week At Bungie', 'Destiny 2'))]
df = df.reset_index()
# Print the filtered DataFrame
df['Title']


0                         Destiny 2 Hotfix 6/02/23
1                 This Week At Bungie - 06/01/2023
2                         Destiny 2 Update 7.1.0.1
3                         Destiny 2 Hotfix 5/26/23
4      Destiny 2 Update 7.1.0 - Season of the Deep
                          ...                     
330               This Week At Bungie - 03/03/2016
331               This Week At Bungie - 02/25/2016
332               This Week At Bungie - 02/18/2016
333               This Week At Bungie – 02/11/2016
334               This Week At Bungie – 02/04/2016
Name: Title, Length: 335, dtype: object

In [21]:
# Create a new column 'clean_text' in the DataFrame
df['clean_text'] = ''

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    html_content = row['HtmlContent']
    
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Retrieve the text content using the .get_text() method
    clean_text = soup.get_text().strip()  # Apply strip() to remove leading/trailing whitespaces
    
    # Assign the clean text to the 'clean_text' column of the current row
    df.at[index, 'clean_text'] = clean_text

df.iloc[0]

index                                                                      0
Title                                               Destiny 2 Hotfix 6/02/23
Link                             /7/en/News/Article/destiny-2-hotfix-6-02-23
PubDate                                                 2023-06-03T04:52:00Z
UniqueIdentifier                                         blt612fd6dfe9162c9d
Description                                            The one about a horn.
HtmlContent                <h4>GENERAL</h4><ul><li>Due to an issue, the K...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
clean_text                 GENERALDue to an issue, the Khepri's Horn exot...
Name: 0, dtype: object

In [25]:
# Create a new column 'preproc_text' in the DataFrame
df['preproc_text'] = ''

# Convert text to lowercase
df['preproc_text'] = df['clean_text'].str.lower()

# Remove special characters and numbers
df['preproc_text'] = df['preproc_text'].apply(lambda text: re.sub(r'[^a-zA-Z\'-]', ' ', text))

# Remove redundant whitespace
df['preproc_text'] = df['preproc_text'].apply(lambda text: re.sub(r'\s+', ' ', text.strip()))


# Tokenization
#df['preproc_text'] = df['preproc_text'].apply(lambda text: nltk.word_tokenize(text))

# Remove stopwords
#stopwords_set = set(stopwords.words('english'))
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [token for token in tokens if token not in stopwords_set])

# Lemmatization
#lemmatizer = WordNetLemmatizer()
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

# Join tokens back into sentences
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: ' '.join(tokens))
df.iloc[0]

index                                                                      1
Title                                       This Week At Bungie - 06/01/2023
Link                                      /7/en/News/Article/06_01_2023_twab
PubDate                                                 2023-06-01T18:00:00Z
UniqueIdentifier                                         blt36382dc929299957
Description                This week at Bungie, we fought the ghosts of t...
HtmlContent                <p>Happy TWABsday, Guardians! How are you doin...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
clean_text                 Happy TWABsday, Guardians! How are you doing!?...
preproc_text               happy twabsday guardians how are you doing how...
Name: 1, dtype: object

In [33]:
selected_columns = ['Title', 'PubDate', 'UniqueIdentifier', 'Description', 'preproc_text']
subset_df = df[selected_columns].head(100)

In [97]:

def split_rows(df, max_length=512, overlap=20):
    rows = []
    for _, row in df.iterrows():
        text = row['preproc_text']
        num_chunks = (len(text) - max_length) // (max_length - overlap) + 1

        for i in range(num_chunks):
            start = i * (max_length - overlap)
            end = start + max_length

            # Adjust the end position to avoid splitting words
            if end < len(text) and not text[end].isspace():
                while end < len(text) and not text[end].isspace():
                    end -= 1

            new_row = row.copy()
            new_row['preproc_text'] = text[start:end]
            rows.append(new_row)

    return pd.DataFrame(rows)

split_df = split_rows(subset_df)

# Print the resulting split DataFrame
print(split_df[0])

                               Title               PubDate   
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z  \
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z   
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z   
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z   
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z   
..                               ...                   ...   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   

       UniqueIdentifier                                        Description   
1   blt36382dc929299957  This week at Bungie, we fought the ghosts of t...  \
1   blt36382dc929299957  This week at Bungie, we fought the ghosts of t...   
1   blt36382dc9292999

In [107]:
len(split_df)

2106

In [50]:
pinecone_api_key = config.pinecone_api_key

In [77]:
# connect to pinecone environment
pinecone.init(
    api_key = pinecone_api_key,
    environment = "us-west1-gcp-free"
)


In [78]:
index_name = "extractive-question-answering"

In [90]:
pinecone.list_indexes()

['extractive-question-answering']

In [91]:
#pinecone.delete_index("extractive-question-answering")

In [109]:
# check if the extractive-question-answering index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=384,
        metric="cosine"
    )

# connect to extractive-question-answering index we created
index = pinecone.Index(index_name)

In [60]:
import torch
from sentence_transformers import SentenceTransformer

# set device to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load the retriever model from huggingface model hub
retriever = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=device)
retriever

Downloading (…)5fedf/.gitattributes: 100%|██████████| 737/737 [00:00<?, ?B/s] 
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
Downloading (…)2cb455fedf/README.md: 100%|██████████| 11.5k/11.5k [00:00<?, ?B/s]
Downloading (…)b455fedf/config.json: 100%|██████████| 612/612 [00:00<?, ?B/s] 
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<?, ?B/s] 
Downloading (…)edf/data_config.json: 100%|██████████| 25.5k/25.5k [00:00<00:00, 25.4MB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:00<00:00, 108MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 52.8kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<?, ?B/s] 
Downloading (…)5fedf/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 21.9MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 383/383 [00:00<00:00, 383kB/s]
Downloading (…)fedf/train_script.py: 100%|██████████| 13.8k/13.8k [00:00<?, ?B/s]
Downlo

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [110]:
from tqdm.auto import tqdm
# we will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(split_df), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(split_df))
    # extract batch
    batch = split_df.iloc[i:i_end]
    # generate embeddings for batch
    emb = retriever.encode(batch["preproc_text"].tolist()).tolist()
    # get metadata
    meta = batch.to_dict(orient="records")
    # create unique IDs
    ids = [f"{idx}" for idx in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# check that we have all vectors in index
index.describe_index_stats()

100%|██████████| 33/33 [00:27<00:00,  1.21it/s]


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2106}},
 'total_vector_count': 2106}

In [82]:
from transformers import pipeline

model_name = "deepset/electra-base-squad2"
# load the reader model into a question-answering pipeline
reader = pipeline(tokenizer=model_name, model=model_name, task="question-answering", device=device)

Downloading (…)lve/main/config.json: 100%|██████████| 635/635 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 436M/436M [00:10<00:00, 41.4MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 200/200 [00:00<?, ?B/s] 
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 12.6MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<?, ?B/s] 


In [116]:
# gets context passages from the pinecone index
def get_context(question, top_k):
    # generate embeddings for the question
    xq = retriever.encode([question]).tolist()
    # search pinecone index for context passage with the answer
    xc = index.query(xq, top_k=top_k, include_metadata=True)
    # extract the context passage from pinecone search result
    c = [x["metadata"]["preproc_text"] for x in xc["matches"]]
    return c

question = "how much damage do aggressive frame smgs do now?"
context = get_context(question, top_k = 1)
context

["rail cannon projectiles will no longer deal flyby damage the offending damage that was framerate dependent they will now only apply damage on direct hits but will hit you a bit more often average damage output is about the same as you'd receive at fps fixed various issues where some how to toasts were set to a higher priority than some system messages fixed an issue where ultra combatants' health bars were not consistently respecting colorblind settings ultra-combatant health bars would display inconsistent"]

In [118]:
from pprint import pprint

# extracts answer from the context passage
def extract_answer(question, context):
    results = []
    for c in context:
        # feed the reader the question and contexts to extract answers
        answer = reader(question=question, context=c)
        # add the context to answer dict for printing both together
        answer["preproc_text"] = c
        results.append(answer)
    # sort the result based on the score from reader model
    sorted_result = pprint(sorted(results, key=lambda x: x["score"], reverse=True))
    return sorted_result

extract_answer(question, context)

[{'answer': 'only apply damage on direct hits',
  'end': 153,
  'preproc_text': 'rail cannon projectiles will no longer deal flyby damage '
                  'the offending damage that was framerate dependent they will '
                  'now only apply damage on direct hits but will hit you a bit '
                  "more often average damage output is about the same as you'd "
                  'receive at fps fixed various issues where some how to '
                  'toasts were set to a higher priority than some system '
                  "messages fixed an issue where ultra combatants' health bars "
                  'were not consistently respecting colorblind settings '
                  'ultra-combatant health bars would display inconsistent',
  'score': 0.005786481779068708,
  'start': 121}]


In [119]:
question = "What were the changes to graviton"
context = get_context(question, top_k=1)
extract_answer(question, context)

[{'answer': 'graviton',
  'end': 122,
  'preproc_text': ' and keyboard recoil penalty from to arbalest reduced '
                  'damage vs champions by will still break barriers in one hit '
                  'graviton lance catalyst changed from granting hidden hand '
                  'to granting vorpal weapon and turnabout grand overture '
                  'reduced time between bursts when in missile mode holding '
                  'the trigger will now fire all missiles in a continuous '
                  'burst tapping will fire -round bursts wavesplitter - void '
                  'update picking up an orb of power now grants s of maximum '
                  'power and caps at s up from and respectively now',
  'score': 0.23234397172927856,
  'start': 114}]


In [120]:
question = "who won guardian games?"
context = get_context(question, top_k=1)
extract_answer(question, context)

[{'answer': 'guardian games triumphs',
  'end': 58,
  'preproc_text': 'layed as winner for the event some guardian games triumphs '
                  'are activating earlier than intended as a result the titles '
                  'tab now flashes as though something is unlocked these '
                  'triumphs will be able to be claimed when guardian games '
                  'launches in two weeks the zone control playlist doesn t '
                  'display an increased crucible rank modifier dying in the '
                  'hypernet current strike s boss room can sometimes get a '
                  'player s ghost stuck in the floor quitter penalties were '
                  'erroneously enabled with launch for completing matches',
  'score': 1.445486130791096e-08,
  'start': 35}]


In [121]:
question = "what were the most recent changes to stasis titan?"
context = get_context(question, top_k=3)
extract_answer(question, context)

[{'answer': 'changes',
  'end': 512,
  'preproc_text': ' through hostile titan barricades barricades now slightly '
                  "protrude into the ground to better protect the titan's feet "
                  'on uneven ground this should reduce instances where '
                  'explosions and projectiles are able to sneak through the '
                  'bottom of the barricade and hit the titan behemoth as you '
                  'may remember midway through last season we released a '
                  'portion of the stasis-related changes originally intended '
                  'for this releasen we re pretty happy with the effect those '
                  'changes have had on the crucible but some of the changes',
  'score': 2.4935166820228005e-08,
  'start': 505},
 {'answer': 'replace barricades with just a single ice block',
  'end': 449,
  'preproc_text': 'h this one takes a chillier approach to combat for stasis '
                  'users with the glacial fortificati