In [15]:
import requests
import json
from pprint import pprint
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import os
from importlib.machinery import SourceFileLoader
import pinecone

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

config = SourceFileLoader("config", "config.py").load_module()
os.environ['OPENAI_API_KEY'] = config.OPENAI_API_KEY

bungie_api_key = config.BUNGIE_API_KEY
endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
page_token = "0"
include_body = True
headers = {
    "X-API-Key": bungie_api_key
}
params = {
    "includebody": include_body
}

results = []

while page_token is not None:
    response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

    if response.status_code == 200:
        json_response = response.json()
        if 'NewsArticles' in json_response['Response']:
            results.extend(json_response['Response']['NewsArticles'])
        else:
            print("No NewsArticles found in the response.")
            break
        page_token = json_response['Response']['NextPaginationToken']
    else:
        print("Error:", response.status_code)
        print("Response:", response.text)
        break


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyError: 'NextPaginationToken'

In [17]:
print(f"Total Results: {len(results)}")

Total Results: 1014


In [18]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)

# Display the DataFrame
df.iloc[0]


Title                                               Destiny 2 Hotfix 6/02/23
Link                             /7/en/News/Article/destiny-2-hotfix-6-02-23
PubDate                                                 2023-06-03T04:52:00Z
UniqueIdentifier                                         blt612fd6dfe9162c9d
Description                                            The one about a horn.
HtmlContent                <h4>GENERAL</h4><ul><li>Due to an issue, the K...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
Name: 0, dtype: object

In [19]:
df["Title"]

0                 Destiny 2 Hotfix 6/02/23
1         This Week At Bungie - 06/01/2023
2                 Destiny 2 Update 7.1.0.1
3                 Community Focus - Plumli
4                 Destiny 2 Hotfix 5/26/23
                       ...                
1009           Potentially Asked Questions
1010             Inside the new Bungie.net
1011                        This is a Test
1012    Happy 10th Anniversary, Xbox LIVE!
1013              Breaking In - Adam Brown
Name: Title, Length: 1014, dtype: object

In [20]:
df = df[df['Title'].str.startswith(('This Week At Bungie', 'Destiny 2'))]
df = df.reset_index()
# Print the filtered DataFrame
df['Title']


0                         Destiny 2 Hotfix 6/02/23
1                 This Week At Bungie - 06/01/2023
2                         Destiny 2 Update 7.1.0.1
3                         Destiny 2 Hotfix 5/26/23
4      Destiny 2 Update 7.1.0 - Season of the Deep
                          ...                     
330               This Week At Bungie - 03/03/2016
331               This Week At Bungie - 02/25/2016
332               This Week At Bungie - 02/18/2016
333               This Week At Bungie – 02/11/2016
334               This Week At Bungie – 02/04/2016
Name: Title, Length: 335, dtype: object

In [21]:
# Create a new column 'clean_text' in the DataFrame
df['clean_text'] = ''

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    html_content = row['HtmlContent']
    
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Retrieve the text content using the .get_text() method
    clean_text = soup.get_text().strip()  # Apply strip() to remove leading/trailing whitespaces
    
    # Assign the clean text to the 'clean_text' column of the current row
    df.at[index, 'clean_text'] = clean_text

df.iloc[0]

index                                                                      0
Title                                               Destiny 2 Hotfix 6/02/23
Link                             /7/en/News/Article/destiny-2-hotfix-6-02-23
PubDate                                                 2023-06-03T04:52:00Z
UniqueIdentifier                                         blt612fd6dfe9162c9d
Description                                            The one about a horn.
HtmlContent                <h4>GENERAL</h4><ul><li>Due to an issue, the K...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
clean_text                 GENERALDue to an issue, the Khepri's Horn exot...
Name: 0, dtype: object

In [25]:
# Create a new column 'preproc_text' in the DataFrame
df['preproc_text'] = ''

# Convert text to lowercase
df['preproc_text'] = df['clean_text'].str.lower()

# Remove special characters and numbers
df['preproc_text'] = df['preproc_text'].apply(lambda text: re.sub(r'[^a-zA-Z\'-]', ' ', text))

# Remove redundant whitespace
df['preproc_text'] = df['preproc_text'].apply(lambda text: re.sub(r'\s+', ' ', text.strip()))


# Tokenization
#df['preproc_text'] = df['preproc_text'].apply(lambda text: nltk.word_tokenize(text))

# Remove stopwords
#stopwords_set = set(stopwords.words('english'))
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [token for token in tokens if token not in stopwords_set])

# Lemmatization
#lemmatizer = WordNetLemmatizer()
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

# Join tokens back into sentences
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: ' '.join(tokens))
df.iloc[0]

index                                                                      1
Title                                       This Week At Bungie - 06/01/2023
Link                                      /7/en/News/Article/06_01_2023_twab
PubDate                                                 2023-06-01T18:00:00Z
UniqueIdentifier                                         blt36382dc929299957
Description                This week at Bungie, we fought the ghosts of t...
HtmlContent                <p>Happy TWABsday, Guardians! How are you doin...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
clean_text                 Happy TWABsday, Guardians! How are you doing!?...
preproc_text               happy twabsday guardians how are you doing how...
Name: 1, dtype: object

In [125]:
selected_columns = ['Title', 'PubDate', 'UniqueIdentifier', 'Description', 'preproc_text']
subset_df = df[selected_columns].head(100)

In [127]:

def split_rows(df, max_length=384, overlap=20):
    rows = []
    for _, row in df.iterrows():
        text = row['preproc_text']
        num_chunks = (len(text) - max_length) // (max_length - overlap) + 1

        for i in range(num_chunks):
            start = i * (max_length - overlap)
            end = start + max_length

            # Adjust the end position to avoid splitting words
            if end < len(text) and not text[end].isspace():
                while end < len(text) and not text[end].isspace():
                    end -= 1

            new_row = row.copy()
            new_row['preproc_text'] = text[start:end]
            rows.append(new_row)

    return pd.DataFrame(rows)

split_df = split_rows(subset_df)

# Print the resulting split DataFrame
print(split_df)

                               Title               PubDate   
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z  \
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z   
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z   
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z   
1   This Week At Bungie - 06/01/2023  2023-06-01T18:00:00Z   
..                               ...                   ...   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   
99            Destiny 2 Update 3.0.1  2020-12-08T16:59:39Z   

       UniqueIdentifier                                        Description   
1   blt36382dc929299957  This week at Bungie, we fought the ghosts of t...  \
1   blt36382dc929299957  This week at Bungie, we fought the ghosts of t...   
1   blt36382dc9292999

In [128]:
len(split_df)

2868

In [129]:
pinecone_api_key = config.pinecone_api_key

In [130]:
# connect to pinecone environment
pinecone.init(
    api_key = pinecone_api_key,
    environment = "us-west1-gcp-free"
)


In [131]:
index_name = "extractive-question-answering"

In [132]:
pinecone.list_indexes()

['extractive-question-answering']

In [133]:
pinecone.delete_index("extractive-question-answering")

In [134]:
# check if the extractive-question-answering index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=768,
        metric="cosine"
    )

# connect to extractive-question-answering index we created
index = pinecone.Index(index_name)

In [135]:
import torch
from sentence_transformers import SentenceTransformer

# set device to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load the retriever model from huggingface model hub
retriever = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [136]:
from tqdm.auto import tqdm
# we will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(split_df), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(split_df))
    # extract batch
    batch = split_df.iloc[i:i_end]
    # generate embeddings for batch
    emb = retriever.encode(batch["preproc_text"].tolist()).tolist()
    # get metadata
    meta = batch.to_dict(orient="records")
    # create unique IDs
    ids = [f"{idx}" for idx in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# check that we have all vectors in index
index.describe_index_stats()

100%|██████████| 45/45 [01:42<00:00,  2.28s/it]


{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2816}},
 'total_vector_count': 2816}

In [137]:
from transformers import pipeline

model_name = "deepset/electra-base-squad2"
# load the reader model into a question-answering pipeline
reader = pipeline(tokenizer=model_name, model=model_name, task="question-answering", device=device)

In [138]:
# gets context passages from the pinecone index
def get_context(question, top_k):
    # generate embeddings for the question
    xq = retriever.encode([question]).tolist()
    # search pinecone index for context passage with the answer
    xc = index.query(xq, top_k=top_k, include_metadata=True)
    # extract the context passage from pinecone search result
    c = [x["metadata"]["preproc_text"] for x in xc["matches"]]
    return c

question = "how much damage do aggressive frame smgs do now?"
context = get_context(question, top_k = 1)
context

["e an airborne accuracy penalty are there any changes to aggressive aka spread fusion rifles no they're not affected by any of the fusion rifle tuning do all smgs benefit from the damage falloff increase yes do all smgs with lower than zoom now have no only those listed in the twab have been raised to what happened to the full-auto setting we mentioned that this is coming in a"]

In [139]:
from pprint import pprint

# extracts answer from the context passage
def extract_answer(question, context):
    results = []
    for c in context:
        # feed the reader the question and contexts to extract answers
        answer = reader(question=question, context=c)
        # add the context to answer dict for printing both together
        answer["preproc_text"] = c
        results.append(answer)
    # sort the result based on the score from reader model
    sorted_result = pprint(sorted(results, key=lambda x: x["score"], reverse=True))
    return sorted_result

extract_answer(question, context)

[{'answer': 'damage falloff increase yes do all smgs with lower than zoom now '
            'have',
  'end': 248,
  'preproc_text': 'e an airborne accuracy penalty are there any changes to '
                  "aggressive aka spread fusion rifles no they're not affected "
                  'by any of the fusion rifle tuning do all smgs benefit from '
                  'the damage falloff increase yes do all smgs with lower than '
                  'zoom now have no only those listed in the twab have been '
                  'raised to what happened to the full-auto setting we '
                  'mentioned that this is coming in a',
  'score': 5.2150696955299836e-11,
  'start': 179}]


In [140]:
question = "What were the changes to graviton"
context = get_context(question, top_k=1)
extract_answer(question, context)

[{'answer': 'glaive melee base damage reduced from to grenade launchers',
  'end': 183,
  'preproc_text': 'laive melee damage multipliers reduced by - against '
                  'champions minibosses bosses and vehicles majors and minors '
                  'are unchanged glaive melee base damage reduced from to '
                  'grenade launchers the queenbreaker and grand '
                  'overtureswitched from the old blinding screen effect to the '
                  'new arc blind one with reduced screen effect brightness '
                  'hand cannonrose has new stats increased range from to',
  'score': 0.6456788182258606,
  'start': 125}]


In [141]:
question = "who won guardian games?"
context = get_context(question, top_k=1)
extract_answer(question, context)

[{'answer': 'fireteams of guardians',
  'end': 210,
  'preproc_text': 'ntures to be had and rewards to be won are you pumped yet '
                  'well watch this on top of the overall class vs class main '
                  'event we have also assembled a friendly out-of-game '
                  'competition between fireteams of guardians from across the '
                  'globe we have creators from territories representing their '
                  'favorite class and competing for glory in the guardian '
                  'games cup check out the full details and',
  'score': 2.550037606852129e-06,
  'start': 188}]


In [142]:
question = "what were the most recent changes to stasis titan?"
context = get_context(question, top_k=3)
extract_answer(question, context)

[{'answer': 'non-lethal collision damage changes',
  'end': 272,
  'preproc_text': 'fting table now provide slight stat increases veriglas '
                  "curve fixed an issue causing verglas curve's stasis "
                  'crystals to fail to spawn if shot at a titan bubble tractor '
                  'cannon fixed an issue where tractor cannon was impacted by '
                  'the recent non-lethal collision damage changes now causes '
                  'hit targets to be able to suffer lethal collision damage '
                  'for a brief duration thunderlord fixed an',
  'score': 0.451644629240036,
  'start': 237},
 {'answer': 'now works with stasis subclasses as intended',
  'end': 187,
  'preproc_text': 'pauldrons in pvp game modes reduced super-energy gain by '
                  "shinobu's vow in pvp game modes reduced grenade-energy gain "
                  'per hit by chromatic fire now works with stasis subclasses '
                  'as intended crown of tempe