In [83]:
import requests
import json
from pprint import pprint
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings

# Ignore specific warning(s)
warnings.filterwarnings("ignore")

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


api_key = "aebb0f3980c0466f8196511f2131b8fc"
endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
page_token = "0"
include_body = True
headers = {
    "X-API-Key": api_key
}
params = {
    "includebody": include_body
}

results = []

while page_token is not None:
    response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

    if response.status_code == 200:
        json_response = response.json()
        if 'NewsArticles' in json_response['Response']:
            results.extend(json_response['Response']['NewsArticles'])
        else:
            print("No NewsArticles found in the response.")
            break
        page_token = json_response['Response']['NextPaginationToken']
    else:
        print("Error:", response.status_code)
        print("Response:", response.text)
        break

warnings.filterwarnings("default")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jordan.arlan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jordan.arlan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jordan.arlan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyError: 'NextPaginationToken'

In [84]:
print(f"Total Results: {len(results)}")

Total Results: 1013


In [85]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)

# Display the DataFrame
df.iloc[0]


Title                                       This Week At Bungie - 06/01/2023
Link                                      /7/en/News/Article/06_01_2023_twab
PubDate                                                 2023-06-01T18:00:00Z
UniqueIdentifier                                         blt36382dc929299957
Description                This week at Bungie, we fought the ghosts of t...
HtmlContent                <p>Happy TWABsday, Guardians! How are you doin...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
Name: 0, dtype: object

In [86]:
selected_columns = ['Title', 'Description', 'PubDate']

# Display the selected columns DataFrame
print(df[selected_columns].iloc[0])

Title                           This Week At Bungie - 06/01/2023
Description    This week at Bungie, we fought the ghosts of t...
PubDate                                     2023-06-01T18:00:00Z
Name: 0, dtype: object


In [87]:
df["Title"]

0         This Week At Bungie - 06/01/2023
1                 Destiny 2 Update 7.1.0.1
2                 Community Focus - Plumli
3                 Destiny 2 Hotfix 5/26/23
4         This Week at Bungie - 05/25/2023
                       ...                
1008           Potentially Asked Questions
1009             Inside the new Bungie.net
1010                        This is a Test
1011    Happy 10th Anniversary, Xbox LIVE!
1012              Breaking In - Adam Brown
Name: Title, Length: 1013, dtype: object

In [88]:
df = df[df['Title'].str.startswith(('This Week At Bungie', 'Destiny 2'))]
df = df.reset_index()
# Print the filtered DataFrame
df['Title']


0                 This Week At Bungie - 06/01/2023
1                         Destiny 2 Update 7.1.0.1
2                         Destiny 2 Hotfix 5/26/23
3      Destiny 2 Update 7.1.0 - Season of the Deep
4                 This Week At Bungie - 05/18/2023
                          ...                     
329               This Week At Bungie - 03/03/2016
330               This Week At Bungie - 02/25/2016
331               This Week At Bungie - 02/18/2016
332               This Week At Bungie – 02/11/2016
333               This Week At Bungie – 02/04/2016
Name: Title, Length: 334, dtype: object

In [89]:
from bs4 import BeautifulSoup


In [90]:
# Create a new column 'clean_text' in the DataFrame
df['clean_text'] = ''

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    html_content = row['HtmlContent']
    
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Retrieve the text content using the .get_text() method
    clean_text = soup.get_text().strip()  # Apply strip() to remove leading/trailing whitespaces
    
    # Assign the clean text to the 'clean_text' column of the current row
    df.at[index, 'clean_text'] = clean_text

df.iloc[0]

index                                                                      0
Title                                       This Week At Bungie - 06/01/2023
Link                                      /7/en/News/Article/06_01_2023_twab
PubDate                                                 2023-06-01T18:00:00Z
UniqueIdentifier                                         blt36382dc929299957
Description                This week at Bungie, we fought the ghosts of t...
HtmlContent                <p>Happy TWABsday, Guardians! How are you doin...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
clean_text                 Happy TWABsday, Guardians! How are you doing!?...
Name: 0, dtype: object

In [91]:
df.iloc[1]

index                                                                      1
Title                                               Destiny 2 Update 7.1.0.1
Link                                       /7/en/News/Article/update_7_1_0_1
PubDate                                                 2023-06-01T17:00:00Z
UniqueIdentifier                                         bltb24b43770c04fe3a
Description                                  The one about the firing delay.
HtmlContent                <h2>Activities</h2><h3>Crucible</h3><ul><li>Fi...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath                                                  NaN
clean_text                 ActivitiesCrucibleFixed an issue where all pla...
Name: 1, dtype: object

In [92]:
df.iloc[0]["clean_text"]

'Happy TWABsday, Guardians! How are you doing!? How many fish have you caught? Have you skedaddled through the dungeon yet? What’s your favorite thing so far about the Season? What about... [deep breaths] Ok, ok, it\'s time to calm down. It’s just something special to see all your social posts tackling Ghosts of the Deep, working with Sloane, and did I mention fishing already? It gets us pretty hyped.Anyhoo, let’s get back on topic. Here is what we shared last week and what we are going to read about today.Last week in TWAB:Marathon was announced!Our new PlayStation crossover.Guardian Games has concluded.Save the Dates for Season of the Deep!Cutscenes for days.Guardian Ranks reminder.Aquanaut is now available.This week we’ve got:Ghosts of the Deep recap.Dungeon-themed items in the Bungie Store.Pride@Bungie announcement.M:\\STARTPrime Gaming update.The weekly update from the Player Support Team.Weekly Art of the Week and Movie of the week picks.The DungeonWere you one of the Guardians w

In [93]:
# Create a new column 'preproc_text' in the DataFrame
df['preproc_text'] = ''

# Convert text to lowercase
df['preproc_text'] = df['clean_text'].str.lower()

# Remove special characters and numbers
df['preproc_text'] = df['preproc_text'].apply(lambda text: re.sub(r'[^a-zA-Z\'-]', ' ', text))

# Remove redundant whitespace
df['preproc_text'] = df['preproc_text'].apply(lambda text: re.sub(r'\s+', ' ', text.strip()))


# Tokenization
#df['preproc_text'] = df['preproc_text'].apply(lambda text: nltk.word_tokenize(text))

# Remove stopwords
#stopwords_set = set(stopwords.words('english'))
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [token for token in tokens if token not in stopwords_set])

# Lemmatization
#lemmatizer = WordNetLemmatizer()
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

# Join tokens back into sentences
#df['preproc_text'] = df['preproc_text'].apply(lambda tokens: ' '.join(tokens))
df.iloc[1]

index                                                                      1
Title                                               Destiny 2 Update 7.1.0.1
Link                                       /7/en/News/Article/update_7_1_0_1
PubDate                                                 2023-06-01T17:00:00Z
UniqueIdentifier                                         bltb24b43770c04fe3a
Description                                  The one about the firing delay.
HtmlContent                <h2>Activities</h2><h3>Crucible</h3><ul><li>Fi...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath                                                  NaN
clean_text                 ActivitiesCrucibleFixed an issue where all pla...
preproc_text               activitiescruciblefixed an issue where all pla...
Name: 1, dtype: object

In [94]:
df.iloc[0]["preproc_text"]

"happy twabsday guardians how are you doing how many fish have you caught have you skedaddled through the dungeon yet what s your favorite thing so far about the season what about deep breaths ok ok it's time to calm down it s just something special to see all your social posts tackling ghosts of the deep working with sloane and did i mention fishing already it gets us pretty hyped anyhoo let s get back on topic here is what we shared last week and what we are going to read about today last week in twab marathon was announced our new playstation crossover guardian games has concluded save the dates for season of the deep cutscenes for days guardian ranks reminder aquanaut is now available this week we ve got ghosts of the deep recap dungeon-themed items in the bungie store pride bungie announcement m startprime gaming update the weekly update from the player support team weekly art of the week and movie of the week picks the dungeonwere you one of the guardians who dove deep into a tit

In [100]:
import os
import openai
import pinecone
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain


In [99]:
df = df[['Title', 'PubDate', 'UniqueIdentifier', 'Description','preproc_text']]
df.iloc[0]

Title                                This Week At Bungie - 06/01/2023
PubDate                                          2023-06-01T18:00:00Z
UniqueIdentifier                                  blt36382dc929299957
Description         This week at Bungie, we fought the ghosts of t...
preproc_text        happy twabsday guardians how are you doing how...
Name: 0, dtype: object

In [101]:

loader = DataFrameLoader(df, page_content_column="preproc_text")
loader.load()

[Document(page_content="happy twabsday guardians how are you doing how many fish have you caught have you skedaddled through the dungeon yet what s your favorite thing so far about the season what about deep breaths ok ok it's time to calm down it s just something special to see all your social posts tackling ghosts of the deep working with sloane and did i mention fishing already it gets us pretty hyped anyhoo let s get back on topic here is what we shared last week and what we are going to read about today last week in twab marathon was announced our new playstation crossover guardian games has concluded save the dates for season of the deep cutscenes for days guardian ranks reminder aquanaut is now available this week we ve got ghosts of the deep recap dungeon-themed items in the bungie store pride bungie announcement m startprime gaming update the weekly update from the player support team weekly art of the week and movie of the week picks the dungeonwere you one of the guardians w

In [104]:
from config import OPENAI_API_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY


ImportError: cannot import name 'OPENAI_API_KEY' from 'config' (/Users/jordan.arlan/Documents/Python Scratch/config.py)

In [102]:
from langchain.indexes import VectorstoreIndexCreator
index = VectorstoreIndexCreator().from_loaders([loader])



ValidationError: 1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)

In [None]:
query = "What did the president say about Ketanji Brown Jackson"
index.query(query)

In [None]:
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  docs = text_splitter.split_documents(documents)
  return docs

docs = split_docs(documents)
print(len(docs))


AttributeError: 'str' object has no attribute 'page_content'