In [1]:
import requests
import json
from pprint import pprint
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


api_key = "aebb0f3980c0466f8196511f2131b8fc"
endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
page_token = "0"
include_body = True
headers = {
    "X-API-Key": api_key
}
params = {
    "includebody": include_body
}

results = []

while page_token is not None:
    response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

    if response.status_code == 200:
        json_response = response.json()
        if 'NewsArticles' in json_response['Response']:
            results.extend(json_response['Response']['NewsArticles'])
        else:
            print("No NewsArticles found in the response.")
            break
        page_token = json_response['Response']['NextPaginationToken']
    else:
        print("Error:", response.status_code)
        print("Response:", response.text)
        break



In [None]:
print(f"Total Results: {len(results)}")

In [24]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)

# Display the DataFrame
df.iloc[0]


Title                                       This Week At Bungie - 05/18/2023
Link                         /7/en/News/Article/this_week_at_bungie_05_18_23
PubDate                                                 2023-05-18T18:00:00Z
UniqueIdentifier                                         blt00659ca627b99132
Description                                            Under the methane sea
HtmlContent                <p>This Week at Bungie, we are saying goodbye ...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
Name: 0, dtype: object

In [25]:
selected_columns = ['Title', 'Description', 'PubDate']

# Display the selected columns DataFrame
print(df[selected_columns].iloc[0])

Title          This Week At Bungie - 05/18/2023
Description               Under the methane sea
PubDate                    2023-05-18T18:00:00Z
Name: 0, dtype: object


In [26]:
df["Title"]

0            This Week At Bungie - 05/18/2023
1            Season 21 Weapon Changes Preview
2             This Week At Bungie – 5/11/2023
3       Season 21 Exotic Armor Tuning Preview
4                    Destiny 2 Hotfix 7.0.5.3
                        ...                  
1000              Potentially Asked Questions
1001                Inside the new Bungie.net
1002                           This is a Test
1003       Happy 10th Anniversary, Xbox LIVE!
1004                 Breaking In - Adam Brown
Name: Title, Length: 1005, dtype: object

In [28]:
from bs4 import BeautifulSoup

# Assuming you have an HTML string stored in the 'html' variable
html = df.iloc[0]["HtmlContent"]

# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Retrieve the text content using the .get_text() method
text_content = soup.get_text()

# Print the extracted text content
print(text_content)


This Week at Bungie, we are saying goodbye to Season of Defiance and welcoming Season of the Deep! In other words, it's the last week of the Season and we have tons of new content dropping in less than five days. So, let's start on the right foot recalling what we talked about last week, shall we? Last week topics: A preview of Season 21 armor and weapons. How the second week of Guardian Games went.We teased the new dungeon. Eververse got a storefront update.And now, what's on the menu for today:An overview of all major changes coming with Season 21. A first look at the three new Strand Aspects. An in-depth preview of the Seasonal Artifact. An update to how enemy shields look.A note from the Engineering Team.Updated standings of the Guardian Games and Guardian Games Cup. Introducing Asians@Bungie! The new Twitch Bounty emblem. Now that you all know what's coming, let's scuba dive into the TWAB, shall we? A TL;DR List of Season 21 Changes With Season of the Deep almost upon us, and with

In [33]:
# Create a new column 'clean_text' in the DataFrame
df['clean_text'] = ''

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    html_content = row['HtmlContent']
    
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Retrieve the text content using the .get_text() method
    clean_text = soup.get_text().strip()  # Apply strip() to remove leading/trailing whitespaces
    
    # Assign the clean text to the 'clean_text' column of the current row
    df.at[index, 'clean_text'] = clean_text

df.iloc[0]

Title                                       This Week At Bungie - 05/18/2023
Link                         /7/en/News/Article/this_week_at_bungie_05_18_23
PubDate                                                 2023-05-18T18:00:00Z
UniqueIdentifier                                         blt00659ca627b99132
Description                                            Under the methane sea
HtmlContent                <p>This Week at Bungie, we are saying goodbye ...
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
clean_text                 This Week at Bungie, we are saying goodbye to ...
Name: 0, dtype: object

In [34]:
df.iloc[100]["clean_text"]

'This week at Bungie, we’re prepping for our big Showcase event later this month, detailed in an earlier TWAB. We’re also working with the community for a nifty new Exotic, detailing what’s next for Iron Banner in Season 18, announcing Rift coming back to regular Crucible, and giving one last call for that glowy glamour goodness with the Solstice event that’s nearing its end.\xa0As we’ve already mentioned, we’re not going to be dropping any major news between now and the August 23 Showcase, but that doesn’t mean we don’t have anything to chat about. For this week, we’re keeping things a little lighter, so let’s start with what ol’ Saladin (oh, excuse me, I mean Valus Forge) has up his sleeves for us with Iron Banner returning next Season.\xa0\xa0Ready for this week’s TWAB? Let’s get into it.\xa0\xa0Iron Banner 4-1-1More Iron Banner is on the way next Season, this time sans Rift. To talk about more about what’s on the horizon, we’ve got Principal Designer Alan Blaine here to dive a litt

In [43]:
# Create a new column 'preproc_text' in the DataFrame
df['preproc_text'] = ''

# Convert text to lowercase
df['preproc_text'] = df['clean_text'].str.lower()

# Remove special characters and numbers
df['preproc_text'] = df['preproc_text'].apply(lambda text: re.sub(r'[^a-zA-Z\'-]', ' ', text))

# Tokenization
df['preproc_text'] = df['preproc_text'].apply(lambda text: nltk.word_tokenize(text))

# Remove stopwords
stopwords_set = set(stopwords.words('english'))
df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [token for token in tokens if token not in stopwords_set])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

# Join tokens back into sentences
df['preproc_text'] = df['preproc_text'].apply(lambda tokens: ' '.join(tokens))


In [45]:
df.iloc[0]["preproc_text"]

"week bungie saying goodbye season defiance welcoming season deep word 's last week season ton new content dropping le five day let 's start right foot recalling talked last week shall last week topic preview season armor weapon second week guardian game went teased new dungeon eververse got storefront update 's menu today overview major change coming season first look three new strand aspect in-depth preview seasonal artifact update enemy shield look note engineering team updated standing guardian game guardian game cup introducing asian bungie new twitch bounty emblem know 's coming let 's scuba dive twab shall tl dr list season change season deep almost upon u information already shared might need refresher worry help economy update economy might sound boring first start noticing many thing game depend well start paying attention fluctuates start season several update destiny economy read detail may twab recap major change rahool exotic engram focusing decrypt focus exotic engram ti