In [6]:
import requests
import json
from pprint import pprint
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


api_key = "aebb0f3980c0466f8196511f2131b8fc"
endpoint = "https://www.bungie.net/Platform/Content/Rss/NewsArticles/{pageToken}/"
page_token = "0"
include_body = True
headers = {
    "X-API-Key": api_key
}
params = {
    "includebody": include_body
}

results = []

while page_token is not None:
    response = requests.get(endpoint.format(pageToken=page_token), headers=headers, params=params)

    if response.status_code == 200:
        json_response = response.json()
        if 'NewsArticles' in json_response['Response']:
            results.extend(json_response['Response']['NewsArticles'])
        else:
            print("No NewsArticles found in the response.")
            break
        page_token = json_response['Response']['NextPaginationToken']
    else:
        print("Error:", response.status_code)
        print("Response:", response.text)
        break



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...


KeyError: 'NextPaginationToken'

In [7]:
print(f"Total Results: {len(results)}")

Total Results: 1006


In [8]:
# Convert the results list into a DataFrame
df = pd.DataFrame(results)

# Display the DataFrame
df.iloc[0]


Title                                      Community Focus - Claire Corcoran
Link                        /7/en/News/Article/community-focus-cosplay-corco
PubDate                                                 2023-05-19T00:00:00Z
UniqueIdentifier                                         bltc1422795bcebd539
Description                Have you ever wanted to be an Eliksni? How abo...
HtmlContent                <p>Community Focus time? Community Focus time....
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
Name: 0, dtype: object

In [9]:
selected_columns = ['Title', 'Description', 'PubDate']

# Display the selected columns DataFrame
print(df[selected_columns].iloc[0])

Title                          Community Focus - Claire Corcoran
Description    Have you ever wanted to be an Eliksni? How abo...
PubDate                                     2023-05-19T00:00:00Z
Name: 0, dtype: object


In [10]:
df["Title"]

0           Community Focus - Claire Corcoran
1            This Week At Bungie - 05/18/2023
2            Season 21 Weapon Changes Preview
3             This Week At Bungie – 5/11/2023
4       Season 21 Exotic Armor Tuning Preview
                        ...                  
1001              Potentially Asked Questions
1002                Inside the new Bungie.net
1003                           This is a Test
1004       Happy 10th Anniversary, Xbox LIVE!
1005                 Breaking In - Adam Brown
Name: Title, Length: 1006, dtype: object

In [12]:
from bs4 import BeautifulSoup

# Assuming you have an HTML string stored in the 'html' variable
html = df.iloc[0]["HtmlContent"]

# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Retrieve the text content using the .get_text() method
text_content = soup.get_text()

# Print the extracted text content
print(text_content)


Community Focus time? Community Focus time. This time, we’re diving back into the epic world of cosplay to chat with one creative Guardian that loves to put her best Eliksni foot forward. And other feet, obviously, but Eliksni... yeah, you’ll see what we’re talking about here in today’s blog post.Meet Claire Corcoran, a cosplayer that shows just how cool it would be if Destiny 2 joined us in the real world while also inviting us to question our own sanity when asking us one very important question: why do I want to be an Eliksni so bad after seeing her in a full, completely realistic costume?!Let’s dive in, shall we?You are making my day by sharing more of your cosplay with us, thank you for joining the Community Focus family! Before we start, tell us a little more about yourself and how you got into Destiny and the wonder that is cosplay.Thank you for reaching out, this is so flattering! Hi, I’m Claire! I guess the biggest thing about me is I’ve always been into art and fantasy and so

In [13]:
# Create a new column 'clean_text' in the DataFrame
df['clean_text'] = ''

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    html_content = row['HtmlContent']
    
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Retrieve the text content using the .get_text() method
    clean_text = soup.get_text().strip()  # Apply strip() to remove leading/trailing whitespaces
    
    # Assign the clean text to the 'clean_text' column of the current row
    df.at[index, 'clean_text'] = clean_text

df.iloc[0]

Title                                      Community Focus - Claire Corcoran
Link                        /7/en/News/Article/community-focus-cosplay-corco
PubDate                                                 2023-05-19T00:00:00Z
UniqueIdentifier                                         bltc1422795bcebd539
Description                Have you ever wanted to be an Eliksni? How abo...
HtmlContent                <p>Community Focus time? Community Focus time....
ImagePath                  https://images.contentstack.io/v3/assets/blte4...
OptionalMobileImagePath    https://images.contentstack.io/v3/assets/blte4...
clean_text                 Community Focus time? Community Focus time. Th...
Name: 0, dtype: object

In [14]:
df.iloc[100]["clean_text"]

"Happy Friday, Guardians! Welcome back to one of our favorite segments on Bungie.net, this week’s Community Focus!\xa0 \xa0I know I tend to say this almost every time I write something here, but it still blows my mind that (for those of us reading this) we all have found each other because of this game. We have our “in real life” moments: graduating high school, moving out for the first time, finding a partner, or even having children. And there is something special about the fact that a lot of these moments are also shared in-game with our fireteam, clan, and community. So, thank you once again for sharing these stories with us, and please continue to do so because we really love writing them.\xa0Okay, enough cheesiness for now. This week we are hanging out over in the UK with an Aussie you may know!\xa0\xa0Hello and welcome! First, let's start with your name, pronouns, and a little bit about you.\xa0\xa0G’day! I am BonaFideHiro, he/him, but everyone calls me Bona or Hiro, but never F

In [15]:
# Create a new column 'preproc_text' in the DataFrame
df['preproc_text'] = ''

# Convert text to lowercase
df['preproc_text'] = df['clean_text'].str.lower()

# Remove special characters and numbers
df['preproc_text'] = df['preproc_text'].apply(lambda text: re.sub(r'[^a-zA-Z\'-]', ' ', text))

# Tokenization
df['preproc_text'] = df['preproc_text'].apply(lambda text: nltk.word_tokenize(text))

# Remove stopwords
stopwords_set = set(stopwords.words('english'))
df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [token for token in tokens if token not in stopwords_set])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['preproc_text'] = df['preproc_text'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

# Join tokens back into sentences
df['preproc_text'] = df['preproc_text'].apply(lambda tokens: ' '.join(tokens))


In [16]:
df.iloc[0]["preproc_text"]

"community focus time community focus time time diving back epic world cosplay chat one creative guardian love put best eliksni foot forward foot obviously eliksni yeah see talking today blog post meet claire corcoran cosplayer show cool would destiny joined u real world also inviting u question sanity asking u one important question want eliksni bad seeing full completely realistic costume let dive shall making day sharing cosplay u thank joining community focus family start tell u little got destiny wonder cosplay thank reaching flattering hi claire guess biggest thing always art fantasy sought way merge two day art teacher night gamer cosplayer getting cosplay probably inevitable took long time kid loved wearing costume playing pretend brother sister would make prop weapon fort good stuff really liked let creativity run wild college learned prop fabrication film student jump cosplay happened watched transformer prime loved character starscream much decided take leap build costume ye