Load API key from .env

In [60]:
import os
from dotenv import load_dotenv

# Merge any KEY=value pairs found in a .env file into the process environment.
load_dotenv()

# Look the key up in the environment; the result is None when the variable is not set.
API_KEY = os.environ.get("API_KEY")


Request items from API

In [61]:

import requests
import json

# Endpoint that returns the collection's items as JSON.
URL = "https://drvk.createuky.net/news-articles/api/items"

# Without a timeout, requests waits indefinitely if the server never answers,
# which would hang the notebook on "Run All".
r = requests.get(URL, timeout=30)

# Stop in this cell on a 4xx/5xx response instead of handing an error body
# to the JSON parser in a later cell.
r.raise_for_status()

# NOTE(review): only a single request is made. If this API paginates its results,
# only the first page is analysed — confirm against the API's documentation.


Grab descriptions from each item, clean them

In [62]:
import re

# Pre-compiled patterns used by remove_tags.
newline_pattern = re.compile(r'\n+')   # one or more consecutive line breaks
html_pattern = re.compile(r'<.*?>')    # shortest "<...>" span that contains no line break
nbsp_pattern = re.compile(r'&nbsp;')   # the HTML non-breaking-space entity

def remove_tags(text):
    """Return ``text`` as single-line plain text with HTML tags removed.

    Parameters
    ----------
    text : str
        Raw description text that may contain HTML tags, ``&nbsp;`` entities
        and line breaks.

    Returns
    -------
    str
        The text with every run of line breaks replaced by one space, every
        ``<...>`` tag removed, every ``&nbsp;`` replaced by a space, and
        leading/trailing whitespace stripped.
    """
    # Line breaks become a space rather than being deleted: deleting them fuses
    # the last word of one line with the first word of the next ("foo\nbar" ->
    # "foobar"), which corrupts any later word count. This runs before tag
    # removal so that a tag split across lines still matches html_pattern.
    single_line = newline_pattern.sub(' ', text)
    no_tags = html_pattern.sub('', single_line)
    # strip() keeps whitespace-only input (e.g. only line breaks) mapping to "".
    return nbsp_pattern.sub(' ', no_tags).strip()

# Decode the JSON body of the items response.
items = json.loads(r.text)

# Gather the cleaned text of every "Description" element, skipping any that
# end up empty after cleaning.
cleaned_text = []
for record in items:
    for element_text in record["element_texts"]:
        if element_text["element"]["name"] != "Description":
            continue
        description = remove_tags(element_text["text"]).replace("\n", "")
        if description:
            cleaned_text.append(description)


Imports for frequency analysis

In [None]:
import nltk
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
# Fetch the NLTK data packages this notebook relies on.
# 'punkt_tab' and 'punkt' are Punkt tokenizer data for word_tokenize — presumably both
# are requested so the notebook works across NLTK versions; confirm which one the
# installed version actually needs.
nltk.download('punkt_tab')
nltk.download('punkt')
# Stop-word lists read through nltk.corpus.stopwords.
nltk.download("stopwords")

In [None]:
# Join every cleaned description into one string so it can be tokenized in a single pass.
text = " ".join(cleaned_text)

# Split the combined text into tokens.
words = word_tokenize(text)

# Stop words: NLTK's English list plus any project-specific additions.
stopwords_list = stopwords.words("english")
custom_stopwords = []  # extra words to exclude; use lower case, tokens are lower-cased before lookup
stopwords_list.extend(custom_stopwords)
# One membership test runs per token; a set gives constant-time lookups
# where the list would be scanned from the start every time.
stopwords_set = set(stopwords_list)

# Keep only purely alphabetic tokens (drops punctuation and anything containing
# digits), lower-case each one once, then discard stop words.
lowered_words = (word.lower() for word in words if word.isalpha())
cleaned = [word for word in lowered_words if word not in stopwords_set]

print(f"Total number of words is {len(cleaned)}")

# Count occurrences of each remaining word.
fdist = FreqDist(cleaned)

# Print every distinct word with its count, most frequent first.
for word, freq in fdist.most_common():
    print(word, freq)
