# Finds every .html file in a directory, extracts the readable text of each, and compiles it all into one .txt file.
# Also computes the 100 most frequent words and their counts.
from bs4 import BeautifulSoup
from collections import Counter
import os, glob
dir_path = 'C:\\Users\\Leon\\Desktop\\posts'  # path of posts

# One location for the compiled text, so the writer and the reader below
# always refer to the same file (relative to the current working directory).
result_path = "results.txt"

# Text mode with an explicit encoding: soup.get_text() returns str, which a
# binary-mode handle would reject. The with-block guarantees the file is
# flushed and closed before it is read back.
with open(result_path, "w", encoding="utf-8") as result_file:
    # os.path.join builds the "<dir>/*.html" pattern; glob expands it to the
    # matching files. sorted() makes the compiled output order deterministic.
    for file_name in sorted(glob.glob(os.path.join(dir_path, "*.html"))):
        with open(file_name, encoding="utf-8") as html_file:
            # Use bs4 to parse the HTML document and keep only readable text.
            soup = BeautifulSoup(html_file, "html.parser")
            text_body = soup.get_text()
        result_file.write(text_body)
        # Separate documents so the last word of one post does not fuse with
        # the first word of the next.
        result_file.write("\n")

# Get the top 100 words and their counts, in 2 separate sequences.
with open(result_path, encoding="utf-8") as file:
    # Split on whitespace: Counter over the raw string would count single
    # characters rather than words.
    text = file.read().split()

word_counts = Counter(text)
top_list = dict(word_counts.most_common(100))

if top_list:
    keys, values = zip(*top_list.items())
else:
    # No words found (no posts, or empty posts): zip(*[]) yields nothing to
    # unpack, so fall back to two empty tuples.
    keys, values = (), ()