# Parsing Scraped HTML Files
After scraping all files, we parse them to collect each article's title, author, date, body, and category (the category it is filed under on the website).

## You do not need to run this file; the CSV has already been created

In [None]:
import os
from bs4 import BeautifulSoup
import csv

# Define the path to the HTML files (one subfolder per website category)
path = 'the_hill_HTML/'

# Define the headers for the CSV file
headers = ['category', 'title', 'author', 'date', 'body']

# Open the CSV file for writing.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform default; pandas.read_csv assumes UTF-8 when reading it back.
with open('the_hill_articles.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)

    article_count = 0
    errors = 0

    # Loop through each subfolder in the path; the folder name is used as the
    # article category. Sorting makes the row order (and therefore any seeded
    # sample drawn from the CSV later) reproducible across machines.
    for category in sorted(os.listdir(path)):
        subfolder = os.path.join(path, category)

        # Skip stray non-directory entries (e.g. .DS_Store); calling
        # os.listdir on them would raise and abort the whole run.
        if not os.path.isdir(subfolder):
            continue

        # Loop through each HTML file in the subfolder
        for filename in sorted(os.listdir(subfolder)):
            filepath = os.path.join(subfolder, filename)

            # Extract the author, date, title, and body
            try:
                # Read raw bytes and let BeautifulSoup detect the encoding,
                # instead of decoding with the platform default. Opening the
                # file inside the try means an unreadable file is counted as
                # an error rather than stopping the loop.
                with open(filepath, 'rb') as fp:
                    soup = BeautifulSoup(fp, 'html.parser')

                meta_section = soup.find('section', {'class': 'submitted-by | header__meta | text-transform-upper text-300 color-light-gray weight-semibold font-base desktop-only'})
                author = meta_section.find('a').text
                # Keep only the text after the last '-' in the meta span
                date = meta_section.find('span').text.strip().split('-')[-1].strip()
                header_section = soup.find('section', class_='article__header')
                title = header_section.find('h1', class_='page-title').text.strip()
                body = soup.select_one('.article__text').text.strip()

                # Write the extracted information to the CSV file
                writer.writerow([category, title, author, date, body])

                article_count += 1

            except Exception as e:
                # Best effort: any missing element (find() returning None) or
                # read failure skips this file and is tallied below.
                print(f'Error extracting information from file {filepath}: {e}')

                errors += 1

    print('COMPLETE')

    print('Total Articles Collected: ', article_count)

    print('Total Errors Collecting Articles: ', errors)
    
    


### Total Articles Collected: 8436
### Total Errors Collecting Articles: 61

# After parsing the HTML files into a CSV, we tokenize the body text and generate a random sample of 2000 sentences for our classifier

## Reading new CSV in

In [None]:
import pandas as pd
# Load the parsed articles produced by the HTML-parsing step
df = pd.read_csv(filepath_or_buffer='the_hill_articles.csv')


For many articles the author was missing and the text of the "Facebook" and "Twitter" share icons was captured in its place; we can clear these values from the dataframe.

In [4]:
# Text captured from the share widget on articles that had no author
share_widget_text = '\n\n\nFacebook\n\n\n\n\n\nShare\n'

# Blank out the author on every row that holds exactly that placeholder text
mask = df['author'].eq(share_widget_text)
df.loc[mask, 'author'] = ''

## Tokenize body text into sentences

In [6]:
import nltk
from nltk.tokenize import sent_tokenize

In [7]:
# Split each article body into a list of sentences.
# An empty body field is read back from the CSV as NaN (a float), which
# sent_tokenize cannot process, so missing bodies are treated as empty text.
df['sentences'] = df['body'].fillna('').apply(sent_tokenize)


## Iterating through the sentences to create a new df with one row per sentence and its category

In [9]:
# Flatten the per-article sentence lists into one record per sentence,
# carrying the article's category along with each sentence. Articles are
# visited in dataframe order and sentences in their original order.
new_rows = [
    {'category': article_category, 'sentence': text}
    for article_category, article_sentences in zip(df['category'], df['sentences'])
    for text in article_sentences
]

# Build the sentence-level dataframe from the collected records
new_df = pd.DataFrame(new_rows, columns=['category', 'sentence'])


In [19]:
# Many sentences contain odd Unicode characters; apply NFKD normalization (plus a UTF-8 round trip) to try to clean them up

import unicodedata

def normalize_text(text):
    """Return ``text`` in NFKD form after a UTF-8 encode/decode round trip.

    NFKD applies compatibility decomposition, so ligatures, non-breaking
    spaces and accented letters are split into their component code points.
    The round trip uses the 'ignore' error handler, so it only drops code
    points that cannot be encoded as UTF-8; other non-ASCII characters
    (e.g. curly quotes) are kept.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    utf8_bytes = decomposed.encode('utf-8', 'ignore')
    return utf8_bytes.decode('utf-8')


# Normalize every sentence element-wise, overwriting the column in place
new_df['sentence'] = new_df['sentence'].map(normalize_text)


## Getting a random sample of 2000 sentences

In [22]:
# Draw 2000 sentences at random; the fixed seed keeps the draw repeatable
sample_df = new_df.sample(random_state=1, n=2000)

In [23]:
# Display the sampled dataframe for a quick visual check
sample_df

Unnamed: 0,category,sentence
32372,business,Someone forward you this newsletter?
133229,opinion,"He was impeached (indicted, in a sense) for th..."
204539,policy,Crypto’s market cap is sitting right around $1...
47389,opinion,When it comes to fighting and ultimately defea...
122803,opinion,"Weapons of war, such as AR-style firearms and ..."
...,...,...
200868,policy,"Beyond seasonal mood changes, recent years hav..."
165693,opinion,Transmission is occurring mainly in health car...
25972,business,"), whose state has legalized recreational mari..."
133487,opinion,"Carolyn Kissane, Ph.D., is assistant dean of t..."


In [25]:
# Save the sampled sentences to disk, leaving the dataframe index out of the file
sample_df.to_csv(path_or_buf='sample_sentences.csv', index=False)
