In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import gspread
from google.oauth2.service_account import Credentials
from gspread_dataframe import set_with_dataframe

In [23]:
def get_english_dictionary():
    """Retrieve English dictionary from GitHub."""
    dict_url = "https://github.com/dwyl/english-words/raw/master/words_alpha.txt"
    response = requests.get(dict_url)
    # Strip both \n and \r characters
    words = set(word.strip() for word in response.text.lower().strip().split('\n'))
    return words

def return_words(web_page):
    """Count English words on a web page."""
    try:
        # Fetch and parse the web page
        response = requests.get(web_page)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract text and clean it
        text = soup.get_text()
        text = text.lower()
        text = re.sub(r'[",:.\\n]', '', text)
        
        # Split into words and count those in the dictionary
        page_words = text.split()

        english_count = sum(1 for word in page_words if word in english_dictionary)
        return english_count
    except:
        return 0

def main():
    """Main function to process URLs and update Google Sheet."""
    # Set up Google Sheets authentication
    # You'll need to create a service account and download credentials.json
    scopes = [
        'https://www.googleapis.com/auth/spreadsheets',
        'https://www.googleapis.com/auth/drive'
    ]
    
    credentials = Credentials.from_service_account_file(
        'gtm-543z2mjn-yjqzn-f18077312981.json',
        scopes=scopes
    )
    
    gc = gspread.authorize(credentials)
    
    # Open the sheet
    sheet_url = 'https://docs.google.com/spreadsheets/d/1z8aZvcrRkiHb9hvb29gD-q8dUXsDDQXBSKxhGKXUR5Q/edit?usp=sharing'
    spreadsheet = gc.open_by_url(sheet_url)
    
    # Read data from the sheet
    worksheet = spreadsheet.sheet1  # Assuming URLs are in the first sheet
    page_data = pd.DataFrame(worksheet.get_all_records())
    
    # Get English dictionary
    global english_dictionary
    english_dictionary = get_english_dictionary()
        
    # Process each URL and count words
    page_data['words'] = page_data['page_location'].apply(return_words)
    
    print(page_data)
    
    # Write results to a new sheet
    try:
        output_sheet = spreadsheet.worksheet('output')
    except:
        output_sheet = spreadsheet.add_worksheet(title='output', rows=100, cols=20)
    
    set_with_dataframe(output_sheet, page_data)
    print("Processing complete. Results written to 'output' sheet.")

if __name__ == "__main__":
    main()

                                        page_location  scrolls  words
0   https://news.wm.edu/2025/04/08/william-mary-co...     5765    724
1   https://news.wm.edu/announcements/notice-to-th...     5131    341
2   https://news.wm.edu/2025/04/08/william-mary-co...     4428    724
3                                https://news.wm.edu/     4026    485
4   https://news.wm.edu/2025/04/09/roll-out-the-gr...     2611    732
..                                                ...      ...    ...
94  https://news.wm.edu/2025/04/18/william-marys-b...      179    902
95  https://news.wm.edu/2025/04/16/virginias-exper...      178    756
96  https://news.wm.edu/2025/03/31/ncis-leader-alu...      175    839
97  https://news.wm.edu/announcements/notice-to-th...      169    289
98  https://news.wm.edu/2025/04/21/sean-galloway-s...      163    751

[99 rows x 3 columns]
Processing complete. Results written to 'output' sheet.
