In [6]:
import sys
!{sys.executable} -m pip install textstat
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install lxml
!{sys.executable} -m pip install html5lib
!{sys.executable} -m pip install tabulate

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [None]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd  # For data handling
import textstat
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import lxml

# Ensure the VADER lexicon is downloaded
nltk.download('vader_lexicon')

# Function to extract text chunks from the webpage, clean them, and compute readability and sentiment statistics
def fetch_and_analyze_readability(url, website_type):
    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        page = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page {url}: {e}")
        return pd.DataFrame()  # Return empty DataFrame

    # Parse the HTML page
    soup = BeautifulSoup(page, "lxml")

    # Remove unwanted elements like scripts, styles, and comments
    for script in soup(["script", "style", "noscript"]):
        script.extract()  # Remove these elements from the soup

    # Remove HTML comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Extract text chunks from paragraphs and headings
    text_chunks = []
    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = element.get_text(strip=True)
        text_element = element
        if text:  # Ensure it's not empty
            text_chunks.append(text)

    # Initialize sentiment analyzer
    sid = SentimentIntensityAnalyzer()

    # List to store results
    results = []

    # For each text chunk, compute statistics and sentiment
    for chunk in text_chunks:
        stats = compute_statistics(chunk)
        sentiment = sid.polarity_scores(chunk)
        # Determine sentiment category
        if sentiment['compound'] >= 0.05:
            sentiment_category = 'Positive'
        elif sentiment['compound'] <= -0.05:
            sentiment_category = 'Negative'
        else:
            sentiment_category = 'Neutral'
        # Combine all results into a dictionary
        result = {
            'URL': url,
            'Website_Type': website_type,
            'Text Chunk': chunk,
            'HTML Element': text_element,
            **stats,  # Unpack the stats dictionary
            'Sentiment': sentiment['compound'],
            'Sentiment Category': sentiment_category
        }
        results.append(result)

    # Create DataFrame from results
    df = pd.DataFrame(results)
    return df

# Function to compute readability scores using textstat
def compute_statistics(text):
    stats = {}
    # Calculate various readability metrics
    stats['Flesch Reading Ease'] = textstat.flesch_reading_ease(text)
    stats['Flesch-Kincaid Grade Level'] = textstat.flesch_kincaid_grade(text)
    # stats['SMOG Index'] = textstat.smog_index(text)
    # stats['Gunning Fog Index'] = textstat.gunning_fog(text)
    # stats['Automated Readability Index'] = textstat.automated_readability_index(text)
    # stats['Coleman Liau Index'] = textstat.coleman_liau_index(text)
    # stats['Dale-Chall Readability Score'] = textstat.dale_chall_readability_score(text)
    # stats['Linsear Write Formula'] = textstat.linsear_write_formula(text)
    stats['Difficult Words'] = textstat.difficult_words(text)
    stats['Total Number of Sentences'] = textstat.sentence_count(text)
    stats['Total Number of Words'] = textstat.lexicon_count(text)
    return stats

# Function to read URLs from a newline-separated file and analyze each one
def analyze_urls_from_file(filename):
    try:
        df_urls = pd.read_csv(filename)
        
        # Filter rows where "Website_Type" is one of the specified values
        valid_types = ["Prospective students", "Advising", "Undergraduate Research"]
        df_urls = df_urls[df_urls['Website_Type'].isin(valid_types)]
        
        # Extract the "Institution" value from the first row
        if not df_urls.empty:
            institution = df_urls.iloc[0]['Institution']
        else:
            print("No valid rows found after filtering.")
            return
        
        # Extract URLs and Website_Type from the filtered DataFrame
        urls = df_urls['URL'].dropna().tolist()
        website_types = df_urls['Website_Type'].tolist()
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    # Initialize an empty list to collect DataFrames
    all_results = []

    # Iterate over each URL and analyze readability
    for index, (url, website_type) in enumerate(zip(urls, website_types)):
        print(f"Processing URL {index + 1}: {url} (Type: {website_type})")
        df = fetch_and_analyze_readability(url, website_type)
        if not df.empty:
            all_results.append(df)

    # Concatenate all DataFrames
    if all_results:
        final_df = pd.concat(all_results, ignore_index=True)

        # Construct the output filename using the "Institution" value
        output_filename = f"{institution}_readability_analysis.csv"

        # Output the DataFrame into a markdown file
        final_df.to_csv(output_filename, index=False)
        print(f"Analysis complete. Results saved to {output_filename}.")
    else:
        print("No results to display.")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/keyapanchal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
# Example of running the analysis with a file
filename = input("Please provide a CSV file containing URLs. This file should be within the data folder: ")  # Path to the file containing the URLs
analyze_urls_from_file(filename)

Processing URL 1: https://africam.berkeley.edu/


  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 2: https://africam.berkeley.edu/major-and-minor-requirements/
Processing URL 3: https://africam.berkeley.edu/undergraduate-program/undergraduate-advising/
Processing URL 4: https://africam.berkeley.edu/academic-opportunities/
Processing URL 5: https://africam.berkeley.edu/academic-opportunities/
Processing URL 6: https://africam.berkeley.edu/about-the-program/
Processing URL 7: https://africam.berkeley.edu/major-and-minor-requirements/
Processing URL 8: https://are.berkeley.edu/
Processing URL 9: https://are.berkeley.edu/eep/major
Processing URL 10: https://are.berkeley.edu/eep/hear-from-our-students
Processing URL 11: https://are.berkeley.edu/eep/student-outreach
Processing URL 12: https://are.berkeley.edu/about/research-areas
Processing URL 13: https://are.berkeley.edu/eep/research-opportunities
Processing URL 14: https://are.berkeley.edu/diversity-equity-inclusion
Processing URL 15: https://are.berkeley.edu/about/letter-chair
Processing URL 16: https://are.berkeley.ed

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 63: https://astro.berkeley.edu/programs/undergraduate-program/
Processing URL 64: https://astro.berkeley.edu/prospective-students/
Processing URL 65: https://astro.berkeley.edu/programs/undergraduate-program/undergraduate-resources/
Processing URL 66: https://astro.berkeley.edu/research-facilities/research-opportunities/
Processing URL 67: https://astro.berkeley.edu/research-facilities/research-opportunities/
Processing URL 68: https://astro.berkeley.edu/about/diversity-and-climate/
Processing URL 69: https://astro.berkeley.edu/about/
Processing URL 70: https://astro.berkeley.edu/about/history/
Processing URL 71: https://astro.berkeley.edu/courses-category/undergraduate-spring-2020/
Processing URL 72: https://bioeng.berkeley.edu/
Error fetching the page https://bioeng.berkeley.edu/: 403 Client Error: Forbidden for url: https://bioeng.berkeley.edu/
Processing URL 73: https://bioeng.berkeley.edu/undergrad
Error fetching the page https://bioeng.berkeley.edu/undergrad: 403 C

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 81: https://ib.berkeley.edu/undergrad
Processing URL 82: https://ib.berkeley.edu/undergrad/major/declaring.php
Processing URL 83: https://ib.berkeley.edu/undergrad/advising.php
Processing URL 84: https://ib.berkeley.edu/research
Processing URL 85: https://ib.berkeley.edu/undergrad/research.php
Processing URL 86: https://ib.berkeley.edu/diversity
Processing URL 87: https://ib.berkeley.edu/about
Processing URL 88: https://ib.berkeley.edu/academics/courses
Processing URL 89: https://mcb.berkeley.edu/
Processing URL 90: https://mcb.berkeley.edu/undergrad
Processing URL 91: https://mcb.berkeley.edu/undergrad/prospective-students
Processing URL 92: https://mcb.berkeley.edu/undergrad/advising/advising-office/advising-services
Processing URL 93: https://mcb.berkeley.edu/faculty-and-research
Processing URL 94: https://mcb.berkeley.edu/undergrad/research
Processing URL 95: https://mcb.berkeley.edu/diversity
Processing URL 96: https://mcb.berkeley.edu/about-the-department
Processin

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 141: https://statistics.berkeley.edu/academics/undergrad/program
Processing URL 142: https://statistics.berkeley.edu/academics/undergrad/prospective
Processing URL 143: https://statistics.berkeley.edu/academics/undergrad/advising
Processing URL 144: https://statistics.berkeley.edu/research/overview
Processing URL 145: https://statistics.berkeley.edu/academics/undergrad/research
Processing URL 146: https://statistics.berkeley.edu/about/dei
Processing URL 147: https://statistics.berkeley.edu/about/overview
Processing URL 148: https://statistics.berkeley.edu/courses
Processing URL 149: https://eps.berkeley.edu/home
Processing URL 150: https://eps.berkeley.edu/student-resources/undergraduate-students
Processing URL 151: https://eps.berkeley.edu/admissions
Processing URL 152: https://eps.berkeley.edu/student-resources/contact-student-services-advisors
Processing URL 153: https://eps.berkeley.edu/research-news
Processing URL 154: https://eps.berkeley.edu/about/diversity-inclus

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 156: https://eps.berkeley.edu/sample-landing-page
Processing URL 157: https://econ.berkeley.edu/
Processing URL 158: https://econ.berkeley.edu/undergraduate
Processing URL 159: https://econ.berkeley.edu/undergraduate/admissions
Processing URL 160: https://econ.berkeley.edu/undergraduate/advising
Processing URL 161: https://econ.berkeley.edu/research
Processing URL 162: https://econ.berkeley.edu/about/climate-equity-diversity-inclusion
Processing URL 163: https://econ.berkeley.edu/about
Processing URL 164: https://econ.berkeley.edu/courses
Processing URL 165: https://ieor.berkeley.edu/
Error fetching the page https://ieor.berkeley.edu/: 403 Client Error: Forbidden for url: https://ieor.berkeley.edu/
Processing URL 166: https://ieor.berkeley.edu/academics/#Undergraduate
Error fetching the page https://ieor.berkeley.edu/academics/#Undergraduate: 403 Client Error: Forbidden for url: https://ieor.berkeley.edu/academics/#Undergraduate
Processing URL 167: https://ieor.berkeley.

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 195: https://ced.berkeley.edu/land/degrees-admissions/bachelor-of-arts-landscape-architecture
Processing URL 196: https://ced.berkeley.edu/land/degrees-admissions
Processing URL 197: https://ced.berkeley.edu/advising/undergraduate-advising
Processing URL 198: https://ced.berkeley.edu/land/about
Processing URL 199: https://ced.berkeley.edu/land/courses
Processing URL 200: https://ced.berkeley.edu/city
Processing URL 201: https://ced.berkeley.edu/city/degrees-admissions/bachelor-of-arts
Processing URL 202: https://ced.berkeley.edu/city/degrees-admissions
Processing URL 203: https://ced.berkeley.edu/city/advising
Processing URL 204: https://ced.berkeley.edu/city/about
Processing URL 205: https://ced.berkeley.edu/city/courses
Processing URL 206: https://geography.berkeley.edu/home
Processing URL 207: https://geography.berkeley.edu/academics/undergraduate-studies
Processing URL 208: https://geography.berkeley.edu/academics/undergraduate-studies/why-choose-geography
Processing