In [8]:
import sys
!{sys.executable} -m pip install textstat
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install lxml
!{sys.executable} -m pip install html5lib
!{sys.executable} -m pip install tabulate
!{sys.executable} -m pip install textblob

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[3

In [11]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd  # For data handling
import textstat
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import lxml
import os

# Ensure the VADER lexicon is downloaded
nltk.download('vader_lexicon')

# Function to extract text chunks from the webpage, clean them, and compute readability and sentiment statistics
def fetch_and_analyze_readability(url, website_type, stem, department, scraping):
    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        page = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page {url}: {e}")
        return pd.DataFrame()  # Return empty DataFrame

    # Parse the HTML page
    soup = BeautifulSoup(page, "lxml")

    # Remove unwanted elements like scripts, styles, and comments
    for script in soup(["script", "style", "noscript"]):
        script.extract()  # Remove these elements from the soup

    # Remove HTML comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Extract text chunks from paragraphs and headings
    text_chunks = []
    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = element.get_text(strip=True)
        if text:  # Ensure it's not empty
            text_chunks.append(text)

    # Initialize sentiment analyzer
    sid = SentimentIntensityAnalyzer()

    # List to store results
    results = []

    # For each text chunk, compute statistics and sentiment
    for chunk in text_chunks:
        stats = compute_statistics(chunk)
        vader_sentiment = sid.polarity_scores(chunk)
        textblob_sentiment = TextBlob(chunk).sentiment.polarity
        
        result = {
            'URL': url,
            'Website_Type': website_type,
            'STEM': stem,
            'Department': department,
            # 'Subset': subset,
            # 'OutsideDoc': outsidedoc,
            'Time_to_Scrape': scraping,
            # 'Date_Collected': date,
            'Text Chunk': chunk,
            **stats,
            'VADER Sentiment Score': vader_sentiment['compound'],
            'TextBlob Sentiment Score': textblob_sentiment,
        }
        results.append(result)

    # Create DataFrame from results
    df = pd.DataFrame(results)
    return df

# Function to compute readability scores using textstat
def compute_statistics(text):
    stats = {}
    # Calculate various readability metrics
    stats['Flesch Reading Ease'] = textstat.flesch_reading_ease(text)
    stats['Flesch-Kincaid Grade Level'] = textstat.flesch_kincaid_grade(text)
    # stats['SMOG Index'] = textstat.smog_index(text)
    # stats['Gunning Fog Index'] = textstat.gunning_fog(text)
    # stats['Automated Readability Index'] = textstat.automated_readability_index(text)
    # stats['Coleman Liau Index'] = textstat.coleman_liau_index(text)
    # stats['Dale-Chall Readability Score'] = textstat.dale_chall_readability_score(text)
    # stats['Linsear Write Formula'] = textstat.linsear_write_formula(text)
    stats['Difficult Words'] = textstat.difficult_words(text)
    stats['Total Number of Sentences'] = textstat.sentence_count(text)
    stats['Total Number of Words'] = textstat.lexicon_count(text)
    return stats

# Function to read URLs from a newline-separated file and analyze each one
def analyze_urls_from_file(filename):
    try:
        df_urls = pd.read_csv(filename)
        
        # Filter rows where "Website_Type" is one of the specified values
        valid_types = ["Prospective students", "Advising", "Undergraduate Research"]
        df_urls = df_urls[df_urls['Website_Type'].isin(valid_types)]

        # Filter rows where "STEM" is "Yes"
        df_urls = df_urls[df_urls['STEM'] == "Y"]
        
        # Extract the "Institution" value from the pathname of input csv
        institution = os.path.basename(filename).replace("Website_Analysis_Tracking - ", "").replace(".csv", "").strip()
        print(f"Institution extracted from filename: {institution}")

        # Extract cols from the filtered DataFrame
        urls = df_urls['URL'].dropna().tolist()
        website_types = df_urls['Website_Type'].tolist()
        stem = df_urls['STEM'].dropna().tolist()
        department = df_urls['Department'].dropna().tolist()
        # subset = df_urls['Subset'].dropna().tolist()
        # outsidedoc = df_urls['OutsideDoc'].dropna().tolist()
        scrapingtime = df_urls['Time_to_Scrape'].dropna().tolist()
        # date = df_urls['Date_Collected'].dropna().tolist()

    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    # Initialize an empty list to collect DataFrames
    all_results = []

    # Iterate over each URL and analyze readability
    for index, (url, website_type, s, d, st) in enumerate(zip(urls, website_types, stem, department, scrapingtime)):
        print(f"Processing URL {index + 1}: {url} (Type: {website_type})")
        df = fetch_and_analyze_readability(url, website_type, s, d, st)
        if not df.empty:
            all_results.append(df)

    # Concatenate all DataFrames
    if all_results:
        final_df = pd.concat(all_results, ignore_index=True)

        # Construct the output filename using the "Institution" value
        output_filename = f"{institution}_readability_analysis.csv"

        # Output the DataFrame into a markdown file
        final_df.to_csv(output_filename, index=False)
        print(f"Analysis complete. Results saved to {output_filename}.")
    else:
        print("No results to display.")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/keyapanchal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [15]:
# Example of running the analysis with a file
filename = input("Please provide a CSV file containing URLs. This file should be within the data folder: ")  # Path to the file containing the URLs
analyze_urls_from_file(filename)

Institution extracted from filename: UC Berkeley
Processing URL 1: https://are.berkeley.edu/eep/hear-from-our-students (Type: Prospective students)


  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 2: https://are.berkeley.edu/eep/student-outreach (Type: Advising)
Processing URL 3: https://are.berkeley.edu/eep/research-opportunities (Type: Undergraduate Research)
Processing URL 4: https://ced.berkeley.edu/arch/advising (Type: Prospective students)
Processing URL 5: https://astro.berkeley.edu/prospective-students/ (Type: Advising)
Error fetching the page https://astro.berkeley.edu/prospective-students/: 404 Client Error: Not Found for url: https://astro.berkeley.edu/prospective-students/
Processing URL 6: https://astro.berkeley.edu/programs/undergraduate-program/undergraduate-resources/ (Type: Undergraduate Research)
Error fetching the page https://astro.berkeley.edu/programs/undergraduate-program/undergraduate-resources/: 404 Client Error: Not Found for url: https://astro.berkeley.edu/programs/undergraduate-program/undergraduate-resources/
Processing URL 7: https://astro.berkeley.edu/research-facilities/research-opportunities/ (Type: Prospective students)
Error fetc

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 12: https://ib.berkeley.edu/undergrad/advising.php (Type: Undergraduate Research)
Processing URL 13: https://ib.berkeley.edu/undergrad/research.php (Type: Prospective students)
Processing URL 14: https://mcb.berkeley.edu/undergrad/prospective-students (Type: Advising)
Processing URL 15: https://mcb.berkeley.edu/undergrad/advising/advising-office/advising-services (Type: Undergraduate Research)
Processing URL 16: https://mcb.berkeley.edu/undergrad/research (Type: Prospective students)
Processing URL 17: https://physics.berkeley.edu/student-life/student-support-advising (Type: Advising)
Processing URL 18: https://physics.berkeley.edu/academics/undergraduate-research (Type: Undergraduate Research)
Processing URL 19: https://chemistry.berkeley.edu/ugrad/prospective-students (Type: Prospective students)
Processing URL 20: https://chemistry.berkeley.edu/ugrad/student-services (Type: Advising)
Processing URL 21: https://chemistry.berkeley.edu/ugrad/prospective-students (Type: U

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 28: https://statistics.berkeley.edu/academics/undergrad/advising (Type: Prospective students)
Processing URL 29: https://statistics.berkeley.edu/academics/undergrad/research (Type: Advising)
Processing URL 30: https://eps.berkeley.edu/admissions (Type: Undergraduate Research)
Processing URL 31: https://eps.berkeley.edu/student-resources/contact-student-services-advisors (Type: Prospective students)
Processing URL 32: https://econ.berkeley.edu/undergraduate/admissions (Type: Advising)
Processing URL 33: https://econ.berkeley.edu/undergraduate/advising (Type: Undergraduate Research)
Processing URL 34: https://mse.berkeley.edu/mse-major/ (Type: Prospective students)
Error fetching the page https://mse.berkeley.edu/mse-major/: 403 Client Error: Forbidden for url: https://mse.berkeley.edu/mse-major/
Processing URL 35: https://mse.berkeley.edu/advising/ (Type: Advising)
Error fetching the page https://mse.berkeley.edu/advising/: 403 Client Error: Forbidden for url: https://mse

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 41: https://geography.berkeley.edu/academics/undergraduate-studies/advising (Type: Advising)
Processing URL 42: https://ib.berkeley.edu/undergrad/whatisib.php (Type: Undergraduate Research)
Processing URL 43: https://ib.berkeley.edu/undergrad/advising.php (Type: Prospective students)
Processing URL 44: https://ib.berkeley.edu/undergrad/research.php (Type: Advising)
Processing URL 45: https://math.berkeley.edu/undergraduate/advising (Type: Undergraduate Research)
Processing URL 46: https://math.berkeley.edu/undergraduate/undergraduate-research-opportunities (Type: Prospective students)
Processing URL 47: https://nature.berkeley.edu/research/undergraduate-research (Type: Advising)
Processing URL 48: https://mcb.berkeley.edu/undergrad/prospective-students (Type: Undergraduate Research)
Processing URL 49: https://mcb.berkeley.edu/undergrad/advising/advising-office/advising-services (Type: Prospective students)
Processing URL 50: https://mcb.berkeley.edu/undergrad/research (T