In [8]:
import sys
!{sys.executable} -m pip install textstat
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install lxml
!{sys.executable} -m pip install html5lib
!{sys.executable} -m pip install tabulate
!{sys.executable} -m pip install textblob

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[3

In [14]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd  # For data handling
import textstat
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import lxml
import os

# Ensure the VADER lexicon is downloaded
nltk.download('vader_lexicon')

# Function to extract text chunks from the webpage, clean them, and compute readability and sentiment statistics
def fetch_and_analyze_readability(url, website_type, stem, department, scraping, institution):
    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        page = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page {url}: {e}")
        return pd.DataFrame()  # Return empty DataFrame

    # Parse the HTML page
    soup = BeautifulSoup(page, "lxml")

    # Remove unwanted elements like scripts, styles, and comments
    for script in soup(["script", "style", "noscript"]):
        script.extract()

    # Remove HTML comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Extract text chunks from paragraphs and headings
    text_chunks = []
    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = element.get_text(strip=True)
        if text:  # Ensure it's not empty
            text_chunks.append(text)

    # Initialize sentiment analyzer
    sid = SentimentIntensityAnalyzer()

    # List to store results
    results = []

    # For each text chunk, compute statistics and sentiment
    for chunk in text_chunks:
        stats = compute_statistics(chunk)
        vader_sentiment = sid.polarity_scores(chunk)
        textblob_sentiment = TextBlob(chunk).sentiment.polarity
        
        result = {
            'URL': url,
            'Institution': institution,  # Include Institution
            'Department': department,    # Include Department
            'Website_Type': website_type,
            'STEM': stem,
            'Time_to_Scrape': scraping,
            'Text Chunk': chunk,
            **stats,
            'VADER Sentiment Score': vader_sentiment['compound'],
            'TextBlob Sentiment Score': textblob_sentiment
        }
        results.append(result)

    # Create DataFrame from results
    df = pd.DataFrame(results)
    return df

# Function to compute readability scores using textstat
def compute_statistics(text):
    stats = {}
    # Calculate various readability metrics
    stats['Flesch Reading Ease'] = textstat.flesch_reading_ease(text)
    stats['Flesch-Kincaid Grade Level'] = textstat.flesch_kincaid_grade(text)
    stats['Difficult Words'] = textstat.difficult_words(text)
    stats['Total Number of Sentences'] = textstat.sentence_count(text)
    stats['Total Number of Words'] = textstat.lexicon_count(text)
    return stats

# Function to read URLs from a newline-separated file and analyze each one
def analyze_urls_from_file(filename):
    try:
        df = pd.read_csv(filename)
        
        # Filter rows where "Website_Type" is one of the specified values
        valid_types = ["Prospective students", "Advising", "Undergraduate Research"]
        df = df[df['Website_Type'].isin(valid_types)]

        # Filter rows where "STEM" == "Y" and "Outside_Dept" == "Y"
        df = df[(df['STEM'] == "Y") & (df['OutsideDoc'] != "Y")]
        
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    # Initialize an empty list to collect DataFrames
    all_results = []

    # Iterate over each row in the filtered DataFrame
    for index, row in df.iterrows():
        url = row['URL']
        website_type = row['Website_Type']
        stem = row['STEM']
        department = row['Department']
        scraping = row['Time_to_Scrape']
        institution = row['Institution']

        # Skip rows with missing URLs
        if pd.isna(url) or url == "":
            print(f"Skipping row {index + 1} due to missing URL.")
            continue

        print(f"Processing URL {index + 1}: {url} (Type: {website_type}, Institution: {institution})")
        analyzed_df = fetch_and_analyze_readability(url, website_type, stem, department, scraping, institution)
        if not analyzed_df.empty:
            all_results.append(analyzed_df)

    # Concatenate all DataFrames
    if all_results:
        final_df = pd.concat(all_results, ignore_index=True)

        # Construct the output filename using the input filename
        institution_filename = os.path.basename(filename).replace("Website_Analysis_Tracking - ", "").replace(".csv", "").strip()
        output_filename = f"{institution_filename}_readability_analysis.csv"

        # Output the DataFrame into a CSV file
        final_df.to_csv(output_filename, index=False)
        print(f"Analysis complete. Results saved to {output_filename}.")
    else:
        print("No results to display.")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/keyapanchal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
# Example of running the analysis with a file
filename = input("Please provide a CSV file containing URLs. This file should be within the data folder: ")
analyze_urls_from_file(filename)

Processing URL 3: https://anthropology.ucsd.edu/undergraduate-studies/prospective-transfer-students.html (Type: Prospective students, Institution: UC San Diego)


  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 4: https://anthropology.ucsd.edu/about-us/contact/advising-hours.html (Type: Advising, Institution: UC San Diego)
Error fetching the page https://anthropology.ucsd.edu/about-us/contact/advising-hours.html: 404 Client Error: Not Found for url: https://anthropology.ucsd.edu/about-us/contact/advising-hours.html
Skipping row 6 due to missing URL.
Processing URL 23: https://astro.ucsd.edu/undergraduate/incoming-students/index.html (Type: Prospective students, Institution: UC San Diego)


  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 24: https://astro.ucsd.edu/undergraduate/academic-advising/index.html (Type: Advising, Institution: UC San Diego)
Processing URL 26: https://astro.ucsd.edu/undergraduate/research-opportunities/index.html (Type: Undergraduate Research, Institution: UC San Diego)
Processing URL 33: https://be.ucsd.edu/undergrad/prospective-students (Type: Prospective students, Institution: UC San Diego)
Processing URL 34: https://be.ucsd.edu/undergrad/advising (Type: Advising, Institution: UC San Diego)
Processing URL 36: https://be.ucsd.edu/undergrad/research (Type: Undergraduate Research, Institution: UC San Diego)
Processing URL 43: https://biology.ucsd.edu/education/undergrad/admission/index.html (Type: Prospective students, Institution: UC San Diego)
Processing URL 44: https://biology.ucsd.edu/education/undergrad/advising/index.html (Type: Advising, Institution: UC San Diego)
Processing URL 46: https://biology.ucsd.edu/education/undergrad/research/index.html (Type: Undergraduate Resea

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 276: https://physics.ucsd.edu/students/undergraduate/advising (Type: Advising, Institution: UC San Diego)
Processing URL 278: https://physics.ucsd.edu/students/undergraduate/research (Type: Undergraduate Research, Institution: UC San Diego)
Processing URL 295: https://psychology.ucsd.edu/undergraduate-program/new-students/index.html (Type: Prospective students, Institution: UC San Diego)
Processing URL 296: https://psychology.ucsd.edu/undergraduate-program/advising/index.html (Type: Advising, Institution: UC San Diego)
Processing URL 298: https://psychology.ucsd.edu/undergraduate-program/research/index.html (Type: Undergraduate Research, Institution: UC San Diego)
Analysis complete. Results saved to UCSD_readability_analysis.csv.
