In [17]:
import sys
!{sys.executable} -m pip install textstat
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install lxml
!{sys.executable} -m pip install html5lib
!{sys.executable} -m pip install tabulate
!{sys.executable} -m pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [18]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd  # For data handling
import textstat
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import lxml

# Ensure the VADER lexicon is downloaded
nltk.download('vader_lexicon')

# Function to extract text chunks from the webpage, clean them, and compute readability and sentiment statistics
def fetch_and_analyze_readability(url, website_type):
    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        page = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page {url}: {e}")
        return pd.DataFrame()  # Return empty DataFrame

    # Parse the HTML page
    soup = BeautifulSoup(page, "lxml")

    # Remove unwanted elements like scripts, styles, and comments
    for script in soup(["script", "style", "noscript"]):
        script.extract()  # Remove these elements from the soup

    # Remove HTML comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Extract text chunks from paragraphs and headings
    text_chunks = []
    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = element.get_text(strip=True)
        if text:  # Ensure it's not empty
            text_chunks.append(text)

    # Initialize sentiment analyzer
    sid = SentimentIntensityAnalyzer()

    # List to store results
    results = []

    # For each text chunk, compute statistics and sentiment
    for chunk in text_chunks:
        stats = compute_statistics(chunk)
        vader_sentiment = sid.polarity_scores(chunk)
        textblob_sentiment = TextBlob(chunk).sentiment.polarity
        
        result = {
            'URL': url,
            'Website_Type': website_type,
            'Text Chunk': chunk,
            **stats,
            'VADER Sentiment Score': vader_sentiment['compound'],
            'TextBlob Sentiment Score': textblob_sentiment,
        }
        results.append(result)
        # sentiment = sid.polarity_scores(chunk)
        # # Determine sentiment category
        # if sentiment['compound'] >= 0.05:
        #     sentiment_category = 'Positive'
        # elif sentiment['compound'] <= -0.05:
        #     sentiment_category = 'Negative'
        # else:
        #     sentiment_category = 'Neutral'
        # # Combine all results into a dictionary
        # result = {
        #     'URL': url,
        #     'Website_Type': website_type,
        #     'Text Chunk': chunk,
        #     **stats,  # Unpack the stats dictionary
        #     'Sentiment (VADER)': sentiment['compound'],
        #     'Sentiment (VADER) Category': sentiment_category
        # }
        # results.append(result)

    # Create DataFrame from results
    df = pd.DataFrame(results)
    return df

# Function to compute readability scores using textstat
def compute_statistics(text):
    stats = {}
    # Calculate various readability metrics
    stats['Flesch Reading Ease'] = textstat.flesch_reading_ease(text)
    stats['Flesch-Kincaid Grade Level'] = textstat.flesch_kincaid_grade(text)
    # stats['SMOG Index'] = textstat.smog_index(text)
    # stats['Gunning Fog Index'] = textstat.gunning_fog(text)
    # stats['Automated Readability Index'] = textstat.automated_readability_index(text)
    # stats['Coleman Liau Index'] = textstat.coleman_liau_index(text)
    # stats['Dale-Chall Readability Score'] = textstat.dale_chall_readability_score(text)
    # stats['Linsear Write Formula'] = textstat.linsear_write_formula(text)
    stats['Difficult Words'] = textstat.difficult_words(text)
    stats['Total Number of Sentences'] = textstat.sentence_count(text)
    stats['Total Number of Words'] = textstat.lexicon_count(text)
    return stats

# Function to read URLs from a newline-separated file and analyze each one
def analyze_urls_from_file(filename):
    try:
        df_urls = pd.read_csv(filename)
        
        # Filter rows where "Website_Type" is one of the specified values
        valid_types = ["Prospective students", "Advising", "Undergraduate Research"]
        df_urls = df_urls[df_urls['Website_Type'].isin(valid_types)]
        
        # Extract the "Institution" value from the first row
        if not df_urls.empty:
            institution = df_urls.iloc[0]['Institution']
        else:
            print("No valid rows found after filtering.")
            return
        
        # Extract URLs and Website_Type from the filtered DataFrame
        urls = df_urls['URL'].dropna().tolist()
        website_types = df_urls['Website_Type'].tolist()
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    # Initialize an empty list to collect DataFrames
    all_results = []

    # Iterate over each URL and analyze readability
    for index, (url, website_type) in enumerate(zip(urls, website_types)):
        print(f"Processing URL {index + 1}: {url} (Type: {website_type})")
        df = fetch_and_analyze_readability(url, website_type)
        if not df.empty:
            all_results.append(df)

    # Concatenate all DataFrames
    if all_results:
        final_df = pd.concat(all_results, ignore_index=True)

        # Construct the output filename using the "Institution" value
        output_filename = f"{institution}_readability_analysis.csv"

        # Output the DataFrame into a markdown file
        final_df.to_csv(output_filename, index=False)
        print(f"Analysis complete. Results saved to {output_filename}.")
    else:
        print("No results to display.")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/keyapanchal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [20]:
# Example of running the analysis with a file
filename = input("Please provide a CSV file containing URLs. This file should be within the data folder: ")  # Path to the file containing the URLs
analyze_urls_from_file(filename)

Processing URL 1: https://africam.berkeley.edu/undergraduate-program/undergraduate-advising/ (Type: Prospective students)


  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 2: https://africam.berkeley.edu/academic-opportunities/ (Type: Advising)
Processing URL 3: https://are.berkeley.edu/eep/hear-from-our-students (Type: Undergraduate Research)
Processing URL 4: https://are.berkeley.edu/eep/student-outreach (Type: Prospective students)
Processing URL 5: https://are.berkeley.edu/eep/research-opportunities (Type: Advising)
Processing URL 6: https://as.ugis.berkeley.edu/declaring-the-major/ (Type: Undergraduate Research)
Processing URL 7: https://as.ugis.berkeley.edu/program-resources/ (Type: Prospective students)
Processing URL 8: https://dagrs.berkeley.edu/index.php/graduate/prospective-students (Type: Advising)
Processing URL 9: https://dagrs.berkeley.edu/undergraduate/advising (Type: Undergraduate Research)
Processing URL 10: https://dagrs.berkeley.edu/undergraduate/research-opportunities-and-study-abroad (Type: Prospective students)
Processing URL 11: https://anthropology.berkeley.edu/undergraduate-program/prospective-students (Type: Advi

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 18: https://astro.berkeley.edu/programs/undergraduate-program/undergraduate-resources/ (Type: Undergraduate Research)
Processing URL 19: https://astro.berkeley.edu/research-facilities/research-opportunities/ (Type: Prospective students)
Processing URL 20: https://bioeng.berkeley.edu/undergrad (Type: Advising)
Error fetching the page https://bioeng.berkeley.edu/undergrad: 403 Client Error: Forbidden for url: https://bioeng.berkeley.edu/undergrad
Processing URL 21: https://bioeng.berkeley.edu/undergrad/advising (Type: Undergraduate Research)
Error fetching the page https://bioeng.berkeley.edu/undergrad/advising: 403 Client Error: Forbidden for url: https://bioeng.berkeley.edu/undergrad/advising
Processing URL 22: https://bioeng.berkeley.edu/undergrad/undergradresearch (Type: Prospective students)
Error fetching the page https://bioeng.berkeley.edu/undergrad/undergradresearch: 403 Client Error: Forbidden for url: https://bioeng.berkeley.edu/undergrad/undergradresearch
Proce

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 24: https://ib.berkeley.edu/undergrad/advising.php (Type: Undergraduate Research)
Processing URL 25: https://ib.berkeley.edu/undergrad/research.php (Type: Prospective students)
Processing URL 26: https://mcb.berkeley.edu/undergrad/prospective-students (Type: Advising)
Processing URL 27: https://mcb.berkeley.edu/undergrad/advising/advising-office/advising-services (Type: Undergraduate Research)
Processing URL 28: https://mcb.berkeley.edu/undergrad/research (Type: Prospective students)
Processing URL 29: https://physics.berkeley.edu/student-life/student-support-advising (Type: Advising)
Processing URL 30: https://physics.berkeley.edu/academics/undergraduate-research (Type: Undergraduate Research)
Processing URL 31: https://chemistry.berkeley.edu/ugrad/prospective-students (Type: Prospective students)
Processing URL 32: https://chemistry.berkeley.edu/ugrad/student-services (Type: Advising)
Processing URL 33: https://chemistry.berkeley.edu/ugrad/prospective-students (Type: U

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 41: https://statistics.berkeley.edu/academics/undergrad/research (Type: Advising)
Processing URL 42: https://eps.berkeley.edu/admissions (Type: Undergraduate Research)
Processing URL 43: https://eps.berkeley.edu/student-resources/contact-student-services-advisors (Type: Prospective students)
Processing URL 44: https://econ.berkeley.edu/undergraduate/admissions (Type: Advising)
Processing URL 45: https://econ.berkeley.edu/undergraduate/advising (Type: Undergraduate Research)
Processing URL 46: https://mse.berkeley.edu/mse-major/ (Type: Prospective students)
Error fetching the page https://mse.berkeley.edu/mse-major/: 403 Client Error: Forbidden for url: https://mse.berkeley.edu/mse-major/
Processing URL 47: https://mse.berkeley.edu/advising/ (Type: Advising)
Error fetching the page https://mse.berkeley.edu/advising/: 403 Client Error: Forbidden for url: https://mse.berkeley.edu/advising/
Processing URL 48: https://me.berkeley.edu/undergraduate/prospective-students/ (Type:

  for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):


Processing URL 53: https://ced.berkeley.edu/advising/undergraduate-advising (Type: Advising)
Processing URL 54: https://ced.berkeley.edu/city/degrees-admissions (Type: Undergraduate Research)
Processing URL 55: https://ced.berkeley.edu/city/advising (Type: Prospective students)
Processing URL 56: https://geography.berkeley.edu/academics/undergraduate-studies/why-choose-geography (Type: Advising)
Processing URL 57: https://geography.berkeley.edu/academics/undergraduate-studies/advising (Type: Undergraduate Research)
Processing URL 58: https://ib.berkeley.edu/undergrad/whatisib.php (Type: Prospective students)
Processing URL 59: https://ib.berkeley.edu/undergrad/advising.php (Type: Advising)
Processing URL 60: https://ib.berkeley.edu/undergrad/research.php (Type: Undergraduate Research)
Processing URL 61: https://math.berkeley.edu/undergraduate/advising (Type: Prospective students)
Processing URL 62: https://math.berkeley.edu/undergraduate/undergraduate-research-opportunities (Type: Advi