In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import os

def extract_header_and_div_text_from_url(url, timeout=30):
    """Download a page and return its headline and article body as plain text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection; a timeout surfaces as a ``RequestException`` and is
            handled like any other request failure.

    Returns:
        A ``(header_text, div_text)`` tuple of strings.
        ``header_text`` is the text of the first ``<h1>`` element, or
        "No header found" when the page has none. ``div_text`` is the text of
        the first ``<div>`` matched by ``class_='td-post-content tagdiv-type'``,
        or "No text found in specified div" when there is no such element.
        ``("", "")`` is returned when the request or the parsing fails; the
        error is printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
        soup = BeautifulSoup(response.text, 'html.parser')

        # Headline: the first <h1> on the page.
        # separator=' ' keeps the text of adjacent child elements apart;
        # with strip=True alone "<p>one</p><p>two</p>" collapses to "onetwo".
        header = soup.find('h1')
        header_text = header.get_text(separator=' ', strip=True) if header else "No header found"

        # Article body: the <div> carrying the post-content classes.
        div = soup.find('div', class_='td-post-content tagdiv-type')
        div_text = div.get_text(separator=' ', strip=True) if div else "No text found in specified div"

        return header_text, div_text
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return "", ""
    except Exception as e:
        print(f"Error extracting text: {e}")
        return "", ""

csv_file_path = 'Input Sheet1.csv'
output_folder_path = 'ExtractedTexts'  # Folder where individual files will be saved

# Ensure the output folder exists
os.makedirs(output_folder_path, exist_ok=True)

# Each CSV row is expected to hold an article id followed by its URL.
# NOTE(review): if the sheet has a header row it is processed like any other
# row (the fetch fails and an empty file is written) — confirm and skip it if so.
with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # Blank lines come back as [] and short rows cannot be unpacked into
        # (id, url); skip them instead of aborting the whole run.
        if len(row) < 2:
            if row:
                print(f"Skipping malformed row: {row}")
            continue
        # Only the first two columns are used; surrounding whitespace is dropped.
        article_id, url = row[0].strip(), row[1].strip()
        header_text, div_text = extract_header_and_div_text_from_url(url)

        # Creating a unique filename for each article
        filename = os.path.join(output_folder_path, f"{article_id}.txt")

        # The file is written even when extraction failed (empty strings),
        # so every input row leaves a trace in the output folder.
        with open(filename, 'w', encoding='utf-8') as outputfile:
            outputfile.write(f"URL: {url}\n")
            outputfile.write(f"Header: {header_text}\n")
            outputfile.write(f"Div Text: {div_text}\n")

        print(f"Saved: {filename}")

In [None]:
from google.colab import files
import shutil
# NOTE(review): the "cleaned files" folder is created by the stop-word cleaning
# cell (as '/content/cleaned files'); this cell only works after that cell has
# run and while the working directory is /content — confirm the cell order.
# make_archive appends ".zip" itself and returns the path of the archive it
# wrote, so download that path rather than the bare folder name.
archive_path = shutil.make_archive("cleaned files", 'zip', "cleaned files")
files.download(archive_path)

Note: entries 36 and 48 were not found.

In [None]:
import os

# Paths to the directories
source_folder = '/content/ExtractedTexts'
stopwords_folder = '/content/stopwords'
cleaned_folder = '/content/cleaned files'

# Ensure the cleaned files folder exists (exist_ok makes the cell safe to re-run)
os.makedirs(cleaned_folder, exist_ok=True)

# Load stopwords: every regular file in the stopwords folder contributes one
# lower-cased entry per non-blank line.
stopwords = set()
for filename in os.listdir(stopwords_folder):
    stopword_path = os.path.join(stopwords_folder, filename)
    # Check if it's a file, not a directory
    if not os.path.isfile(stopword_path):
        continue
    # Try UTF-8 first and fall back to latin-1, which can decode any byte
    # sequence, when the file is not valid UTF-8.
    try:
        with open(stopword_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        with open(stopword_path, 'r', encoding='latin-1') as file:
            lines = file.readlines()
    stopwords.update(line.strip().lower() for line in lines if line.strip())

# Function to clean a single file
def clean_file(file_path, stopwords):
    """Return the text of ``file_path`` with stop words removed.

    The file is read as UTF-8 (the encoding the extracted article files are
    written with), split on whitespace, and every token whose lower-cased form
    is in ``stopwords`` is dropped. Matching is done on the whole token, so a
    token with attached punctuation (e.g. "the,") is kept.

    Args:
        file_path: Path of the text file to clean.
        stopwords: Collection of lower-cased stop words; a set gives fast lookups.

    Returns:
        The remaining tokens joined by single spaces ("" for an empty file).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    kept_words = [word for word in content.split() if word.lower() not in stopwords]
    return ' '.join(kept_words)

# Clean each file in the source folder and save in the cleaned folder
for filename in os.listdir(source_folder):
    # Only the extracted article text files are processed
    if not filename.endswith('.txt'):
        continue
    cleaned_content = clean_file(os.path.join(source_folder, filename), stopwords)
    # The cleaned copy keeps the original file name inside the cleaned folder
    new_file_path = os.path.join(cleaned_folder, filename)
    # Write as UTF-8 explicitly so the output does not depend on the
    # platform's default encoding (the extracted files are UTF-8 as well).
    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)

In [None]:
import csv
import os
import string

# Paths to the directories
cleaned_folder = '/content/cleaned files'
positive_words_file = '/content/positive-words.txt'
negative_words_file = '/content/negative-words.txt'
results_file = '/content/sentiment-results.csv'


def _load_word_set(path):
    """Return the lower-cased, non-blank lines of ``path`` as a set.

    The file is decoded as UTF-8 first; if that fails it is re-read as
    latin-1, which accepts any byte sequence.
    """
    try:
        with open(path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        with open(path, 'r', encoding='latin-1') as file:
            lines = file.readlines()
    return {line.strip().lower() for line in lines if line.strip()}


# Load positive and negative words (both lexicons get the same encoding
# fallback, so neither depends on the platform's default encoding)
positive_words = _load_word_set(positive_words_file)
negative_words = _load_word_set(negative_words_file)

# Function to analyze sentiment of a single file
def analyze_sentiment(file_path, positive_words, negative_words):
    """Score the sentiment of one text file against two word lexicons.

    The file is read as UTF-8 and split on whitespace. A token counts as a
    lexicon hit when its lower-cased form is in the lexicon, either as written
    or after stripping leading/trailing ASCII punctuation — so "good," and
    "bad." are recognised, while lexicon entries that themselves contain
    punctuation still match verbatim.

    Args:
        file_path: Path of the text file to score.
        positive_words: Collection of lower-cased positive words.
        negative_words: Collection of lower-cased negative words.

    Returns:
        ``(positive_score, negative_score, polarity_score, subjectivity_score)``
        where the scores are hit counts,
        polarity = (pos - neg) / (pos + neg + 0.000001) and
        subjectivity = (pos + neg) / (number of whitespace tokens + 0.000001).
        The small constant keeps both ratios defined when a denominator is zero.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    words = content.split()

    def count_hits(lexicon):
        hits = 0
        for word in words:
            lowered = word.lower()
            bare = lowered.strip(string.punctuation)
            # `bare` may be empty for pure-punctuation tokens; never look that up.
            if lowered in lexicon or (bare and bare in lexicon):
                hits += 1
        return hits

    positive_score = count_hits(positive_words)
    negative_score = count_hits(negative_words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    # Calculate Subjectivity Score
    subjectivity_score = (positive_score + negative_score) / (len(words) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

# Open the results file and write the headers
with open(results_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['File Name', 'Positive Score', 'Negative Score', 'Polarity Score', 'Subjectivity Score'])

    # Analyze sentiment for each cleaned file and write the results.
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the CSV reproducible between runs.
    for filename in sorted(os.listdir(cleaned_folder)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(cleaned_folder, filename)
        scores = analyze_sentiment(file_path, positive_words, negative_words)
        # scores = (positive, negative, polarity, subjectivity), matching the header order
        writer.writerow([filename, *scores])

print(f"Sentiment analysis results saved to {results_file}")

In [None]:
pip install syllables

In [None]:
import os
import csv
import syllables
import re

# Paths to the directories
# Folder holding the stop-word-cleaned article texts (.txt) to analyse.
cleaned_folder = '/content/cleaned files'
# CSV file that receives one row of readability metrics per analysed text file.
results_file = '/content/readability-results.csv'

# Function to calculate readability metrics for a single file
def analyze_readability(file_path):
    """Compute readability metrics for one UTF-8 text file.

    Words are whitespace-separated tokens. Sentences are the non-blank
    fragments obtained by splitting on runs of '.', '!' or '?', so a trailing
    period does not add an empty sentence and an empty file has zero sentences.
    A word is "complex" when ``syllables.estimate`` reports more than two
    syllables for it.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        An 8-tuple:
        ``(average_sentence_length, percentage_complex_words, fog_index,
        num_complex_words, num_words, total_syllables, average_word_length,
        num_personal_pronouns)``.
        Ratios are 0 when their denominator is 0. The fog index is
        0.4 * (average sentence length + percentage of complex words).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    words = content.split()
    # Simple sentence splitting on terminal punctuation; blank fragments
    # (e.g. the one after the final period) are not sentences.
    sentences = [fragment for fragment in re.split(r'[.!?]+', content) if fragment.strip()]
    num_words = len(words)
    num_sentences = len(sentences)

    # Calculate Average Sentence Length
    average_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # Calculate Percentage of Complex Words and count complex words
    num_complex_words = 0
    total_syllables = 0
    total_characters = 0
    for word in words:
        syllable_count = syllables.estimate(word)
        total_syllables += syllable_count
        total_characters += len(word)
        if syllable_count > 2:
            num_complex_words += 1
    percentage_complex_words = (num_complex_words / num_words) * 100 if num_words > 0 else 0

    # Calculate Fog Index
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    # Calculate Average Word Length
    average_word_length = total_characters / num_words if num_words > 0 else 0

    # Calculate Personal Pronouns using regex
    personal_pronoun_pattern = r"\b(I|me|my|mine|myself|we|us|our|ours|ourselves|you|your|yours|yourself|yourselves|he|him|his|himself|she|her|hers|herself|it|its|itself|they|them|their|theirs|themselves)\b"
    # The match is case-insensitive, which would also count the all-caps
    # abbreviation "US"; that spelling is excluded so it is not mistaken
    # for the pronoun "us".
    personal_pronouns = [
        match
        for match in re.findall(personal_pronoun_pattern, content, flags=re.IGNORECASE)
        if match != "US"
    ]
    num_personal_pronouns = len(personal_pronouns)

    return average_sentence_length, percentage_complex_words, fog_index, num_complex_words, num_words, total_syllables, average_word_length, num_personal_pronouns

# Open the results file and write the headers
with open(results_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['File Name', 'Average Sentence Length', 'Percentage of Complex Words', 'Fog Index', 'Complex Word Count', 'Word Count', 'Syllable Count', 'Average Word Length', 'Personal Pronouns'])

    # Analyze readability for each cleaned file and write the results.
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the CSV (and of the printed summary) reproducible between runs.
    for filename in sorted(os.listdir(cleaned_folder)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(cleaned_folder, filename)
        avg_sentence_length, pct_complex_words, fog_index, num_complex_words, num_words, total_syllables, avg_word_length, num_personal_pronouns = analyze_readability(file_path)
        writer.writerow([filename, avg_sentence_length, pct_complex_words, fog_index, num_complex_words, num_words, total_syllables, avg_word_length, num_personal_pronouns])
        print(f"For file {filename}: Complex words: {num_complex_words}, Word count: {num_words}, Syllable count: {total_syllables}, Average word length: {avg_word_length:.2f}, Personal pronouns: {num_personal_pronouns}")

print(f"Readability analysis results saved to {results_file}")

Importing all required libraries.