[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Analysis saved to /content/Output Data Structure.xlsx


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
!pip install textstat
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob
import textstat
import re
import nltk
# Fetch the NLTK data packages this cell relies on; each call runs on every
# execution of the cell.
# NOTE(review): punkt / punkt_tab are presumably the models behind the
# sent_tokenize / word_tokenize functions imported above -- confirm.
# NOTE(review): wordnet / omw-1.4 are not referenced directly anywhere in this
# cell; presumably they are needed by TextBlob -- verify before removing.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

def load_words(file_path, encoding='ISO-8859-1'):
    """Read a word list from disk, one entry per line.

    Every line is stripped of surrounding whitespace, and lines that are
    empty after stripping are discarded.  Duplicates collapse because the
    result is a set.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        The set of non-empty stripped lines, or an empty set if anything
        goes wrong (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as handle:
            stripped_lines = map(str.strip, handle)
            # filter(None, ...) drops the empty strings left by blank lines.
            return set(filter(None, stripped_lines))
    except Exception as e:
        print(f"Error loading words from {file_path}: {e}")
        return set()

# Sentiment lexicons used by analyze_text: the positive list is read first,
# then the negative list.  Either one is an empty set if its file cannot be
# loaded.
positive_words, negative_words = (
    load_words(lexicon_path)
    for lexicon_path in ('/content/positive-words.txt', '/content/negative-words.txt')
)

def extract_article(url):
    """Download a web page and pull out its headline and body text.

    The title is the text of the first ``<h1>``.  The body is taken from the
    first of ``<div class="article-body">``, ``<main>`` or ``<body>`` that
    exists in the document.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(title, content)`` tuple of strings.  ``title`` is
        ``"No Title Found"`` when the page has no ``<h1>``; ``content`` is
        ``"No Content Found"`` when none of the candidate containers exists.
        On any failure (network error, HTTP error status, parsing problem)
        the error is printed and ``("Error", "Error")`` is returned so the
        caller can skip the row.
    """
    try:
        response = requests.get(url, timeout=10)
        # A 4xx/5xx response is an error page, not an article; fail here so it
        # is reported and skipped instead of being analysed as content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        heading = soup.find('h1')
        # separator=' ' keeps words from adjacent tags apart; with the default
        # empty separator "<p>foo</p><p>bar</p>" would collapse to "foobar".
        title = heading.get_text(separator=' ', strip=True) if heading else "No Title Found"

        content_div = soup.find('div', class_='article-body') or soup.find('main') or soup.body
        content = content_div.get_text(separator=' ', strip=True) if content_div else "No Content Found"

        return title, content
    except Exception as e:
        # Deliberate catch-all boundary: one bad URL must not abort the batch.
        print(f"Error extracting {url}: {e}")
        return "Error", "Error"

# Metric names in output-column order; shared by the success and error paths
# so the two can never drift apart.
_METRIC_KEYS = (
    "Polarity Score", "Subjectivity Score", "Average Sentence Length", "Percentage of Complex Words",
    "Fog Index", "Word Count", "Complex Word Count", "Syllable Per Word", "Average Word Length",
    "Positive Score", "Negative Score", "Personal Pronouns",
)

# Case-insensitive so sentence-initial forms ("We", "My") are counted; the
# exact upper-case token "US" is filtered out after matching.
_PERSONAL_PRONOUN_RE = re.compile(r'\b(I|we|me|us|my|ours|mine|our)\b', re.IGNORECASE)


def _safe_div(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


def analyze_text(text):
    """Compute sentiment and readability metrics for a piece of text.

    Args:
        text: The article text to analyse.

    Returns:
        A dict with the twelve keys listed in ``_METRIC_KEYS``.  Ratios are 0
        when the text has no words or sentences.  If any step raises, the
        error is printed and every key maps to the string ``"Error"``.
    """
    try:
        tokens = word_tokenize(text.lower())
        # word_tokenize emits punctuation marks as separate tokens; counting
        # them as words inflates the word count and skews every per-word
        # ratio, so keep only tokens with at least one letter or digit.
        words = [token for token in tokens if any(ch.isalnum() for ch in token)]
        sentences = sent_tokenize(text)

        word_count = len(words)
        sentence_count = len(sentences)

        sentiment = TextBlob(text).sentiment

        # A "complex" word is one with three or more syllables.
        complex_word_count = sum(1 for word in words if textstat.syllable_count(word) >= 3)

        # Lexicon lookups use the lower-cased tokens.
        positive_score = sum(1 for word in words if word in positive_words)
        negative_score = sum(1 for word in words if word in negative_words)

        # All-caps "US" is far more likely the country than the pronoun.
        personal_pronouns = sum(
            1 for match in _PERSONAL_PRONOUN_RE.finditer(text) if match.group() != "US"
        )

        return {
            "Polarity Score": sentiment.polarity,
            "Subjectivity Score": sentiment.subjectivity,
            "Average Sentence Length": _safe_div(word_count, sentence_count),
            "Percentage of Complex Words": _safe_div(complex_word_count, word_count) * 100,
            "Fog Index": textstat.gunning_fog(text),
            "Word Count": word_count,
            "Complex Word Count": complex_word_count,
            "Syllable Per Word": _safe_div(textstat.syllable_count(text), word_count),
            "Average Word Length": _safe_div(sum(len(word) for word in words), word_count),
            "Positive Score": positive_score,
            "Negative Score": negative_score,
            "Personal Pronouns": personal_pronouns,
        }
    except Exception as e:
        # Deliberate catch-all: a failure on one article is recorded in its
        # row instead of aborting the whole run.
        print(f"Error analyzing text: {e}")
        return dict.fromkeys(_METRIC_KEYS, "Error")

def main(input_file="/content/Input.xlsx", output_file="/content/Generated_Output.xlsx"):
    """Run the full pipeline: read URLs, scrape, analyse, and save the results.

    Reads an Excel sheet with ``URL_ID`` and ``URL`` columns, extracts each
    article, computes its text metrics and writes one row per successfully
    extracted article to a new Excel file.  Rows with a missing URL or a
    failed extraction are reported and skipped.

    Args:
        input_file: Path of the Excel workbook listing the URLs.
        output_file: Path of the Excel workbook to create.

    Returns:
        None.  Progress and errors are printed; nothing is raised for a bad
        input file, a failed save, or a missing Colab environment.
    """
    try:
        input_data = pd.read_excel(input_file)
    except Exception as e:
        print(f"Error reading input file: {e}")
        return

    results = []

    for _, row in input_data.iterrows():
        url_id = row.get('URL_ID')
        url = row.get('URL')

        # Covers a missing column (None), an empty cell (NaN) and an empty string.
        if not url or pd.isna(url):
            print(f"Invalid URL for URL_ID {url_id}")
            continue

        title, content = extract_article(url)

        # extract_article signals failure with the sentinel string "Error".
        if content == "Error":
            print(f"Skipping URL_ID {url_id} due to extraction issues.")
            continue

        metrics = analyze_text(content)
        metrics['URL_ID'] = url_id
        metrics['Title'] = title  # Keep the extracted title for context
        results.append(metrics)

    result_df = pd.DataFrame(results)

    try:
        result_df.to_excel(output_file, index=False)
        print(f"Analysis saved to {output_file}")
    except Exception as e:
        print(f"Error saving results: {e}")
        return

    # The browser download only exists inside Google Colab.  Anywhere else the
    # file is already saved, so a missing module is not an error.
    try:
        from google.colab import files
    except ImportError:
        return
    files.download(output_file)

# Run the pipeline only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Analysis saved to /content/Generated_Output.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>