<a href="https://colab.research.google.com/github/ishan654321/News-Article-Text-Analysis/blob/main/News_Article_Text_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install requests beautifulsoup4 textblob nltk textstat pandas
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.4


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import textstat
import re

# Download necessary NLTK data.
# 'punkt' serves older NLTK releases; NLTK 3.9+ loads the tokenizer models from
# 'punkt_tab' instead, so both are fetched to keep sent_tokenize/word_tokenize
# working on a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stopword set used to filter tokens during text analysis.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def scrape_article_text(url, timeout=10):
    """
    Scrape article body text from a given URL.

    Assumes the article text lives in <div> elements carrying the attribute
    data-component="text-block"; the text of all such elements is joined with
    single spaces.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up (passed to
            requests.get). Without a timeout a single stalled server would
            block the caller indefinitely.

    Returns:
        The joined article text (an empty string when no matching elements are
        found), or None if the request fails, times out, or the response
        status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Report the failure instead of dropping the URL silently.
            print(f"Error scraping {url}: HTTP status {response.status_code}")
            return None
        soup = BeautifulSoup(response.text, 'html.parser')

        paragraphs = soup.find_all('div', {'data-component': 'text-block'})
        return " ".join(para.get_text() for para in paragraphs)
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller skip this URL.
        print(f"Error scraping {url}: {e}")
        return None

def text_analysis(article_text):
    """
    Compute word, sentence, sentiment and readability metrics for one article.

    Args:
        article_text: Raw article text.

    Returns:
        None when article_text is empty or missing; otherwise a dict with the
        keys 'total_words', 'avg_words_per_sentence', 'polarity',
        'subjectivity', 'fog_index', 'complex_word_count' and
        'avg_complex_words_per_sentence'. Word-based counts only consider
        alphabetic tokens that are not in the module-level stop_words set.
    """
    if not article_text:
        return None

    # Split the raw text into sentences and word tokens.
    sentence_list = sent_tokenize(article_text)
    tokens = word_tokenize(article_text)
    sentence_count = len(sentence_list)

    # Keep purely alphabetic tokens that are not stopwords.
    content_words = []
    for token in tokens:
        if token.isalpha() and token.lower() not in stop_words:
            content_words.append(token)
    word_total = len(content_words)

    # Sentiment of the unfiltered text (TextBlob).
    sentiment = TextBlob(article_text).sentiment

    # A word counts as complex when it has three or more syllables.
    complex_total = sum(
        1 for token in content_words if textstat.syllable_count(token) >= 3
    )

    # Per-sentence averages fall back to 0 when no sentence was detected.
    if sentence_count:
        words_per_sentence = word_total / sentence_count
        complex_per_sentence = complex_total / sentence_count
    else:
        words_per_sentence = 0
        complex_per_sentence = 0

    # Gunning fog readability score of the unfiltered text.
    fog = textstat.gunning_fog(article_text)

    metrics = {}
    metrics['total_words'] = word_total
    metrics['avg_words_per_sentence'] = words_per_sentence
    metrics['polarity'] = sentiment.polarity
    metrics['subjectivity'] = sentiment.subjectivity
    metrics['fog_index'] = fog
    metrics['complex_word_count'] = complex_total
    metrics['avg_complex_words_per_sentence'] = complex_per_sentence
    return metrics

def process_articles(csv_file, output_file='article_analysis_results.csv'):
    """
    Scrape and analyse every article listed in a CSV file, then save the metrics.

    Args:
        csv_file: Path to a CSV file with a 'url' column holding one article
            URL per row.
        output_file: Path of the CSV file the per-article metrics are written
            to (one row per successfully analysed URL).

    Raises:
        KeyError: If the input CSV has no 'url' column.

    URLs that cannot be scraped, or that yield no text, are skipped.
    """
    # Load URLs from CSV and fail early, with a clear message, on a bad schema.
    urls_df = pd.read_csv(csv_file)
    if 'url' not in urls_df.columns:
        raise KeyError(f"'url' column not found in {csv_file}")

    # Blank cells are read as NaN; drop them (and whitespace-only entries) so
    # they are never handed to the scraper.
    urls = urls_df['url'].dropna().astype(str).str.strip()
    urls = urls[urls != ""]

    # One metrics dict per successfully analysed article.
    results = []
    for url in urls:
        article_text = scrape_article_text(url)
        if not article_text:
            continue
        analysis = text_analysis(article_text)
        if analysis:
            analysis['url'] = url
            results.append(analysis)

    # Build the DataFrame once from the collected records and save it.
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, index=False)
    print(f"Analysis completed and saved to '{output_file}'")


In [None]:
# Path to your CSV file containing URLs.
# process_articles reads the article addresses from a column named 'url'.
# NOTE(review): '/content' looks like the Colab working directory -- adjust the
# path when running in another environment.
csv_file = '/content/news_articles.csv'

# Scrape and analyse every listed article; by default the metrics are written to
# 'article_analysis_results.csv' in the current working directory.
process_articles(csv_file)

Analysis completed and saved to 'article_analysis_results.csv'
