# Automated pre-processing of text articles, saving the scores for each article to a .txt file

In [67]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

from nltk.sentiment.vader import SentimentIntensityAnalyzer

import stanza
stanza.download('en')  # Download the English model

from readability import Readability

import spacy
nlp_spacy = spacy.load("en_core_web_sm")

import statistics
import numpy as np
import pandas as pd

import re

import requests
from bs4 import BeautifulSoup

import newspaper

import torch
# Only touch the CUDA API when a GPU is actually usable, so this cell also runs
# on CPU-only machines (the Stanza pipeline below is created with use_gpu=False).
if torch.cuda.is_available():
    # Release cached, currently unused GPU memory left over from earlier runs.
    torch.cuda.empty_cache()
    # NOTE(review): the returned report is discarded, exactly as before; wrap
    # the call in print() if the memory summary is meant to be inspected.
    torch.cuda.memory_summary(device=None, abbreviated=False)

import os

[nltk_data] Downloading package punkt to /home/pierluigi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-29 17:34:46 INFO: Downloading default packages for language: en (English) ...
2023-04-29 17:34:48 INFO: File exists: /home/pierluigi/stanza_resources/en/default.zip
2023-04-29 17:34:53 INFO: Finished downloading models and saved to /home/pierluigi/stanza_resources.


In [68]:
# use_gpu=False makes Stanza run on the CPU instead of the GPU, so the pipeline
# cannot run out of GPU memory.
# NOTE(review): max_split_size_mb looks like a PyTorch CUDA allocator setting
# rather than a Stanza pipeline option - confirm it has any effect here.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,sentiment',
    tokenize_no_ssplit=False,
    max_split_size_mb=15,
    use_gpu=False,
)

2023-04-29 17:34:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-29 17:34:54 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2023-04-29 17:34:54 INFO: Using device: cpu
2023-04-29 17:34:54 INFO: Loading: tokenize
2023-04-29 17:34:54 INFO: Loading: sentiment
2023-04-29 17:34:55 INFO: Done loading processors!


In [69]:
# Load the MPQA lexicon: one clue per line, made of space-separated "key=value" fields
lexicon = pd.read_csv(
    "/home/pierluigi/Documents/echo_chambers_intership/Code analysis/NLP/Single modules/subjclueslen1-HLTEMNLP05.tff",
    sep=" ",
    header=None,
    names=["type", "len", "word", "pos", "stemmed", "polarity", "strength"],
)

# Drop the leading "key=" part of the three fields that are kept
lexicon["type"] = lexicon["type"].str[len("type="):]
lexicon["word"] = lexicon["word"].str[len("word1="):]
lexicon["polarity"] = lexicon["polarity"].str[len("priorpolarity="):]

# Discard the fields that are not used by the analysis
cols_to_remove = ["len", "pos", "stemmed", "strength"]
lexicon = lexicon.drop(columns=cols_to_remove)

# Encode the subjectivity strength as a number (weak = 1, strong = 2)
for label, code in (("weaksubj", 1), ("strongsubj", 2)):
    lexicon["type"] = lexicon["type"].replace(label, code)

# Encode the prior polarity as a number (negative = -1, positive = 1, both/neutral = 0)
for label, code in (("negative", -1), ("positive", 1), ("both", 0), ("neutral", 0)):
    lexicon["polarity"] = lexicon["polarity"].replace(label, code)

In [82]:
def preprocess_article(url):
    """Download one article and compute stop-word, sentiment and readability statistics.

    The article is fetched and parsed with ``newspaper``. Its title, subtitle
    (meta description) and body are joined into a single text, which is split
    into sentences with NLTK. Every per-sentence list in the result, including
    the Stanza sentiment scores, has exactly one entry per NLTK sentence, so the
    lists can be indexed in parallel.

    Args:
        url: Address of the article to download.

    Returns:
        dict: The article title plus
            * stop-word counts and ratios (totals and per-sentence lists),
            * the sentences with stop words removed (``filtered_sentences``),
            * Stanza scores per sentence (``s_sentiment_scores``),
            * Vader neg/neu/pos mean, max, min and standard deviation arrays,
            * MPQA polarity mean, max, min and standard deviation
              (NaN when no lexicon word occurs in the article),
            * the averaged SentiWordNet score,
            * score and grade attributes of each readability metric.

    Raises:
        ValueError: If no sentence can be extracted from the downloaded article.

    NOTE(review): the readability metrics are computed by the ``readability``
    package, which is expected to reject very short texts - confirm the
    behaviour for short articles.
    """
    # Download and parse the article
    article = newspaper.Article(url)
    article.download()
    article.parse()

    # Extract title, subtitle and body; fall back to the meta description
    # when the page metadata carries no "description" entry
    title = article.title.strip()
    description = article.meta_description.strip()
    subtitle = article.meta_data.get("description", "").strip() or description
    text = article.text.strip()

    # Concatenate the extracted strings and split them into sentences
    article_text = f"{title}\n\n{subtitle}\n\n{text}"
    sentences = sent_tokenize(article_text)
    if not sentences:
        # Without this guard the averages below fail with an opaque ZeroDivisionError
        raise ValueError(f"No text could be extracted from {url!r}")

    # Per-sentence accumulators (all of them end up with len(sentences) entries)
    num_stop_words_per_sentence = []
    stop_words_per_sentence = []
    filtered_sentences = []
    num_words_per_sentence = []
    avg_stop_words_per_sentence = []
    s_sentiment_scores = []
    v_scores_list = []
    total_words = 0

    # Loop-invariant helpers: build them once instead of once per sentence
    stop_words = set(stopwords.words('english'))
    analyzer = SentimentIntensityAnalyzer()

    for sentence in sentences:
        # Tokenize the sentence into words
        words = word_tokenize(sentence)
        all_words = len(words)
        total_words += all_words

        # Separate stop words from the remaining words
        stop_words_found = [word for word in words if word.lower() in stop_words]
        filtered_words = [word for word in words if word.lower() not in stop_words]
        all_stop_words = len(stop_words_found)
        sentence_stop_count = all_words - len(filtered_words)

        num_stop_words_per_sentence.append(sentence_stop_count)
        stop_words_per_sentence.append(stop_words_found)
        filtered_sentences.append(" ".join(filtered_words))
        num_words_per_sentence.append(all_words)

        # Share of stop words in this sentence
        avg_stop_words_per_sentence.append(sentence_stop_count / all_words if all_words else 0.0)

        # Vader sentiment of the sentence (negative, neutral, positive shares)
        v_scores = analyzer.polarity_scores(sentence)
        v_scores_list.append([v_scores['neg'], v_scores['neu'], v_scores['pos']])

        # Stanza sentiment of the same sentence. Stanza does its own sentence
        # splitting, so one NLTK sentence may yield several Stanza sentences;
        # their scores are averaged to keep one value per NLTK sentence.
        stanza_scores = [s.sentiment for s in nlp(sentence).sentences]
        # Fall back to 1 (neutral on Stanza's 0/1/2 scale) if nothing was scored
        s_sentiment_scores.append(statistics.mean(stanza_scores) if stanza_scores else 1)

    # Word -> polarity lookup that keeps the first lexicon entry for each word,
    # replacing a full scan of the lexicon for every word of the article
    polarity_by_word = {}
    for lexicon_word, lexicon_polarity in zip(lexicon["word"], lexicon["polarity"]):
        polarity_by_word.setdefault(lexicon_word, lexicon_polarity)

    # MPQA polarity and SentiWordNet score of every whitespace-separated word
    mpqa_scores = []
    sentiment_score = 0
    num_synsets = 0
    for word in article_text.split():
        word = word.strip().lower()
        if word in polarity_by_word:
            mpqa_scores.append(polarity_by_word[word])

        # Only the first WordNet synset of the word is scored
        synsets = wn.synsets(word)
        if synsets:
            senti_synset = swn.senti_synset(synsets[0].name())
            sentiment_score += senti_synset.pos_score() - senti_synset.neg_score()
            num_synsets += 1

    # Stop-word summary statistics
    num_stop_words = sum(num_stop_words_per_sentence)
    num_sentences = len(sentences)
    avg_stop_words_per_sentence_all = num_stop_words / num_sentences
    max_stop_words_per_sentence = max(num_stop_words_per_sentence)
    min_stop_words_per_sentence = min(num_stop_words_per_sentence)
    avg_stop_words_per_word = num_stop_words / total_words if total_words else 0.0

    # Mean of the per-sentence stop-word shares
    avg_stop_words_per_sentence_avg = sum(avg_stop_words_per_sentence) / len(avg_stop_words_per_sentence)

    # Vader scores, aggregated column-wise over all sentences
    v_scores_array = np.array(v_scores_list)
    v_avg_scores = np.mean(v_scores_array, axis=0)
    v_max_scores = np.max(v_scores_array, axis=0)
    v_min_scores = np.min(v_scores_array, axis=0)
    v_std_scores = np.std(v_scores_array, axis=0)

    # MPQA scores; np.max/np.min raise on an empty list, so report NaN instead
    if mpqa_scores:
        mpqa_avg_score = np.mean(mpqa_scores)
        mpqa_max_score = np.max(mpqa_scores)
        mpqa_min_score = np.min(mpqa_scores)
        mpqa_sd_score = np.std(mpqa_scores)
    else:
        mpqa_avg_score = mpqa_max_score = mpqa_min_score = mpqa_sd_score = float("nan")

    # Average SentiWordNet score over the words that have a synset
    sentiwordnet_final_score = sentiment_score / num_synsets if num_synsets > 0 else 0

    # Readability metrics of the whole article
    read = Readability(article_text)
    flesch_kincaid = read.flesch_kincaid()    # Flesch Kincaid Grade Level
    flesch_reading = read.flesch()            # Flesch Reading Ease
    dale_chall = read.dale_chall()            # Dale Chall Readability
    ari = read.ari()                          # Automated Readability Index (ARI)
    coleman_liau = read.coleman_liau()        # Coleman Liau Index
    gunning_fog = read.gunning_fog()          # Gunning Fog
    # SMOG: at least 30 sentences required. Uncomment if needed.
    #smog = read.smog()
    spache = read.spache()                    # SPACHE
    linsear_write = read.linsear_write()      # Linsear Write

    # Return the output
    return {
        'title': title,
        'num_stop_words': num_stop_words,
        'total_words': total_words,
        # NOTE(review): the next two entries hold the values of the LAST
        # sentence only (they are loop variables); kept unchanged for
        # backward compatibility - confirm whether totals were intended.
        'stop_words_found': stop_words_found,
        'all_stop_words': all_stop_words,
        'avg_stop_words_per_sentence_all': avg_stop_words_per_sentence_all,
        'max_stop_words_per_sentence': max_stop_words_per_sentence,
        'min_stop_words_per_sentence': min_stop_words_per_sentence,
        'avg_stop_words_per_word': avg_stop_words_per_word,
        'avg_stop_words_per_sentence': avg_stop_words_per_sentence,
        'avg_stop_words_per_sentence_avg': avg_stop_words_per_sentence_avg,
        'filtered_sentences': filtered_sentences,
        'stop_words_per_sentence': stop_words_per_sentence,
        'num_words_per_sentence': num_words_per_sentence,
        'num_stop_words_per_sentence': num_stop_words_per_sentence,
        's_sentiment_scores': s_sentiment_scores,
        'v_avg_scores': v_avg_scores,
        'v_max_scores': v_max_scores,
        'v_min_scores': v_min_scores,
        'v_std_scores': v_std_scores,
        'mpqa_avg_score': mpqa_avg_score,
        'mpqa_max_score': mpqa_max_score,
        'mpqa_min_score': mpqa_min_score,
        'mpqa_sd_score': mpqa_sd_score,
        'sentiwordnet_final_score': sentiwordnet_final_score,
        'flesch_kincaid_score': flesch_kincaid.score,
        'flesch_kincaid_grade_level': flesch_kincaid.grade_level,
        'flesch_reading_score': flesch_reading.score,
        'flesch_reading_ease': flesch_reading.ease,
        'dale_chall_score': dale_chall.score,
        'dale_chall_grade_levels': dale_chall.grade_levels,
        'ari_score': ari.score,
        'ari_grade_level': ari.grade_levels,
        'ari_ages': ari.ages,
        'coleman_liau_score': coleman_liau.score,
        'coleman_liau_grade_level': coleman_liau.grade_level,
        'gunning_fog_score': gunning_fog.score,
        'gunning_fog_grade_level': gunning_fog.grade_level,
        #'smog_score': smog.score,
        #'smog_grade_level': smog.grade_level,
        'spache_score': spache.score,
        'spache_grade_level': spache.grade_level,
        'linsear_write_score': linsear_write.score,
        'linsear_write_grade_level': linsear_write.grade_level
    }


In [87]:
def preprocess_articles(urls, directory):
    """Analyse every article in ``urls`` and write one text report per article.

    Each URL is processed with ``preprocess_article``. The report is written to
    ``<directory>/<article title>.txt`` (UTF-8) and contains the per-sentence
    stop-word information followed by the article-level stop-word, sentiment
    and readability figures.

    Args:
        urls: Iterable of article URLs.
        directory: Output directory; it is created if it does not exist.

    Returns:
        None. The results are written to disk.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    for url in urls:
        results = preprocess_article(url)

        # A title is free text: path separators and line breaks would break
        # (or redirect) the output path, so replace them before building it.
        safe_title = re.sub(r'[\\/\x00\r\n]+', '_', results["title"]).strip() or "untitled"
        file_path = os.path.join(directory, f"{safe_title}.txt")

        stanza_scores = results['s_sentiment_scores']

        # Explicit encoding: article text regularly contains non-ASCII characters
        with open(file_path, 'w', encoding='utf-8') as f:
            # Save the information for each sentence to the file
            for i, sentence in enumerate(results['filtered_sentences']):
                filtered_words = sentence.split()
                # The Stanza score list is not guaranteed to have an entry for
                # every filtered sentence, so guard the lookup.
                sentence_sentiment = stanza_scores[i] if i < len(stanza_scores) else "n/a"

                f.write(f"Sentence {i+1}: {sentence}\n")
                f.write(f"Total words: {results['num_words_per_sentence'][i]}\n")
                f.write(f"Filtered words: {filtered_words}\n")
                f.write(f"Number of filtered words: {len(filtered_words)}\n")
                f.write(f"Stop words: {results['stop_words_per_sentence'][i]}\n")
                f.write(f"Number of stop words: {results['num_stop_words_per_sentence'][i]}\n")
                f.write(f"Average number of stop words per sentence: {round(results['avg_stop_words_per_sentence'][i], 2)}\n")
                f.write(f"Sentiment score: {sentence_sentiment}\n\n")

            # Save the general statistics on stop words to the file
            f.write(f"Total number of words: {results['total_words']}\n")
            f.write(f"Total number of stop words: {results['num_stop_words']}\n")
            f.write(f"Maximum number of stop words per sentence: {results['max_stop_words_per_sentence']}\n")
            f.write(f"Minimum number of stop words per sentence: {results['min_stop_words_per_sentence']}\n")
            f.write(f"Average number of stop words per article: {round(results['avg_stop_words_per_sentence_avg'], 2)}\n\n")

            # Stanza sentiment scores. sum/len, max and min fail on an empty
            # list and statistics.stdev needs at least two values, so report
            # "n/a" for any figure that is undefined.
            if stanza_scores:
                stanza_avg = sum(stanza_scores) / len(stanza_scores)
                stanza_max = max(stanza_scores)
                stanza_min = min(stanza_scores)
            else:
                stanza_avg = stanza_max = stanza_min = "n/a"
            stanza_std = statistics.stdev(stanza_scores) if len(stanza_scores) > 1 else "n/a"

            f.write(f"Stanza Average of sentiment score for all sentences: {stanza_avg}\n")
            f.write(f"Stanza Maximum sentiment score: {stanza_max}\n")
            f.write(f"Stanza Minimum sentiment score: {stanza_min}\n")
            f.write(f"Stanza Standard deviation: {stanza_std}\n\n")

            # Vader sentiment scores
            f.write(f"Vader average scores: {results['v_avg_scores']}\n")
            f.write(f"Vader maximum scores: {results['v_max_scores']}\n")
            f.write(f"Vader minimum scores: {results['v_min_scores']}\n")
            f.write(f"Vader standard deviation scores: {results['v_std_scores']}\n\n")

            # MPQA sentiment scores
            f.write(f"MPQA average scores: {results['mpqa_avg_score']}\n")
            f.write(f"MPQA maximum scores: {results['mpqa_max_score']}\n")
            f.write(f"MPQA minimum scores: {results['mpqa_min_score']}\n")
            f.write(f"MPQA standard deviation scores: {results['mpqa_sd_score']}\n\n")

            # Sentiword sentiment scores
            f.write(f"Sentiwordnet score: {results['sentiwordnet_final_score']} (from -1 to 1, and score of 0 indicates a neutral sentiment.)\n\n")

            # Flesch_Kincaid scores
            f.write(f"Flesch-Kincaid score: {results['flesch_kincaid_score']}\n")
            f.write(f"The estimated reading level of the article is: {results['flesch_kincaid_grade_level']}\n\n")

            # Flesch Reading ease scores
            f.write(f"Flesch Reading Ease score: {results['flesch_reading_score']}\n")
            f.write(f"The article is classified as: {results['flesch_reading_ease']}\n\n")

            # Dale-Chall scores and the estimated grade levels for comprehension
            f.write(f"Dale-Chall Readability score: {results['dale_chall_score']}\n")
            f.write(f"The estimated comprehension level for different grade levels is: {results['dale_chall_grade_levels']}\n\n")

            # ARI scores
            f.write(f"Automated Readability Index (ARI) score: {results['ari_score']}, which corresponds to a grade level of {results['ari_grade_level']}.\n")
            f.write(f"This means that the text can be read by someone who is around {results['ari_ages']} years old.\n\n")

            # Coleman-Liau scores
            f.write(f"Coleman-Liau Index Score: {results['coleman_liau_score']}\n")
            f.write(f"Estimated Grade Level: {results['coleman_liau_grade_level']}\n\n")

            # Gunning Fog scores
            f.write(f"Gunning Fog score: {results['gunning_fog_score']}\n")
            f.write(f"The estimated grade level for comprehension is: {results['gunning_fog_grade_level']}\n\n")

            # SMOG scores (disabled together with the SMOG computation)
            #f.write(f"SMOG score: {results['smog_score']}. This corresponds to a grade level of {results['smog_grade_level']}.")

            # SPACHE scores
            f.write(f"SPACHE score: {results['spache_score']}\n")
            f.write(f"This corresponds to a grade level of {results['spache_grade_level']}.\n\n")

            # Linsear Write Index scores
            f.write(f"Linsear Write Index score: {results['linsear_write_score']}\n")
            f.write("Approximate grade level equivalent: {}".format(results['linsear_write_grade_level']))

        # Blank lines on stdout after each processed article (unchanged behaviour)
        print("\n\n")



In [88]:
# Articles to analyse; one report file per article is written to the output directory
urls = [
    'https://www.foxnews.com/politics/republicans-respond-after-irs-whistleblower-says-hunter-biden-investigation-being-mishandled',
    'https://news.yahoo.com/alabama-education-director-ousted-over-234450832.html',
    'https://news.yahoo.com/samantha-cameron-remind-david-steer-050000235.html',
]

preprocess_articles(urls, directory='preprocessed articles')










