# Automated pre-processing of text articles, saving the scores for each article to its own output files

In [99]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

import stanza
stanza.download('en')  # Download the English model

from readability import Readability

import spacy
nlp_spacy = spacy.load("en_core_web_sm")

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis
from gensim.models import Phrases
from gensim.models.phrases import Phraser

import statistics
import numpy as np
import pandas as pd

import csv
from tabulate import tabulate
import newspaper

import torch
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)

import os

[nltk_data] Downloading package punkt to /home/pierluigi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-03 17:21:50 INFO: Downloading default packages for language: en (English) ...
2023-05-03 17:21:52 INFO: File exists: /home/pierluigi/stanza_resources/en/default.zip
2023-05-03 17:21:58 INFO: Finished downloading models and saved to /home/pierluigi/stanza_resources.


In [100]:
# Build the Stanza pipeline used for sentence-level sentiment (tokenize + sentiment processors only).
# use_gpu=False makes Stanza run on the CPU instead of the GPU, so it cannot run out of GPU memory.
# NOTE(review): max_split_size_mb looks like a PyTorch CUDA-allocator setting rather than a
# Stanza Pipeline option -- confirm that it has any effect here.
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment', tokenize_no_ssplit=False, max_split_size_mb=15, use_gpu=False)

2023-05-03 17:21:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-03 17:21:59 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2023-05-03 17:21:59 INFO: Using device: cpu
2023-05-03 17:21:59 INFO: Loading: tokenize
2023-05-03 17:21:59 INFO: Loading: sentiment
2023-05-03 17:22:00 INFO: Done loading processors!


In [101]:
# Load the MPQA lexicon: one clue per line, made of space-separated "key=value" fields.
lexicon = pd.read_csv(
    "/home/pierluigi/Documents/echo_chambers_intership/Code analysis/NLP/Single modules/subjclueslen1-HLTEMNLP05.tff",
    sep=" ",
    header=None,
    names=["type", "len", "word", "pos", "stemmed", "polarity", "strength"],
)

# Drop the "key=" prefix from the three fields that are kept.
lexicon["type"] = lexicon["type"].str.slice(5)
lexicon["word"] = lexicon["word"].str.slice(len("word1="))
lexicon["polarity"] = lexicon["polarity"].str.slice(len("priorpolarity="))

# The remaining fields are not used by the analysis.
cols_to_remove = ["len", "pos", "stemmed", "strength"]
lexicon = lexicon.drop(cols_to_remove, axis=1)

# Encode subjectivity strength (weak=1, strong=2) and prior polarity (-1 / 0 / 1) as numbers.
lexicon["type"] = lexicon["type"].replace({"weaksubj": 1, "strongsubj": 2})
lexicon["polarity"] = lexicon["polarity"].replace(
    {"negative": -1, "positive": 1, "both": 0, "neutral": 0}
)

In [102]:
def get_article_info(url):
    """Download and parse the article at ``url`` with newspaper.

    Returns a 3-tuple ``(article_text, title, text)`` where ``article_text``
    is the title, the subtitle and the body joined by blank lines. The
    subtitle is the page's "description" metadata entry, falling back to
    the article's meta description when that entry is empty.
    """
    page = newspaper.Article(url)
    page.download()
    page.parse()

    # Pull out the individual pieces, trimmed of surrounding whitespace.
    title = page.title.strip()
    subtitle = page.meta_data.get("description", "").strip()
    description = page.meta_description.strip()
    text = page.text.strip()

    # Fall back to the meta description when no subtitle was found.
    subtitle = subtitle or description

    article_text = "\n\n".join([title, subtitle, text])
    return article_text, title, text

In [103]:
def preprocess_text(article):
    """Split an article into sentences and collect stop-word / adjective statistics.

    Parameters
    ----------
    article : str
        Full article text.

    Returns
    -------
    tuple
        A 14-element tuple, in this order:
        ``sentences`` (list of str), ``filtered_words`` (all non-stop-word
        tokens of the whole article, in order), ``filtered_sentences`` (each
        sentence with stop words removed, tokens joined by spaces),
        ``stop_words_per_sentence`` (list of lists),
        ``num_stop_words_per_sentence``, ``avg_stop_words_per_sentence`` (the
        stop-word *ratio* of each sentence), ``total_words``,
        ``num_stop_words``, ``max_stop_words_per_sentence``,
        ``min_stop_words_per_sentence``, ``avg_stop_words_per_sentence_avg``
        (mean of the per-sentence ratios), ``num_words_per_sentence``,
        ``total_adjectives`` and ``avg_adjectives`` (adjectives / tokens).

    Notes
    -----
    ``filtered_words`` previously held only the tokens of the *last*
    sentence because it was reassigned on every iteration; it now
    accumulates the tokens of every sentence. For an article without any
    sentence all counts and ratios are 0 instead of raising.
    """
    # Tokenize the text into sentences
    sentences = sent_tokenize(article)

    # The stop-word list is the same for every sentence, so build it once.
    stop_words = set(stopwords.words('english'))

    num_stop_words_per_sentence = []
    stop_words_per_sentence = []
    filtered_sentences = []
    filtered_words = []
    num_words_per_sentence = []
    avg_stop_words_per_sentence = []
    total_words = 0
    total_adjectives = 0

    for sentence in sentences:
        # Tokenize the sentence into words
        words = word_tokenize(sentence)
        all_words = len(words)
        total_words += all_words

        # Separate stop words from the words that are kept
        stop_words_found = [word for word in words if word.lower() in stop_words]
        kept_words = [word for word in words if word.lower() not in stop_words]
        filtered_words.extend(kept_words)

        # Per-sentence bookkeeping
        num_stop_words = len(stop_words_found)
        num_stop_words_per_sentence.append(num_stop_words)
        stop_words_per_sentence.append(stop_words_found)
        filtered_sentences.append(" ".join(kept_words))
        num_words_per_sentence.append(all_words)

        # Share of stop words in this sentence (guarded against an empty token list)
        avg_stop_words_per_sentence.append(num_stop_words / all_words if all_words else 0.0)

        # POS tagging: every tag starting with 'JJ' is counted as an adjective
        tagged_words = pos_tag(words)
        total_adjectives += sum(1 for _, tag in tagged_words if tag.startswith('JJ'))

    # Summary statistics over the whole article
    num_stop_words = sum(num_stop_words_per_sentence)
    max_stop_words_per_sentence = max(num_stop_words_per_sentence, default=0)
    min_stop_words_per_sentence = min(num_stop_words_per_sentence, default=0)

    # Mean of the per-sentence stop-word ratios
    if avg_stop_words_per_sentence:
        avg_stop_words_per_sentence_avg = sum(avg_stop_words_per_sentence) / len(avg_stop_words_per_sentence)
    else:
        avg_stop_words_per_sentence_avg = 0.0

    # Adjectives per token
    avg_adjectives = total_adjectives / total_words if total_words else 0.0

    return sentences, filtered_words, filtered_sentences, stop_words_per_sentence, num_stop_words_per_sentence, avg_stop_words_per_sentence, total_words, num_stop_words, max_stop_words_per_sentence, min_stop_words_per_sentence, avg_stop_words_per_sentence_avg, num_words_per_sentence, total_adjectives, avg_adjectives

In [104]:
def stanza_sentiment_analysis(text):
    """Run the module-level Stanza pipeline ``nlp`` on ``text``.

    Returns a list with the ``sentiment`` attribute of every sentence that
    Stanza found, in document order.
    """
    parsed = nlp(text)
    return [stanza_sentence.sentiment for stanza_sentence in parsed.sentences]

In [105]:
def vader_sentiment_analysis(sentences):
    """Score every sentence with VADER and summarise the scores column-wise.

    Each sentence contributes one ``[neg, neu, pos]`` row. The function
    returns four arrays (mean, maximum, minimum, standard deviation), each
    computed down the rows so that it holds one value per score type.
    """
    vader = SentimentIntensityAnalyzer()

    score_rows = []
    for sentence in sentences:
        polarity = vader.polarity_scores(sentence)
        score_rows.append([polarity[key] for key in ('neg', 'neu', 'pos')])

    # One row per sentence, one column per score type.
    score_matrix = np.array(score_rows)

    return (
        score_matrix.mean(axis=0),
        score_matrix.max(axis=0),
        score_matrix.min(axis=0),
        score_matrix.std(axis=0),
    )


In [106]:
def mpqa_sentiment_analysis(article, lexicon_df=None):
    """Summarise the MPQA prior polarity of the words in ``article``.

    Parameters
    ----------
    article : str
        Text to score. It is split on whitespace and lower-cased; punctuation
        attached to a word is not removed, so such tokens only match if the
        lexicon contains them in exactly that form.
    lexicon_df : pandas.DataFrame, optional
        Lexicon with a ``word`` and a numeric ``polarity`` column. Defaults to
        the module-level ``lexicon``.

    Returns
    -------
    tuple
        ``(mean, max, min, standard deviation)`` of the polarities of all
        matched tokens, or ``(0.0, 0, 0, 0.0)`` when no token is in the
        lexicon (previously this case raised inside ``np.max``).
    """
    if lexicon_df is None:
        lexicon_df = lexicon

    # Build the lookup once instead of scanning the lexicon for every token.
    # setdefault keeps the first entry of a repeated word, which is what the
    # former ``.values[0]`` lookup selected.
    polarity_by_word = {}
    for entry, polarity in zip(lexicon_df["word"], lexicon_df["polarity"]):
        polarity_by_word.setdefault(entry, polarity)

    mpqa_scores = []
    for raw_word in article.split():
        word = raw_word.lower()
        if word in polarity_by_word:
            mpqa_scores.append(polarity_by_word[word])

    if not mpqa_scores:
        return 0.0, 0, 0, 0.0

    # MPQA scores
    mpqa_avg_score = np.mean(mpqa_scores)
    mpqa_max_score = np.max(mpqa_scores)
    mpqa_min_score = np.min(mpqa_scores)
    mpqa_sd_score = np.std(mpqa_scores)

    return mpqa_avg_score, mpqa_max_score, mpqa_min_score, mpqa_sd_score

In [107]:
def sentiwordnet_sentiment_analysis(article):
    """Average SentiWordNet polarity of the whitespace-separated words in ``article``.

    For every word that has at least one WordNet synset, the first synset is
    looked up in SentiWordNet and its positive score minus its negative score
    is accumulated. The result is that sum divided by the number of words that
    had a synset, or 0 when none did.
    """
    polarity_sum = 0
    scored_words = 0

    for token in article.split():
        candidates = wn.synsets(token)
        if not candidates:
            continue
        # Only the first synset returned for the word is considered.
        first_sense = swn.senti_synset(candidates[0].name())
        polarity_sum += first_sense.pos_score() - first_sense.neg_score()
        scored_words += 1

    return polarity_sum / scored_words if scored_words > 0 else 0


In [108]:
def readability_analysis(article):
    """Compute a set of readability measures for ``article``.

    Returns a dict mapping a metric key to the object returned by the
    matching ``Readability`` method. SMOG is left out on purpose: it
    requires at least 30 sentences.
    """
    scorer = Readability(article)

    # (key in the returned dict, Readability method to call), evaluated in this order.
    measures = (
        ('flesch_kincaid', 'flesch_kincaid'),   # Flesch Kincaid
        ('flesch_reading', 'flesch'),           # Flesch Reading Ease
        ('dale_chall', 'dale_chall'),           # Dale Chall Readability
        ('ari', 'ari'),                         # Automated Readability Index (ARI)
        ('coleman_liau', 'coleman_liau'),       # Coleman Liau Index
        ('gunning_fog', 'gunning_fog'),         # Gunning Fog
        ('spache', 'spache'),                   # SPACHE
        ('linsear_write', 'linsear_write'),     # Linsear Write
    )

    return {key: getattr(scorer, method_name)() for key, method_name in measures}


In [109]:
def gensim_lda_algorithm(filtered_words, num_topics=20, passes=10, random_state=100):
    """Fit a Gensim LDA model on the lemmatized bigrams of ``filtered_words``.

    Every consecutive word pair is lemmatized and treated as one two-word
    document of the LDA corpus.

    Parameters
    ----------
    filtered_words : list of str
        Tokens (stop words already removed) in reading order.
    num_topics : int, optional
        Number of LDA topics (default 20, the value used so far).
    passes : int, optional
        Number of training passes over the corpus (default 10).
    random_state : int, optional
        Seed handed to the LDA model for reproducible results (default 100).

    Returns
    -------
    tuple
        ``(lda_model, perplexity_lda, coherence_lda)`` where ``perplexity_lda``
        is the value of ``lda_model.log_perplexity(corpus)`` and
        ``coherence_lda`` is the ``c_v`` coherence of the model.

    Raises
    ------
    ValueError
        If ``filtered_words`` has fewer than two words, so no bigram exists.
    """
    lemmatizer = WordNetLemmatizer()

    lemmatized_bigrams = [
        [lemmatizer.lemmatize(first), lemmatizer.lemmatize(second)]
        for first, second in nltk.bigrams(filtered_words)
    ]
    if not lemmatized_bigrams:
        raise ValueError("gensim_lda_algorithm needs at least two words to build a bigram corpus")

    # Create Dictionary
    id2word = corpora.Dictionary(lemmatized_bigrams)

    # Create Corpus: one bag-of-words per bigram
    corpus = [id2word.doc2bow(bigram) for bigram in lemmatized_bigrams]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=random_state,
                                           update_every=1,
                                           chunksize=100,
                                           passes=passes,
                                           alpha='auto',
                                           per_word_topics=True)

    # Compute perplexity
    # NOTE(review): Gensim documents log_perplexity as a per-word likelihood bound,
    # not the perplexity itself -- confirm how this value is meant to be reported.
    perplexity_lda = lda_model.log_perplexity(corpus)

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized_bigrams, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    return lda_model, perplexity_lda, coherence_lda


In [110]:
def walk_tree(node, depth):
    """Return the depth of the deepest descendant of ``node``.

    ``depth`` is the depth of ``node`` itself. A node whose ``n_lefts`` and
    ``n_rights`` add up to zero is treated as a leaf and returns ``depth``
    unchanged; otherwise the largest value found among its children (each
    explored at ``depth + 1``) is returned.
    """
    is_leaf = not (node.n_lefts + node.n_rights > 0)
    if is_leaf:
        return depth

    deepest = depth
    for child in node.children:
        child_depth = walk_tree(child, depth + 1)
        if child_depth > deepest:
            deepest = child_depth
    return deepest
    
def build_dependency_tree(article):
    """Measure the dependency-tree depth of every sentence in ``article``.

    The text is parsed with the module-level spaCy pipeline ``nlp_spacy`` and
    each sentence's depth is obtained with ``walk_tree`` starting at the
    sentence root.

    Returns
    -------
    tuple
        ``(tree_lengths, max_depth, max_depth_words, avg_length, max_length,
        min_length)``:

        * ``tree_lengths`` -- dict mapping the stripped sentence text to its depth;
        * ``max_depth`` -- largest depth of any sentence;
        * ``max_depth_words`` -- root words of the sentences reaching
          ``max_depth`` (each word listed once, in order of appearance);
        * ``avg_length``, ``max_length``, ``min_length`` -- mean, maximum and
          minimum of the per-sentence depths.

        For a text without sentences the result is ``({}, 0, [], 0.0, 0, 0)``.

    Notes
    -----
    The statistics are computed from one entry per sentence. They used to be
    read back from dicts keyed by root word / sentence text, so sentences
    sharing a root word or an identical text overwrote each other and could
    lower the reported maximum or distort the average.
    """
    doc = nlp_spacy(article)

    tree_lengths = {}
    sentence_depths = []
    root_words = []
    for sent in doc.sents:
        root = sent.root
        depth = walk_tree(root, 0)
        sentence_depths.append(depth)
        root_words.append(root.orth_)
        tree_lengths[sent.text.strip()] = depth

    if not sentence_depths:
        return tree_lengths, 0, [], 0.0, 0, 0

    max_depth = max(sentence_depths)
    # dict.fromkeys removes repeated root words while keeping their order.
    max_depth_words = list(dict.fromkeys(
        word for word, depth in zip(root_words, sentence_depths) if depth == max_depth
    ))

    avg_length = sum(sentence_depths) / len(sentence_depths)
    max_length = max_depth
    min_length = min(sentence_depths)
    return tree_lengths, max_depth, max_depth_words, avg_length, max_length, min_length

In [111]:
def process_article(url):
    """Fetch the article at ``url`` and run every analysis on it.

    Returns one flat dict holding the article title together with the
    stop-word / adjective statistics, the Stanza, VADER, MPQA and SentiWordNet
    sentiment results, the readability metrics, the LDA model with its
    perplexity and coherence, and the dependency-tree statistics.

    NOTE(review): Stanza is run on the article body (``text``) while the
    per-sentence lists come from NLTK on title + subtitle + body
    (``article``), so ``s_sentiment_scores[i]`` is not guaranteed to belong to
    ``filtered_sentences[i]`` -- confirm whether that is intended.
    """
    article, title, text = get_article_info(url)

    (sentences, filtered_words, filtered_sentences, stop_words_per_sentence,
     num_stop_words_per_sentence, avg_stop_words_per_sentence, total_words,
     num_stop_words, max_stop_words_per_sentence, min_stop_words_per_sentence,
     avg_stop_words_per_sentence_avg, num_words_per_sentence, total_adjectives,
     avg_adjectives) = preprocess_text(article)

    # Stop-word and POS statistics
    report = {
        'title': title,
        'num_stop_words': num_stop_words,
        'total_words': total_words,
        'max_stop_words_per_sentence': max_stop_words_per_sentence,
        'min_stop_words_per_sentence': min_stop_words_per_sentence,
        'avg_stop_words_per_sentence': avg_stop_words_per_sentence,
        'avg_stop_words_per_sentence_avg': avg_stop_words_per_sentence_avg,
        'filtered_sentences': filtered_sentences,
        'stop_words_per_sentence': stop_words_per_sentence,
        'num_words_per_sentence': num_words_per_sentence,
        'num_stop_words_per_sentence': num_stop_words_per_sentence,
        'total_adjectives': total_adjectives,
        'avg_adjectives': avg_adjectives,
    }

    # Sentiment analyses
    report['s_sentiment_scores'] = stanza_sentiment_analysis(text)

    vader_avg, vader_max, vader_min, vader_std = vader_sentiment_analysis(sentences)
    report['v_avg_scores'] = vader_avg
    report['v_max_scores'] = vader_max
    report['v_min_scores'] = vader_min
    report['v_std_scores'] = vader_std

    mpqa_avg, mpqa_max, mpqa_min, mpqa_sd = mpqa_sentiment_analysis(article)
    report['mpqa_avg_score'] = mpqa_avg
    report['mpqa_max_score'] = mpqa_max
    report['mpqa_min_score'] = mpqa_min
    report['mpqa_sd_score'] = mpqa_sd

    report['sentiwordnet_final_score'] = sentiwordnet_sentiment_analysis(article)

    # Readability
    report['metrics'] = readability_analysis(article)

    # Topic model
    topic_model, topic_perplexity, topic_coherence = gensim_lda_algorithm(filtered_words)
    report['lda_model'] = topic_model
    report['perplexity_lda'] = topic_perplexity
    report['coherence_lda'] = topic_coherence

    # Dependency trees
    (tree_lengths, max_depth, max_depth_words,
     avg_length, max_length, min_length) = build_dependency_tree(article)
    report['tree_lengths'] = tree_lengths
    report['max_depth'] = max_depth
    report['max_depth_words'] = max_depth_words
    report['avg_length'] = avg_length
    report['max_length'] = max_length
    report['min_length'] = min_length

    return report

In [112]:
def calculate_scores_old(urls, directory):
    """Process every URL and write one plain-text report per article.

    Parameters
    ----------
    urls : iterable of str
        Article URLs, each handed to ``process_article``.
    directory : str
        Output directory; created if missing. The report of an article is
        written to ``<directory>/<title>.txt`` (path separators and NUL
        characters inside the title are replaced by ``_``).

    Notes
    -----
    * The report is written as UTF-8, like the CSV files of ``calculate_scores``.
    * When Stanza returned fewer scores than there are sentences, the missing
      per-sentence score is written as ``n/a`` instead of raising IndexError.
    * Stanza summary values that cannot be computed (no scores, or fewer than
      two scores for the standard deviation) are written as ``n/a``.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    for url in urls:
        results = process_article(url)

        # A title containing a path separator would otherwise point into a
        # (possibly non-existent) sub-directory.
        safe_title = results["title"].replace('/', '_').replace('\\', '_').replace('\0', '_')
        if safe_title in ('', '.', '..'):
            safe_title = 'untitled'

        # Write preprocessed article to a separate file for each URL
        file_path = f'{directory}/{safe_title}.txt'

        s_scores = results['s_sentiment_scores']
        s_avg = sum(s_scores) / len(s_scores) if s_scores else 'n/a'
        s_max = max(s_scores) if s_scores else 'n/a'
        s_min = min(s_scores) if s_scores else 'n/a'
        # statistics.stdev needs at least two data points.
        s_std = statistics.stdev(s_scores) if len(s_scores) > 1 else 'n/a'

        with open(file_path, 'w', encoding='utf-8') as f:
            # Save the information for each sentence to the file
            for i, sentence in enumerate(results['filtered_sentences']):
                sentence_score = s_scores[i] if i < len(s_scores) else 'n/a'
                f.write(f"Sentence {i+1}: {sentence}\n")
                f.write(f"Total words: {results['num_words_per_sentence'][i]}\n")
                f.write(f"Filtered words: {sentence.split()}\n")
                f.write(f"Number of filtered words: {len(sentence.split())}\n")
                f.write(f"Stop words: {results['stop_words_per_sentence'][i]}\n")
                f.write(f"Number of stop words: {results['num_stop_words_per_sentence'][i]}\n")
                f.write(f"Average number of stop words per sentence: {round(results['avg_stop_words_per_sentence'][i], 2)}\n")
                f.write(f"Sentiment score: {sentence_score}\n\n")

            # Save the general statistics on stop words to the file
            f.write(f"Total number of words: {results['total_words']}\n")
            f.write(f"Total number of stop words: {results['num_stop_words']}\n")
            f.write(f"Maximum number of stop words per sentence: {results['max_stop_words_per_sentence']}\n")
            f.write(f"Minimum number of stop words per sentence: {results['min_stop_words_per_sentence']}\n")
            f.write(f"Average number of stop words per article: {round(results['avg_stop_words_per_sentence_avg'], 2)}\n")

            # Print POS tagging operations
            f.write(f"Total adjectives: {results['total_adjectives']}\n")
            f.write(f"Average number of adjectives in the article: {results['avg_adjectives']:.2f}\n\n")

            # Stanza sentiment scores
            f.write(f"Stanza Average of sentiment score for all sentences: {s_avg}\n")
            f.write(f"Stanza Maximum sentiment score: {s_max}\n")
            f.write(f"Stanza Minimum sentiment score: {s_min}\n")
            f.write(f"Stanza Standard deviation: {s_std}\n\n")

            # Vader sentiment scores
            f.write(f"Vader average scores: {results['v_avg_scores']}\n")
            f.write(f"Vader maximum scores: {results['v_max_scores']}\n")
            f.write(f"Vader minimum scores: {results['v_min_scores']}\n")
            f.write(f"Vader standard deviation scores: {results['v_std_scores']}\n\n")

            # MPQA sentiment scores
            f.write(f"MPQA average scores: {results['mpqa_avg_score']}\n")
            f.write(f"MPQA maximum scores: {results['mpqa_max_score']}\n")
            f.write(f"MPQA minimum scores: {results['mpqa_min_score']}\n")
            f.write(f"MPQA standard deviation scores: {results['mpqa_sd_score']}\n\n")

            # Sentiword sentiment scores
            f.write(f"Sentiwordnet score: {results['sentiwordnet_final_score']} (from -1 to 1, and score of 0 indicates a neutral sentiment.)\n\n")

            # Flesch_Kincaid scores
            f.write(f"Flesch-Kincaid score: {results['metrics']['flesch_kincaid'].score}\n")
            f.write(f"The estimated reading level of the article is: {results['metrics']['flesch_kincaid'].grade_level}\n\n")

            # Flesch Reading ease scores
            f.write(f"Flesch Reading Ease score: {results['metrics']['flesch_reading'].score}\n")
            f.write(f"The article is classified as: {results['metrics']['flesch_reading'].ease}\n\n")

            # Print the Dale-Chall scores
            f.write(f"Dale-Chall Readability score: {results['metrics']['dale_chall'].score}\n")
            # Print the estimated grade levels for comprehension
            f.write(f"The estimated comprehension level for different grade levels is: {results['metrics']['dale_chall'].grade_levels}\n\n")

            # Print the ARI scores
            f.write(f"Automated Readability Index (ARI) score: {results['metrics']['ari'].score}, which corresponds to a grade level of {results['metrics']['ari'].grade_levels}.\n")
            f.write(f"This means that the text can be read by someone who is around {results['metrics']['ari'].ages} years old.\n\n")

            # Print the Coleman-Liau scores
            f.write(f"Coleman-Liau Index Score: {results['metrics']['coleman_liau'].score}\n")
            f.write(f"Estimated Grade Level: {results['metrics']['coleman_liau'].grade_level}\n\n")

            # Print the Gunning Fog scores
            f.write(f"Gunning Fog score: {results['metrics']['gunning_fog'].score}\n")
            f.write(f"The estimated grade level for comprehension is: {results['metrics']['gunning_fog'].grade_level}\n\n")

            # SMOG is not reported: readability_analysis does not compute it.

            # Print the SPACHE scores
            f.write(f"SPACHE score: {results['metrics']['spache'].score}\n")
            f.write(f"This corresponds to a grade level of {results['metrics']['spache'].grade_level}.\n\n")

            # Print the Linsear Write Index scores
            f.write(f"Linsear Write Index score: {results['metrics']['linsear_write'].score}\n")
            f.write("Approximate grade level equivalent: {}\n\n".format(results['metrics']['linsear_write'].grade_level))

            # Gensim-LDA analysis
            f.write(f"Perplexity (how well the LDA model predicts the corpus) of the article: {results['perplexity_lda']}\n")
            f.write(f"Coherence (how coherent the topics are) of the article: {results['coherence_lda']}\n\n")

            # Dependency tree height
            f.write(f"Max tree depth: {results['max_depth']}\n")
            f.write(f"Words at max depth: {', '.join(results['max_depth_words'])}\n")
            f.write(f"Average tree length: {results['avg_length']:.2f}\n")
            f.write(f"Maximum tree length: {results['max_length']}\n")
            f.write(f"Minimum tree length: {results['min_length']}\n\n")

In [113]:
def _safe_article_dirname(title):
    """Turn an article title into a name that is safe to use as one directory."""
    cleaned = title.replace('/', '_').replace('\\', '_').replace('\0', '_')
    # '', '.' and '..' would resolve to the output directory itself or its parent.
    return cleaned if cleaned not in ('', '.', '..') else 'untitled'


def _append_csv_and_print(csv_path, fieldnames, rows):
    """Append ``rows`` to ``csv_path`` (header only for a new file), then print the whole file as a table."""
    with open(csv_path, mode='a', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        # In append mode tell() is 0 only when the file was empty or just created.
        if handle.tell() == 0:
            writer.writeheader()
        writer.writerows(rows)

    with open(csv_path, mode='r', newline='', encoding='utf-8') as handle:
        table = list(csv.DictReader(handle))

    print(tabulate(table, headers='keys'))


def calculate_scores(urls, directory):
    """Process every URL and store its scores as CSV files, printing each file.

    For each article a sub-directory ``<directory>/<title>`` is created
    (path separators and NUL characters in the title are replaced by ``_``) and
    six CSV files are appended to: ``sentences_info.csv``,
    ``general_stats.csv``, ``pos_tagging.csv``, ``sentiment_scores.csv``,
    ``readability_scores.csv`` and ``dependency_tree_scores.csv``. After each
    write the complete file is read back and printed as a table.

    Parameters
    ----------
    urls : iterable of str
        Article URLs, each handed to ``process_article``.
    directory : str
        Root output directory; created if missing.

    Notes
    -----
    * Files are opened in append mode, so running the same URL again adds rows.
    * ``dependency_tree_scores.csv`` no longer gets the stray, always-empty
      ``', '`` column. Files written earlier still carry it in their header and
      should be deleted before new rows are appended to them.
    * Stanza values that cannot be computed (a sentence without a matching
      score, no scores at all, fewer than two scores for the standard
      deviation) are written as empty cells instead of raising.
    """
    # Preprocessed directory
    os.makedirs(directory, exist_ok=True)

    for url in urls:
        results = process_article(url)
        metrics = results['metrics']
        s_scores = results['s_sentiment_scores']

        # Create a directory for the article
        article_directory = os.path.join(directory, _safe_article_dirname(results["title"]))
        os.makedirs(article_directory, exist_ok=True)

        # One row per sentence
        sentence_info = []
        for i, sentence in enumerate(results['filtered_sentences']):
            sentence_info.append({
                'Sentence': f'Sentence {i+1}',
                'Total words': results['num_words_per_sentence'][i],
                'Filtered words': sentence.split(),
                'Number of filtered words': len(sentence.split()),
                'Stop words': results['stop_words_per_sentence'][i],
                'Number of stop words': results['num_stop_words_per_sentence'][i],
                'Average number of stop words per sentence': round(results['avg_stop_words_per_sentence'][i], 2),
                # Stanza splits sentences on its own, so a score may be missing.
                'Sentiment score': s_scores[i] if i < len(s_scores) else None
            })
        sentence_fields = ['Sentence', 'Total words', 'Filtered words', 'Number of filtered words', 'Stop words', 'Number of stop words', 'Average number of stop words per sentence', 'Sentiment score']

        # General statistics on stop words
        general_stats = {
            'Total number of words': results['total_words'],
            'Total number of stop words': results['num_stop_words'],
            'Maximum number of stop words per sentence': results['max_stop_words_per_sentence'],
            'Minimum number of stop words per sentence': results['min_stop_words_per_sentence'],
            'Average number of stop words per article': round(results['avg_stop_words_per_sentence_avg'], 2)
        }

        # POS tagging operations
        pos_tagging_ops = {
            'Total adjectives': results['total_adjectives'],
            'Average number of adjectives in the article': round(results['avg_adjectives'], 2)
        }

        # Sentiment scores
        sentiment_scores = {
            'Stanza Average of sentiment score for all sentences': sum(s_scores) / len(s_scores) if s_scores else None,
            'Stanza Maximum sentiment score': max(s_scores) if s_scores else None,
            'Stanza Minimum sentiment score': min(s_scores) if s_scores else None,
            # statistics.stdev needs at least two data points.
            'Stanza Standard deviation': statistics.stdev(s_scores) if len(s_scores) > 1 else None,
            'Vader average scores': results['v_avg_scores'],
            'Vader maximum scores': results['v_max_scores'],
            'Vader minimum scores': results['v_min_scores'],
            'Vader standard deviation scores': results['v_std_scores'],
            'MPQA average scores': results['mpqa_avg_score'],
            'MPQA maximum scores': results['mpqa_max_score'],
            'MPQA minimum scores': results['mpqa_min_score'],
            'MPQA standard deviation scores': results['mpqa_sd_score'],
            'Sentiwordnet score': results['sentiwordnet_final_score']
        }

        # Readability scores, plus the LDA perplexity / coherence
        readability_scores = {
            'Flesch-Kincaid score': metrics['flesch_kincaid'].score,
            'Estimated reading level of the article': metrics['flesch_kincaid'].grade_level,
            'Flesch-Reading score': metrics['flesch_reading'].score,
            'The article is classified as': metrics['flesch_reading'].ease,
            'Dale-Chall Readability score': metrics['dale_chall'].score,
            'The estimated comprehension level for different grade levels': metrics['dale_chall'].grade_levels,
            'Automated Readability Index (ARI) score': metrics['ari'].score,
            'It corresponds to a grade level of': metrics['ari'].grade_levels,
            'This means that the text can be read by someone who is around': metrics['ari'].ages,
            'Coleman-Liau Index Score': metrics['coleman_liau'].score,
            'Estimated Grade Level': metrics['coleman_liau'].grade_level,
            'Gunning Fog score': metrics['gunning_fog'].score,
            'The estimated grade level for comprehension is': metrics['gunning_fog'].grade_level,
            'SPACHE score': metrics['spache'].score,
            'This corresponds to a grade level of': metrics['spache'].grade_level,
            'Linsear Write Index score': metrics['linsear_write'].score,
            'Approximate grade level equivalent': metrics['linsear_write'].grade_level,
            'Perplexity (how well the LDA model predicts the corpus) of the article': results['perplexity_lda'],
            'Coherence (how coherent the topics are) of the article': results['coherence_lda']
        }

        # Dependency tree scores
        dependency_tree_scores = {
            'Max tree depth': results['max_depth'],
            'Words at max depth': ', '.join(results['max_depth_words']),
            'Average tree length': results['avg_length'],
            'Maximum tree length': results['max_length'],
            'Minimum tree length': results['min_length']
        }

        # (file name, column order, rows) for every CSV file, in output order.
        tables = [('sentences_info.csv', sentence_fields, sentence_info)]
        for file_name, single_row in (
            ('general_stats.csv', general_stats),
            ('pos_tagging.csv', pos_tagging_ops),
            ('sentiment_scores.csv', sentiment_scores),
            ('readability_scores.csv', readability_scores),
            ('dependency_tree_scores.csv', dependency_tree_scores),
        ):
            # The columns of a one-row table are exactly the keys of that row.
            tables.append((file_name, list(single_row), [single_row]))

        for file_name, fieldnames, rows in tables:
            _append_csv_and_print(os.path.join(article_directory, file_name), fieldnames, rows)
    

In [114]:
# Articles to analyse in this run.
urls = [
    'https://www.foxnews.com/politics/republicans-respond-after-irs-whistleblower-says-hunter-biden-investigation-being-mishandled',
    'https://news.yahoo.com/alabama-education-director-ousted-over-234450832.html',
    'https://news.yahoo.com/samantha-cameron-remind-david-steer-050000235.html',
]

# Results are stored in one sub-directory per article below 'processed articles'.
calculate_scores(urls, 'processed articles')

Sentence       Total words  Filtered words                                                                                                                                                                                                                                                                                                                                                           Number of filtered words  Stop words                                                                                                         Number of stop words    Average number of stop words per sentence    Sentiment score
-----------  -------------  ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------