# Automated pre-processing of text articles, saving the scores for each article in a .txt file

In [2]:
# Setup cell: imports plus one-off downloads of the NLTK and Stanza resources used below.
# The download calls are no-ops when the resources are already present locally.
import nltk
nltk.download('punkt')          # sentence / word tokenizer models (sent_tokenize, word_tokenize)
nltk.download('stopwords')      # stop-word lists (stopwords.words('english'))
nltk.download('wordnet')        # WordNet corpus
nltk.download('vader_lexicon')  # lexicon behind SentimentIntensityAnalyzer
nltk.download('sentiwordnet')   # SentiWordNet scores (imported below as swn)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import sentiwordnet as swn

from nltk.sentiment.vader import SentimentIntensityAnalyzer

import stanza
stanza.download('en')  # Download the English model

import statistics
import numpy as np
import pandas as pd

import re

import requests
from bs4 import BeautifulSoup

import newspaper

import torch
# Release cached GPU memory before the models are loaded.
torch.cuda.empty_cache()
# NOTE(review): the summary returned here is neither printed nor assigned, so this call
# has no visible effect in the notebook — confirm whether it is still needed.
torch.cuda.memory_summary(device=None, abbreviated=False)

import os

[nltk_data] Downloading package punkt to /home/pierluigi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-29 13:00:13 INFO: Downloading default packages for language: en (English) ...
2023-04-29 13:00:14 INFO: File exists: /home/pierluigi/stanza_resources/en/default.zip
2023-04-29 13:00:19 INFO: Finished downloading models and saved to /home/pierluigi/stanza_resources.


In [3]:
# Build the Stanza pipeline with only the tokenizer and the sentiment classifier.
# use_gpu=False makes Stanza run on the CPU instead of the GPU, which avoids running out of GPU memory.
# NOTE(review): max_split_size_mb looks like a PyTorch CUDA-allocator setting rather than a
# Stanza Pipeline option — confirm that it has any effect when passed here.
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment', tokenize_no_ssplit=False, max_split_size_mb=15, use_gpu=False)

2023-04-29 13:00:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-29 13:00:28 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2023-04-29 13:00:28 INFO: Using device: cpu
2023-04-29 13:00:28 INFO: Loading: tokenize
2023-04-29 13:00:28 INFO: Loading: sentiment
2023-04-29 13:00:28 INFO: Done loading processors!


In [4]:
# Load the MPQA lexicon: a header-less, space-separated file whose fields look like "key=value".
lexicon = pd.read_csv(
    "/home/pierluigi/Documents/echo_chambers_intership/Code analysis/NLP/Single modules/subjclueslen1-HLTEMNLP05.tff",
    sep=" ",
    header=None,
    names=["type", "len", "word", "pos", "stemmed", "polarity", "strength"],
)

# Keep only the value part of the three fields used later on.
lexicon["type"] = lexicon["type"].str[len("type="):]
lexicon["word"] = lexicon["word"].str[len("word1="):]
lexicon["polarity"] = lexicon["polarity"].str[len("priorpolarity="):]

# The remaining fields are not needed by the analysis.
cols_to_remove = ["len", "pos", "stemmed", "strength"]
lexicon = lexicon.drop(columns=cols_to_remove)

# Encode subjectivity strength as 1 (weak) / 2 (strong).
lexicon["type"] = lexicon["type"].replace({"weaksubj": 1, "strongsubj": 2})

# Encode prior polarity as -1 / 0 / 1; "both" and "neutral" both map to 0.
lexicon["polarity"] = lexicon["polarity"].replace(
    {"negative": -1, "positive": 1, "both": 0, "neutral": 0}
)

In [5]:
def _stanza_sentence_score(sentence):
    """Return the Stanza sentiment score for a single (NLTK-split) sentence.

    Stanza labels each of its own sentences 0 (negative), 1 (neutral) or 2 (positive).
    Stanza may split the given text differently from NLTK, so:
      * exactly one Stanza sentence  -> its label is returned unchanged (int);
      * several Stanza sentences     -> the arithmetic mean of their labels (float);
      * no Stanza sentence at all    -> 1 (neutral) is used as a fallback.
    """
    labels = [stanza_sentence.sentiment for stanza_sentence in nlp(sentence).sentences]
    if not labels:
        return 1
    if len(labels) == 1:
        return labels[0]
    return sum(labels) / len(labels)


def _sentiwordnet_polarity(token):
    """Return 1, -1 or 0 depending on whether the summed positive or negative
    SentiWordNet scores over all synsets of `token` dominate (0 on a tie or no synsets)."""
    pos_score = 0
    neg_score = 0
    for synset in swn.senti_synsets(token):
        pos_score += synset.pos_score()
        neg_score += synset.neg_score()
    if pos_score > neg_score:
        return 1
    if neg_score > pos_score:
        return -1
    return 0


def preprocess_article(url):
    """Download one article and compute stop-word, stemming and sentiment statistics.

    The title, subtitle (meta description) and body are concatenated, split into
    sentences with NLTK, and every sentence is analysed for stop words, Porter stems,
    VADER scores and Stanza sentiment. MPQA and SentiWordNet scores are computed on
    the words of the whole concatenated text.

    Parameters
    ----------
    url : str
        Address of the article, fetched with `newspaper.Article`.

    Returns
    -------
    dict
        Per-sentence lists ('filtered_sentences', 'stop_words_per_sentence',
        'num_words_per_sentence', 'num_stop_words_per_sentence',
        'avg_stop_words_per_sentence', 'sentiment_scores') all have one entry per
        NLTK sentence, in order. The remaining keys hold article-level statistics,
        the stemmed text ('output_text'), the SentiWordNet tokens and their scores.
        The MPQA statistics are NaN when no word of the article is in the lexicon.

    Raises
    ------
    ValueError
        If no sentence could be extracted from the article.
    """
    # Create a newspaper Article object, then download and parse it
    article = newspaper.Article(url)
    article.download()
    article.parse()

    # Extract the title, subtitle, description, and main text
    title = article.title.strip()
    subtitle = article.meta_data.get("description", "").strip()
    description = article.meta_description.strip()
    text = article.text.strip()

    # Set the subtitle to the description if it is empty
    if not subtitle:
        subtitle = description.strip()

    # Concatenate the extracted strings and tokenize them into sentences
    article_text = f"{title}\n\n{subtitle}\n\n{text}"
    sentences = sent_tokenize(article_text)
    if not sentences:
        # Every statistic below divides by, or takes the max/min over, the sentences.
        raise ValueError(f"No text could be extracted from {url!r}")

    # Loop-invariant helpers: build them once instead of once per sentence
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    analyzer = SentimentIntensityAnalyzer()

    # Per-sentence accumulators (index i always refers to sentences[i])
    num_stop_words_per_sentence = []
    stop_words_per_sentence = []
    filtered_sentences = []
    num_words_per_sentence = []
    avg_stop_words_per_sentence = []
    stemmed_sentences = []
    sentiment_scores = []
    v_scores_list = []
    total_words = 0

    # NOTE(review): 'stop_words_found' and 'all_stop_words' in the returned dict hold the
    # values of the LAST sentence only (kept as in the original code) — confirm whether
    # article-level values were intended.
    stop_words_found = []
    all_stop_words = 0

    for sentence in sentences:
        # Tokenize the sentence into words
        words = word_tokenize(sentence)
        all_words = len(words)
        total_words += all_words

        # Separate stop words from the remaining words
        stop_words_found = [word for word in words if word.lower() in stop_words]
        all_stop_words = len(stop_words_found)
        filtered_words = [word for word in words if word.lower() not in stop_words]

        num_stop_words = all_words - len(filtered_words)
        num_stop_words_per_sentence.append(num_stop_words)
        stop_words_per_sentence.append(stop_words_found)
        filtered_sentences.append(" ".join(filtered_words))
        num_words_per_sentence.append(all_words)

        # Share of stop words in this sentence (guarded against an empty token list)
        avg_stop_words_per_sentence.append(num_stop_words / all_words if all_words else 0.0)

        # Porter-stem every word and keep the stemmed sentence
        stemmed_sentences.append(' '.join(stemmer.stem(word) for word in words))

        # VADER negative / neutral / positive proportions
        v_scores = analyzer.polarity_scores(sentence)
        v_scores_list.append([v_scores['neg'], v_scores['neu'], v_scores['pos']])

        # Stanza sentiment, scored on the same sentence so the list stays aligned
        # with the other per-sentence lists
        sentiment_scores.append(_stanza_sentence_score(sentence))

    output_text = '\n'.join(stemmed_sentences)

    # MPQA analysis: look every whitespace-separated word up in the lexicon.
    # The mapping keeps the first lexicon row of each word, as the row lookup did.
    polarity_by_word = (
        lexicon.drop_duplicates(subset="word").set_index("word")["polarity"].to_dict()
    )
    mpqa_scores = []
    for raw_word in article_text.split():
        word = raw_word.strip().lower()
        if word in polarity_by_word:
            mpqa_scores.append(polarity_by_word[word])

    # Calculate summary statistics
    num_stop_words = sum(num_stop_words_per_sentence)
    num_sentences = len(sentences)
    avg_stop_words_per_sentence_all = num_stop_words / num_sentences
    max_stop_words_per_sentence = max(num_stop_words_per_sentence)
    min_stop_words_per_sentence = min(num_stop_words_per_sentence)
    avg_stop_words_per_word = num_stop_words / total_words if total_words else 0.0

    # Mean of the per-sentence stop-word shares
    avg_stop_words_per_sentence_avg = sum(avg_stop_words_per_sentence) / len(avg_stop_words_per_sentence)

    v_scores_array = np.array(v_scores_list)
    v_avg_scores = np.mean(v_scores_array, axis=0)
    v_max_scores = np.max(v_scores_array, axis=0)
    v_min_scores = np.min(v_scores_array, axis=0)
    v_std_scores = np.std(v_scores_array, axis=0)

    if mpqa_scores:
        mpqa_avg_score = np.mean(mpqa_scores)
        mpqa_max_score = np.max(mpqa_scores)
        mpqa_min_score = np.min(mpqa_scores)
        mpqa_sd_score = np.std(mpqa_scores)
    else:
        # np.max / np.min raise on an empty sequence; report "no data" instead
        mpqa_avg_score = mpqa_max_score = mpqa_min_score = mpqa_sd_score = float('nan')

    # SentiWordNet: lower-cased alphanumeric tokens that are not stop words
    tokens = [
        token
        for token in word_tokenize(article_text.lower())
        if token.isalnum() and token not in stop_words
    ]
    sentiwordnet_scores = [_sentiwordnet_polarity(token) for token in tokens]

    # Return the output
    return {
        'title': title,
        'num_stop_words': num_stop_words,
        'total_words': total_words,
        'stop_words_found': stop_words_found,
        'all_stop_words': all_stop_words,
        'avg_stop_words_per_sentence_all': avg_stop_words_per_sentence_all,
        'max_stop_words_per_sentence': max_stop_words_per_sentence,
        'min_stop_words_per_sentence': min_stop_words_per_sentence,
        'avg_stop_words_per_word': avg_stop_words_per_word,
        'avg_stop_words_per_sentence': avg_stop_words_per_sentence,
        'avg_stop_words_per_sentence_avg': avg_stop_words_per_sentence_avg,
        'filtered_sentences': filtered_sentences,
        'stop_words_per_sentence': stop_words_per_sentence,
        'num_words_per_sentence': num_words_per_sentence,
        'num_stop_words_per_sentence': num_stop_words_per_sentence,
        'output_text': output_text,
        'sentiment_scores': sentiment_scores,
        'v_avg_scores': v_avg_scores,
        'v_max_scores': v_max_scores,
        'v_min_scores': v_min_scores,
        'v_std_scores': v_std_scores,
        'mpqa_avg_score': mpqa_avg_score,
        'mpqa_max_score': mpqa_max_score,
        'mpqa_min_score': mpqa_min_score,
        'mpqa_sd_score': mpqa_sd_score,
        'tokens': tokens,
        'sentiwordnet_scores': sentiwordnet_scores
    }


In [6]:
def _safe_filename(title):
    """Turn an article title into a file name that stays inside the output directory.

    Path separators ('/' and '\\') and NUL characters are replaced by '_';
    an empty result falls back to 'untitled'.
    """
    cleaned = re.sub(r'[\\/\x00]', '_', title).strip()
    return cleaned or 'untitled'


def preprocess_articles(urls, directory):
    """Analyse every article in `urls` and write one text report per article.

    Parameters
    ----------
    urls : iterable of str
        Article addresses, each passed to `preprocess_article`.
    directory : str
        Output directory (created if missing). Each report is stored as
        '<directory>/<article title>.txt'; an existing file with that name is overwritten.

    The report lists the per-sentence stop-word statistics and sentiment score,
    followed by article-level stop-word, Stanza, VADER, MPQA and SentiWordNet results
    and the stemmed text.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    for url in urls:
        results = preprocess_article(url)
        scores = results['sentiment_scores']

        # A title may contain '/', which would otherwise be read as a sub-directory
        file_path = os.path.join(directory, f'{_safe_filename(results["title"])}.txt')

        # Article text is routinely non-ASCII, so do not depend on the locale's encoding
        with open(file_path, 'w', encoding='utf-8') as f:
            # Save the information for each sentence to the file
            for i, sentence in enumerate(results['filtered_sentences']):
                filtered_words = sentence.split()
                # Guard the lookup: the score list may be shorter than the sentence list
                sentence_score = scores[i] if i < len(scores) else 'n/a'

                f.write(f"Sentence {i+1}: {sentence}\n")
                f.write(f"Total words: {results['num_words_per_sentence'][i]}\n")
                f.write(f"Filtered words: {filtered_words}\n")
                f.write(f"Number of filtered words: {len(filtered_words)}\n")
                f.write(f"Stop words: {results['stop_words_per_sentence'][i]}\n")
                f.write(f"Number of stop words: {results['num_stop_words_per_sentence'][i]}\n")
                f.write(f"Average number of stop words per sentence: {round(results['avg_stop_words_per_sentence'][i], 2)}\n")
                f.write(f"Sentiment score: {sentence_score}\n\n")

            # Save the general statistics on stop words to the file
            f.write(f"Total number of words: {results['total_words']}\n")
            f.write(f"Total number of stop words: {results['num_stop_words']}\n")
            f.write(f"Maximum number of stop words per sentence: {results['max_stop_words_per_sentence']}\n")
            f.write(f"Minimum number of stop words per sentence: {results['min_stop_words_per_sentence']}\n")
            f.write(f"Average number of stop words per article: {round(results['avg_stop_words_per_sentence_avg'], 2)}\n\n")

            # Aggregate sentiment; NaN marks statistics that are undefined for the data
            # (no scores at all, or a single score for the sample standard deviation)
            undefined = float('nan')
            score_avg = sum(scores) / len(scores) if scores else undefined
            score_max = max(scores) if scores else undefined
            score_min = min(scores) if scores else undefined
            score_sd = statistics.stdev(scores) if len(scores) > 1 else undefined

            f.write(f"Average of sentiment score for all sentences: {score_avg}\n")
            f.write(f"Maximum sentiment score: {score_max}\n")
            f.write(f"Minimum sentiment score: {score_min}\n")
            f.write(f"Standard deviation: {score_sd}\n\n")

            f.write(f"Vader average scores: {results['v_avg_scores']}\n")
            f.write(f"Vader maximum scores: {results['v_max_scores']}\n")
            f.write(f"Vader minimum scores: {results['v_min_scores']}\n")
            f.write(f"Vader standard deviation scores: {results['v_std_scores']}\n\n")

            f.write(f"MPQA average scores: {results['mpqa_avg_score']}\n")
            f.write(f"MPQA maximum scores: {results['mpqa_max_score']}\n")
            f.write(f"MPQA minimum scores: {results['mpqa_min_score']}\n")
            f.write(f"MPQA standard deviation scores: {results['mpqa_sd_score']}\n\n")

            # One line per SentiWordNet token with its -1 / 0 / 1 score
            for i, (token, token_score) in enumerate(zip(results['tokens'], results['sentiwordnet_scores'])):
                f.write(f"{i+1}. Token: {token}, Sentiment Score: {token_score}\n")

            f.write("\nStemmed text:\n")
            f.write(f"{results['output_text']}")

In [7]:
# Articles to analyse; one report file is produced per entry.
urls = [
    'https://www.foxnews.com/politics/republicans-respond-after-irs-whistleblower-says-hunter-biden-investigation-being-mishandled',
    'https://news.yahoo.com/alabama-education-director-ousted-over-234450832.html',
    'https://news.yahoo.com/samantha-cameron-remind-david-steer-050000235.html',
]

# Download, analyse and save every article into the 'preprocessed articles' directory.
preprocess_articles(urls, directory='preprocessed articles')