In [None]:
#Run these code for first time PYthon users, or if you have never run libraries before.
#If you have used these libraries before, ignore this line
!pip install vaderSentiment
!pip install nltk
!pip install matplotlib
!pip install seaborn
!pip install pandas

In [53]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [54]:
# Folder that holds the three article CSV files. Point this at the folder on
# your own machine (here: the Downloads folder). Keep the trailing slash,
# because the file names are appended to this string directly.
csv_file_path = '/Users/Admin/Downloads/'

# All three files are semicolon-separated; only articles23.csv is read as latin1.
df_1920 = pd.read_csv(csv_file_path + "articles1920.csv", sep=';')
df_2122 = pd.read_csv(csv_file_path + "articles2122.csv", sep=';')
df_23 = pd.read_csv(csv_file_path + "articles23.csv", sep=';', encoding='latin1')

# Stack the three frames into one, renumbering the rows from 0.
article_frames = [df_1920, df_2122, df_23]
df = pd.concat(article_frames, ignore_index=True)

In [55]:
# Strip the trailing 'txt' extension (and the dot in front of it, if present)
# from each file name. The pattern is anchored to the end of the string:
# a plain replace('txt', '') leaves the dot behind and would also delete
# 'txt' occurring inside a title.
# NOTE(review): presumably every name ends in '.txt' - confirm against the CSVs.
df['FileNames'] = df['FileNames'].str.replace(r'\.?txt$', '', regex=True)

#Function to adjust mismatch in how article months are labelled (Jan 20XX -> 20XX_1)

# Lower-cased full month names; each name and its three-letter abbreviation
# map to the month number as text (no zero padding, e.g. 'jan' -> '1').
_MONTH_NAMES = ('january', 'february', 'march', 'april', 'may', 'june',
                'july', 'august', 'september', 'october', 'november', 'december')
_MONTH_NUMBERS = {
    key: str(number)
    for number, name in enumerate(_MONTH_NAMES, start=1)
    for key in (name, name[:3])
}
# '<month word> <4-digit year>', e.g. 'Jan 2020'; a '.' after the month is tolerated.
_MONTH_YEAR_LABEL = re.compile(r'([A-Za-z]+)\.?\s+(\d{4})')


def convert_date_format(date_str):
    """Convert a 'Mon YYYY' label (e.g. 'Jan 2020') to 'YYYY_M' (e.g. '2020_1').

    Parameters
    ----------
    date_str : object
        A month label. Month names are matched case-insensitively and may be
        the three-letter abbreviation or the full English name.

    Returns
    -------
    object
        The converted 'YYYY_M' string, or ``date_str`` unchanged when it is
        not a string, is not of the form '<month> <year>', or names an
        unknown month. Labels already in 'YYYY_M' form therefore pass through.
    """
    # Missing values (e.g. NaN) are not strings; leave them for the caller.
    if not isinstance(date_str, str):
        return date_str

    # fullmatch: the whole label must be '<month> <year>', so trailing text
    # can no longer slip through and break the conversion.
    label = _MONTH_YEAR_LABEL.fullmatch(date_str.strip())
    if label is None:
        return date_str

    month_numeric = _MONTH_NUMBERS.get(label.group(1).lower())
    if month_numeric is None:
        return date_str

    return f'{label.group(2)}_{month_numeric}'
    
# Rewrite every 'DirectoryName' label through convert_date_format so the
# column uses a single labelling scheme.
directory_labels = df['DirectoryName']
df['DirectoryName'] = directory_labels.apply(convert_date_format)

In [56]:
# One analyzer instance, shared by every call to get_vader_sentiment.
analyzer = SentimentIntensityAnalyzer()


def get_vader_sentiment(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``."""
    return analyzer.polarity_scores(text)['compound']

#Get title sentiment (titles are simple enough to pass to VADER directly,
#without the cleaning applied to the body text)
article_titles = df['FileNames']
df['Title_Sentiment'] = article_titles.apply(get_vader_sentiment)

In [57]:
# Fetch the NLTK data packages this notebook asks for (stopwords and wordnet)
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stop words as a set, for fast membership checks
stop_words = set(stopwords.words('english'))

#Lower-case body text, strip punctuation and drop stop words
def preprocess_text(text, stop_words):
    """Return ``text`` lower-cased, with punctuation and stop words removed.

    The text is split on whitespace. Every non-word character (anything
    outside letters, digits and underscore) is deleted from each token, and
    tokens that end up empty or that appear in ``stop_words`` are dropped.
    The surviving tokens are joined with single spaces.

    NOTE(review): the stop-word check runs after punctuation is stripped, so
    a token such as "don't" is compared as "dont". Also, if ``stop_words``
    contains negations (e.g. 'not'), removing them before sentiment scoring
    may change the score - confirm this is intended.
    """
    bare_tokens = (re.sub(r'\W', '', token) for token in text.lower().split())
    return ' '.join(
        token for token in bare_tokens
        if token and token not in stop_words
    )

#Remove potential URLs in body text
def remove_urls(text):
    """Return ``text`` with web addresses removed.

    A web address is a run of non-whitespace characters that starts, at a
    word boundary, with 'http://', 'https://' or 'www.' (any letter case).
    Requiring the boundary and the '://' or '.' keeps ordinary words that
    merely contain the letters 'http' or 'www' (e.g. 'httpd', 'awwww')
    intact. Surrounding whitespace is left as it is.
    """
    return re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

# Body text pipeline: strip URLs -> lower-case / remove punctuation and
# stop words -> VADER compound score. Each stage is kept as its own column.
df['Cleaned_Text'] = df['Content'].apply(remove_urls)
df['Final_Text'] = df['Cleaned_Text'].apply(preprocess_text, args=(stop_words,))
df['Text_Sentiment'] = df['Final_Text'].apply(get_vader_sentiment)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
#Collect aggregated sentiment (title & body), then make them dataframes

# Group once by 'DirectoryName' and reuse the grouping for both averages.
sentiment_by_month = df.groupby('DirectoryName')

# reset_index(name=...) turns each averaged Series into a two-column
# DataFrame: 'DirectoryName' plus the named score column.
Title = sentiment_by_month['Title_Sentiment'].mean().reset_index(name='Title_Sentiment')
Body = sentiment_by_month['Text_Sentiment'].mean().reset_index(name='Body_Sentiment')

In [None]:
#Title sentiment score

# Silence library warnings *before* plotting, so the output is clean on the
# first run of this cell. Note: the filter stays active for the rest of the
# session, so later cells are silenced too.
warnings.filterwarnings("ignore")

# Re-format the labels as zero-padded 'YYYY_MM' so that sorting the strings
# also puts the months in chronological order.
Title['DirectoryName'] = pd.to_datetime(Title['DirectoryName'], format='%Y_%m').dt.strftime('%Y_%m')
Title = Title.sort_values('DirectoryName')

#Visualization
plt.figure(figsize=(10, 6))
sns.lineplot(x='DirectoryName', y='Title_Sentiment', data=Title, marker=None)
plt.xticks(rotation=90, ha='right')
plt.title('Sentiment Trend - Title Sentiment')
plt.xlabel('Month')
plt.ylabel('Aggregated Sentiment')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#Body text sentiment score
# Zero-padded 'YYYY_MM' labels sort chronologically as plain strings.
body_months = pd.to_datetime(Body['DirectoryName'], format='%Y_%m')
Body['DirectoryName'] = body_months.dt.strftime('%Y_%m')
Body = Body.sort_values('DirectoryName')

#Visualization text sentiment
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=Body, x='DirectoryName', y='Body_Sentiment', marker=None, ax=ax)
plt.xticks(rotation=90, ha='right')
ax.set_title('Sentiment Trend - Body Sentiment')
ax.set_xlabel('Month')
ax.set_ylabel('Aggregated Sentiment')
ax.grid(True)
fig.tight_layout()
plt.show()

In [27]:
# Save the Title frame as 'Title.csv' (relative path), without the row index.
Title.to_csv('Title.csv', index=False)

In [28]:
# Save the Body frame as 'Body.csv' (relative path), without the row index.
Body.to_csv('Body.csv', index=False)