### Sentiment analysis

**⛳️ Goal**: Analyze the sentiment of the speeches without any labelled training data, using three off-the-shelf sentiment models (Stanza, TextBlob and VADER).

In [10]:
from pathlib import Path
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from datetime import datetime
import os
import sys
import csv
import re
import stanza
import pandas as pd

# Basic paths
# Put the sibling "pdfs" and "src" folders on sys.path (in that order);
# src_path ends up pointing at the "src" folder.
for folder in ("pdfs", "src"):
    src_path = str(Path.cwd().parent / folder)
    sys.path.append(src_path)

# Project root (parent of the working directory) and the folder with the CSV data
project_path = Path().resolve().parent
csv_path = project_path / "speeches_csv"

# Other basics
# Fetch the English Stanza resources quietly.
stanza.download('en',verbose=False)

In [11]:
# Functions to call the models

# Stanza
# Stanza pipeline shared by every stanza_fn call; built lazily on first use
# because constructing it is expensive.
_STANZA_PIPELINE = None


def _get_stanza_pipeline():
    """Return the shared Stanza pipeline, building it the first time it is needed."""
    global _STANZA_PIPELINE
    if _STANZA_PIPELINE is None:
        _STANZA_PIPELINE = stanza.Pipeline('en', processors='tokenize, mwt, pos, lemma, depparse,sentiment',
                                           use_gpu=False, verbose=False, pos_batch_size=3000)
    return _STANZA_PIPELINE


def stanza_fn(string, j, max_j):
    """Score one speech with Stanza and report progress on stdout.

    Parameters
    ----------
    string : str
        Text of the speech.
    j : int
        Position of this speech in the run; 0 triggers the "starts" message.
    max_j : int
        Position of the last speech; when ``j == max_j`` the "finished"
        message is printed.

    Returns
    -------
    float
        Mean of the per-sentence Stanza sentiment labels shifted by -1, so the
        scale is -1 negative, 0 neutral, 1 positive (Stanza itself reports
        0 negative, 1 neutral, 2 positive). ``nan`` when the text yields no
        sentences.
    """
    started = datetime.now().strftime("%H:%M:%S")
    if j == 0:
        print(f"Stanza starts working at {started}")
    print(f"\rStanza working in speech {j} at {started}", end="")         # "\r" rewrites the same console line

    # Reuse one pipeline for the whole run instead of rebuilding it per speech.
    doc = _get_stanza_pipeline()(string)
    labels = [sentence.sentiment for sentence in doc.sentences]
    if labels:
        result = (sum(labels) / len(labels)) - 1        # Change the reference from 0..2 to -1..1
    else:
        # Nothing to average: flag the score as missing instead of dividing by zero.
        result = float("nan")

    if j == max_j:
        # Read the clock again so the message shows the real finishing time.
        finished = datetime.now().strftime("%H:%M:%S")
        print(f"\nStanza finished working at {finished}")

    return result


# TextBlob
def textblob_fn(string, j, max_j):
    """Score one speech with TextBlob and report progress on stdout.

    Parameters
    ----------
    string : str
        Text of the speech.
    j : int
        Position of this speech in the run; 0 triggers the "starts" message.
    max_j : int
        Position of the last speech; when ``j == max_j`` the "finished"
        message is printed.

    Returns
    -------
    float
        TextBlob polarity rounded to three decimals (-1 negative, 1 positive).
    """
    started = datetime.now().strftime("%H:%M:%S")
    if j == 0:
        print(f"TextBlob starts working at {started}")
    print(f"\rTextBlob working in speech {j} at {started}", end="")      # "\r" rewrites the same console line

    result = round(TextBlob(string).polarity, 3)

    if j == max_j:
        # Read the clock again so the message shows the real finishing time.
        finished = datetime.now().strftime("%H:%M:%S")
        print(f"\nTextBlob finished working at {finished}")

    return result


# Vader
# VADER analyser shared by every vader_fn call; created lazily on first use.
_VADER_ANALYSER = None


def vader_fn(string, j, max_j):
    """Score one speech with VADER and report progress on stdout.

    Parameters
    ----------
    string : str
        Text of the speech.
    j : int
        Position of this speech in the run; 0 triggers the "starts" message.
    max_j : int
        Position of the last speech; when ``j == max_j`` the "finished"
        message is printed.

    Returns
    -------
    float
        The "compound" entry of VADER's polarity scores. The author says this
        is the main statistic to look at (-1 negative, 1 positive, between
        -0.05 and 0.05 neutral).

    NOTE(review): whole speeches come out at or very near +/-1.000 in the
    results table; presumably the compound score saturates on long texts --
    consider scoring sentence by sentence and averaging. TODO confirm.
    """
    global _VADER_ANALYSER

    started = datetime.now().strftime("%H:%M:%S")
    if j == 0:
        print(f"Vader starts working at {started}")
    print(f"\rVader working in speech {j} at {started}", end="")        # "\r" rewrites the same console line

    # Build the analyser once and reuse it, instead of re-creating it per speech.
    if _VADER_ANALYSER is None:
        _VADER_ANALYSER = SentimentIntensityAnalyzer()
    result = _VADER_ANALYSER.polarity_scores(string)["compound"]

    if j == max_j:
        # Read the clock again so the message shows the real finishing time.
        finished = datetime.now().strftime("%H:%M:%S")
        print(f"\nVader finished working at {finished}")

    return result


In [12]:
# Acquire the speeches and keep only the rows whose highest_speaker_count is 0
# (speeches, not interviews; interviews are to be dealt with in the future).
df = pd.read_csv(csv_path / "all_speeches.txt")
# Filter and re-index in one chained expression so a new frame is produced
# instead of mutating a filtered slice in place.
OneSpeaker = df[df["highest_speaker_count"] == 0].reset_index(drop=True)
OneSpeaker

Unnamed: 0,title,pages,date,location,highest_speaker_count,content
0,Prayer_Breakfast_2016,7,4 February 2016,"Washington Hilton, Washington, D.C.",0,"Well, good morning. Giving all praise and hono..."
1,Security_Team_Announcement,5,1 December 2008,"Chicago, Illinois",0,"Good morning, everybody. I hope you all had a ..."
2,Cairo_University,14,4 June 2009,"Cairo, Egypt",0,Thank you so much. Good afternoon. I am honore...
3,Umpqua_Community_College_Shootings,4,1 October 2015,"Washington, D.C.",0,There's been another mass shooting in America ...
4,White_House_Correspondent_Dinner_2013,6,27 April 2013,"Washington Hilton Hotel, Washington, D.C.",0,"Thank you. Thank you, everybody. How do you li..."
...,...,...,...,...,...,...
283,Shimon_Peres_Memorial,6,30 September 2016,"Mount Herzl, Jerusalem, Israel",0,"Zvia, Yoni, Chemi and generations of the Peres..."
284,ASEAN_Business_2015,9,21 November 2015,"Shangri-La Hotel, Kuala Lumpur, Malaysia",0,"Thank you so much. Please be seated. Well, goo..."
285,Finance_Crisis_Fee,3,14 January 2010,unknown_location,0,"Thank you, everybody, for being here. As we al..."
286,Oval_Office_Counter_Terrorism_Agenda,5,6 December 2015,"Oval Office, The White House",0,"Good evening. On Wednesday, 14 Americans were ..."


In [13]:
# Text clean-up helper, applied to the pandas dataframe below (check cell 2 for context)
def clean_txt(string: str) -> str:
    """Return ``string`` with every ' -- ' separator turned into ', '."""
    pieces = string.split(" -- ")
    return ", ".join(pieces)

## Replace the content column with its cleaned version.
## The cleaned values are first collected in a plain list and then handed to
## assign(), which returns a new frame rather than editing the column in place.
a = [clean_txt(speech) for speech in OneSpeaker["content"]]
OneSpeaker = OneSpeaker.assign(content=a)

In [14]:
# Inputs for the sentiment models: the speech texts as a plain list, plus how
# many of them to score.

cont_save = OneSpeaker["content"].to_list()
# Currently the whole dataset; set a smaller number for a quick partial run.
stop = len(cont_save)

In [16]:
# Run the Stanza sentiment analysis processor
# Warning: It takes a lot of time!!
last_idx = stop - 1    # Python counts from 0, so the last speech is number stop - 1
st = [stanza_fn(cont_save[idx], idx, last_idx) for idx in range(stop)]

Stanza starts working at 12:43:50
Stanza working in speech 287 at 14:35:22
Stanza finished working at 14:35:22


In [17]:
# Run the TextBlob sentiment analysis processor
# Quick to run
last_idx = stop - 1    # Python counts from 0, so the last speech is number stop - 1
tb = [textblob_fn(cont_save[idx], idx, last_idx) for idx in range(stop)]

TextBlob starts working at 14:51:03
TextBlob working in speech 287 at 14:51:08
TextBlob finished working at 14:51:08


In [18]:
# Run the Vader sentiment analysis processor
# Warning: Faster than Stanza, slower than Textblob
last_idx = stop - 1    # Python counts from 0, so the last speech is number stop - 1
vd = [vader_fn(cont_save[idx], idx, last_idx) for idx in range(stop)]

Vader starts working at 14:51:11
Vader working in speech 287 at 14:54:40
Vader finished working at 14:54:40


In [19]:
# Gather the three score lists next to each speech's title and date and show
# the result as a dataframe. Stanza and Vader scores are rounded to 3 decimals here.
st = list(map(lambda score: round(score, 3), st))
vd = list(map(lambda score: round(score, 3), vd))

summary_data = dict(
    title=OneSpeaker["title"].to_list(),
    date=OneSpeaker["date"].to_list(),
    stanza=st,
    textblob=tb,
    vader=vd,
)

summary_df = pd.DataFrame(summary_data)
summary_df

Unnamed: 0,title,date,stanza,textblob,vader
0,Prayer_Breakfast_2016,4 February 2016,0.099,0.178,1.000
1,Security_Team_Announcement,1 December 2008,0.140,0.162,0.998
2,Cairo_University,4 June 2009,-0.150,0.110,1.000
3,Umpqua_Community_College_Shootings,1 October 2015,-0.250,0.061,-0.998
4,White_House_Correspondent_Dinner_2013,27 April 2013,-0.012,0.145,1.000
...,...,...,...,...,...
283,Shimon_Peres_Memorial,30 September 2016,-0.083,0.164,1.000
284,ASEAN_Business_2015,21 November 2015,-0.050,0.164,1.000
285,Finance_Crisis_Fee,14 January 2010,-0.412,0.080,0.987
286,Oval_Office_Counter_Terrorism_Agenda,6 December 2015,-0.134,0.076,-1.000


In [20]:
# Save the results as csv
save_path = project_path / "Sentiment Analysis"
# Create the output folder if it is missing; to_csv does not create directories.
save_path.mkdir(parents=True, exist_ok=True)

summary_df.to_csv(save_path / "results_sa_date.txt", index=False)

### Ideas to explore before the next meeting
- Check why VADER's scores differ so much from Stanza's and TextBlob's (most speeches in the table above score at or very near ±1.000)
- Maybe plot something with this
- Maybe do an exploration on the most common words (and their synonyms) to explain the sentiment or correlate speeches
- Explore external parameters to explain the mood