# Sentiment analysis

**⛳️ Goal**: Analyze the sentiment of the speeches using unsupervised machine learning techniques.

## Packages

In [2]:
from pathlib import Path
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from datetime import datetime
import os
import sys
import csv
import re
import stanza
import pandas as pd

# Basic paths: make the sibling "pdfs" and "src" folders importable
for folder_name in ("pdfs", "src"):
    src_path = str(Path.cwd().parent / folder_name)
    sys.path.append(src_path)

# Project root (parent of the current working directory) and the speeches folder
project_path = Path().resolve().parent
csv_path = project_path / "speeches_csv"

# Other basics: download the English Stanza resources without console output
stanza.download("en", verbose=False)

## Functions

In [3]:
# Stanza
_STANZA_PIPELINE = None      # Built lazily on first use, then reused for every speech


def _get_stanza_pipeline():
    """Return the shared Stanza pipeline, building it on the first call only."""
    global _STANZA_PIPELINE
    if _STANZA_PIPELINE is None:
        # NOTE(review): the Stanza docs list only `tokenize` as a requirement of the
        # sentiment processor, so pos/lemma/depparse can probably be dropped to save
        # time -- confirm before changing the processor list.
        _STANZA_PIPELINE = stanza.Pipeline('en', processors='tokenize, mwt, pos, lemma, depparse,sentiment',
                                           use_gpu=False, verbose=False, pos_batch_size=3000)
    return _STANZA_PIPELINE


def stanza_fn (string, j, max_j):
    """Return the mean Stanza sentence sentiment of one speech, shifted to [-1, 1].

    Parameters
    ----------
    string : str
        Text of the speech.
    j : int
        Position of the speech in the run; 0 prints the start banner.
    max_j : int
        Position of the last speech; when `j` equals it the finish banner is printed.

    Returns
    -------
    float
        Average of the per-sentence labels minus one, so that -1 is negative,
        0 neutral and 1 positive. NaN when the text contains no sentence.
    """
    time = datetime.now().strftime("%H:%M:%S")
    if j == 0:
        print(f"Stanza starts working at {time}")
    print(f"\rStanza working in speech {j} at {time}", end="")         # "\r" reprints the line in the same space

    # Reuse one pipeline: building it loads the models, which is far too slow to repeat per speech
    doc = _get_stanza_pipeline()(string)
    doc_sent = [sentence.sentiment for sentence in doc.sentences]
    if doc_sent:
        result = (sum(doc_sent) / len(doc_sent)) - 1        # Change the reference
    else:
        result = float("nan")                               # No sentence to score (e.g. empty text)

    if j == max_j:
        # Read the clock again: the timestamp taken before the analysis would
        # report the start of this speech as the finish time.
        time = datetime.now().strftime("%H:%M:%S")
        print(f"\nStanza finished working at {time}")

    return result          # 0 negative, 1 neutral, 2 positive. Now -1 negative, 0 neutral, 1 positive


# TextBlob
def textblob_fn (string, j, max_j):
    """Return the TextBlob polarity of one speech, rounded to three decimals.

    Parameters
    ----------
    string : str
        Text of the speech.
    j : int
        Position of the speech in the run; 0 prints the start banner.
    max_j : int
        Position of the last speech; when `j` equals it the finish banner is printed.

    Returns
    -------
    float
        Polarity score: -1 negative, 1 positive.
    """
    time = datetime.now().strftime("%H:%M:%S")
    if j == 0:
        print(f"TextBlob starts working at {time}")
    print(f"\rTextBlob working in speech {j} at {time}", end="")      # "\r" reprints the line in the same space

    tb_speech = TextBlob(string)
    result = round(tb_speech.polarity, 3)

    if j == max_j:
        # Read the clock again: the timestamp taken before the analysis would
        # report the start of this speech as the finish time.
        time = datetime.now().strftime("%H:%M:%S")
        print(f"\nTextBlob finished working at {time}")

    return result          # -1 negative, 1 positive


# Vader
_VADER_ANALYSER = None      # Built lazily on first use, then reused for every speech


def _get_vader_analyser():
    """Return the shared VADER analyser, creating it on the first call only."""
    global _VADER_ANALYSER
    if _VADER_ANALYSER is None:
        _VADER_ANALYSER = SentimentIntensityAnalyzer()
    return _VADER_ANALYSER


def vader_fn (string, j, max_j):
    """Return the VADER compound score of one speech.

    Parameters
    ----------
    string : str
        Text of the speech.
    j : int
        Position of the speech in the run; 0 prints the start banner.
    max_j : int
        Position of the last speech; when `j` equals it the finish banner is printed.

    Returns
    -------
    float
        The "compound" entry of the polarity scores
        (-1 negative, 1 positive, between -0.05 and 0.05 neutral).
    """
    time = datetime.now().strftime("%H:%M:%S")
    if j == 0:
        print(f"Vader starts working at {time}")
    print(f"\rVader working in speech {j} at {time}", end="")        # "\r" reprints the line in the same space

    # Reuse one analyser instead of constructing a new one for every speech
    score = _get_vader_analyser().polarity_scores(string)
    result = score["compound"]                                       # Author says that is the main statistic you need to see (-1 negative, 1 positive, between -0.05 and 0.05 neutral)

    if j == max_j:
        # Read the clock again: the timestamp taken before the analysis would
        # report the start of this speech as the finish time.
        time = datetime.now().strftime("%H:%M:%S")
        print(f"\nVader finished working at {time}")

    return result


# TextBlob Subjectivity
def tbsubj_fn (string, j, max_j):
    """Return the TextBlob subjectivity of one speech, rounded to three decimals.

    Parameters
    ----------
    string : str
        Text of the speech.
    j : int
        Position of the speech in the run; 0 prints the start banner.
    max_j : int
        Position of the last speech; when `j` equals it the finish banner is printed.

    Returns
    -------
    float
        Subjectivity score: 0 very objective, 1 very subjective.
    """
    time = datetime.now().strftime("%H:%M:%S")
    if j == 0:
        print(f"TextBlob starts working at {time}")
    print(f"\rTextBlob working in speech {j} at {time}", end="")      # "\r" reprints the line in the same space

    tb_speech = TextBlob(string)
    result = round(tb_speech.subjectivity, 3)

    if j == max_j:
        # Read the clock again: the timestamp taken before the analysis would
        # report the start of this speech as the finish time.
        time = datetime.now().strftime("%H:%M:%S")
        print(f"\nTextBlob finished working at {time}")

    return result          # 0 objective, 1 subjective



## Data

In [4]:
# Acquire the speeches and keep only the speeches (not interviews, to deal with in the future):
# rows whose highest_speaker_count is at most 3, renumbered from 0
df = pd.read_csv(csv_path / "all_speeches_cleaned.txt")
OneSpeaker = df.loc[df["highest_speaker_count"] <= 3].reset_index(drop=True)
OneSpeaker

Unnamed: 0,title,pages,date,location,highest_speaker_count,content,country,count_commas,state,city,specific_location
0,Prayer_Breakfast_2016,7,2016-02-04,"Washington Hilton, Washington D.C.",0,"Well, good morning. Giving all praise and hono...",USA,1,no_state,Washington D.C.,Washington Hilton
1,Security_Team_Announcement,5,2008-12-01,"Chicago, Illinois",0,"Good morning, everybody. I hope you all had a ...",USA,1,Illinois,Chicago,no_specific_location
2,Cairo_University,14,2009-06-04,"Cairo, Egypt",0,Thank you so much. Good afternoon. I am honore...,Egypt,1,no_state,Cairo,no_specific_location
3,Umpqua_Community_College_Shootings,4,2015-10-01,Washington D.C.,0,There's been another mass shooting in America ...,USA,0,no_state,Washington D.C.,no_specific_location
4,White_House_Correspondent_Dinner_2013,6,2013-04-27,"Washington Hilton Hotel, Washington D.C.",0,"Thank you. Thank you, everybody. How do you li...",USA,1,no_state,Washington D.C.,Washington Hilton Hotel
...,...,...,...,...,...,...,...,...,...,...,...
341,Shimon_Peres_Memorial,6,2016-09-30,"Mount Herzl, Jerusalem, Israel",0,"Zvia, Yoni, Chemi and generations of the Peres...",Israel,2,no_state,Jerusalem,Mount Herzl
342,ASEAN_Business_2015,9,2015-11-21,"Shangri-La Hotel, Kuala Lumpur, Malaysia",0,"Thank you so much. Please be seated. Well, goo...",Malaysia,2,no_state,Kuala Lumpur,Shangri-La Hotel
343,Finance_Crisis_Fee,3,2010-01-14,unknown_location,0,"Thank you, everybody, for being here. As we al...",USA,0,no_state,Washington D.C.,White House
344,Oval_Office_Counter_Terrorism_Agenda,5,2015-12-06,"White House, Washington D.C.",0,"Good evening. On Wednesday, 14 Americans were ...",USA,1,no_state,Washington D.C.,White House


In [5]:
# Test how to apply a function in a pandas dataframe and not break stuff (check cell 2 for context)
def clean_txt(string):
    """Replace every spaced double hyphen in `string` with a comma and a space.

    Only the exact sequence space, two hyphens, space is rewritten; the
    cleaned text is returned and the input is left untouched.
    """
    return string.replace(" -- ", ", ")

## Replace the content column with its cleaned version.
## `assign` returns a new dataframe, so OneSpeaker is rebound to it; mapping the
## function directly avoids a throwaway variable and a round trip through a list.
OneSpeaker = OneSpeaker.assign(content=OneSpeaker["content"].map(clean_txt))

In [6]:
# Pull the cleaned speech texts out of the dataframe as a plain list for the sentiment functions

cont_save = OneSpeaker["content"].tolist()
stop = len(cont_save)       # Number of speeches to process; lower it to try a subset of the dataset

## Running the processors

**Warning**: Stanza takes 1.5h-2h to run. TextBlob is the fastest.

In [11]:
# Run the Stanza sentiment analysis processor
# Warning: It takes 1.5h-2h.
# stop - 1 is the position of the last speech, because Python starts counting at 0
st = [ stanza_fn(speech, idx, stop - 1) for idx, speech in enumerate(cont_save[:stop]) ]

Stanza starts working at 13:48:38
Stanza working in speech 345 at 16:31:29
Stanza finished working at 16:31:29


In [8]:
# Run the TextBlob sentiment analysis processor
# Quick to run
# stop - 1 is the position of the last speech, because Python starts counting at 0
tb = [ textblob_fn(speech, idx, stop - 1) for idx, speech in enumerate(cont_save[:stop]) ]

TextBlob starts working at 13:28:17
TextBlob working in speech 345 at 13:28:24
TextBlob finished working at 13:28:24


In [9]:
# TextBlob for subjectivity
# stop - 1 is the position of the last speech, because Python starts counting at 0
tbsubj = [ tbsubj_fn(speech, idx, stop - 1) for idx, speech in enumerate(cont_save[:stop]) ]

TextBlob starts working at 13:28:34
TextBlob working in speech 345 at 13:28:40
TextBlob finished working at 13:28:40


In [10]:
# Run the Vader sentiment analysis processor
# stop - 1 is the position of the last speech, because Python starts counting at 0
vd = [ vader_fn(speech, idx, stop - 1) for idx, speech in enumerate(cont_save[:stop]) ]

Vader starts working at 13:28:48
Vader working in speech 345 at 13:33:50
Vader finished working at 13:33:50


In [12]:
# Gather every score in one dataframe (one row per speech) and display it
# Stanza and Vader scores are rounded to three decimals here
st = [ round(score, 3) for score in st ]
vd = [ round(score, 3) for score in vd ]

summary_df = pd.DataFrame({
    "title": OneSpeaker["title"].to_list(),
    "date": OneSpeaker["date"].to_list(),
    "stanza": st,
    "textblob": tb,
    "vader": vd,
    "subjectivity": tbsubj,
})
summary_df

Unnamed: 0,title,date,stanza,textblob,vader,subjectivity
0,Prayer_Breakfast_2016,2016-02-04,0.099,0.178,1.000,0.542
1,Security_Team_Announcement,2008-12-01,0.140,0.162,0.998,0.403
2,Cairo_University,2009-06-04,-0.150,0.110,1.000,0.457
3,Umpqua_Community_College_Shootings,2015-10-01,-0.250,0.061,-0.998,0.435
4,White_House_Correspondent_Dinner_2013,2013-04-27,-0.012,0.145,1.000,0.501
...,...,...,...,...,...,...
341,Shimon_Peres_Memorial,2016-09-30,-0.083,0.164,1.000,0.436
342,ASEAN_Business_2015,2015-11-21,-0.050,0.164,1.000,0.399
343,Finance_Crisis_Fee,2010-01-14,-0.412,0.080,0.987,0.436
344,Oval_Office_Counter_Terrorism_Agenda,2015-12-06,-0.134,0.076,-1.000,0.398


In [13]:
# Save the results as csv
save_path = project_path / "sentiment_analysis"
# Create the output folder when it is missing; to_csv does not create directories
save_path.mkdir(parents=True, exist_ok=True)

summary_df.to_csv(save_path / "results_sa_all.txt", index=False)