# Sentiment Analysis of Financial Text

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import nltk

import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to figures drawn after this point.
plt.style.use('ggplot')

# Read the sentence/sentiment CSV from its project-relative location and
# display the resulting frame.
data_path = Path('input/fin-sent-data.csv')
df = pd.read_csv(data_path)
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [2]:
# Class balance check: how many sentences carry each sentiment label.
sentiment_counts = df["Sentiment"].value_counts()
sentiment_counts

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64

# Creating a dataframe providing polarity scores for sentences

In [3]:
from nltk.sentiment import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer needs the VADER lexicon on disk and raises
# LookupError without it. Fetch it only when it is missing so the notebook
# runs on a fresh environment without re-downloading on every execution.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Shared analyzer instance used by the scoring cells below.
sia = SentimentIntensityAnalyzer()

In [4]:
# Run the polarity score on the entire dataset.
df_length = len(df)

# Iterate over the column values directly instead of looking each row up by
# label (`df['Sentence'][i]`), so scoring does not depend on `df` carrying a
# default 0..n-1 index. Keys are row positions; values are the score dicts
# returned by `sia.polarity_scores`.
result = {
    position: sia.polarity_scores(sentence)
    for position, sentence in enumerate(df['Sentence'])
}

# orient='index' makes each dict key a row, so no transpose is needed:
# one row per sentence, one column per score.
scores_df = pd.DataFrame.from_dict(result, orient='index')
scores_df

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.847,0.153,0.5423
1,0.167,0.833,0.000,-0.2023
2,0.064,0.856,0.080,0.1531
3,0.000,1.000,0.000,0.0000
4,0.000,1.000,0.000,0.0000
...,...,...,...,...
5837,0.239,0.761,0.000,-0.5267
5838,0.000,1.000,0.000,0.0000
5839,0.000,0.870,0.130,0.2023
5840,0.000,0.824,0.176,0.4588


In [5]:
# Give both frames the same positional key so the scores can be joined back
# onto their sentences and labels.
row_ids = range(len(df))
df['id'] = row_ids
scores_df['id'] = row_ids

# No `on=` is passed, so merge joins on the columns the two frames share.
scores_df = scores_df.merge(df, how='left')
scores_df

Unnamed: 0,neg,neu,pos,compound,id,Sentence,Sentiment
0,0.000,0.847,0.153,0.5423,0,The GeoSolutions technology will leverage Bene...,positive
1,0.167,0.833,0.000,-0.2023,1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,0.064,0.856,0.080,0.1531,2,"For the last quarter of 2010 , Componenta 's n...",positive
3,0.000,1.000,0.000,0.0000,3,According to the Finnish-Russian Chamber of Co...,neutral
4,0.000,1.000,0.000,0.0000,4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...,...,...,...,...,...
5837,0.239,0.761,0.000,-0.5267,5837,RISING costs have forced packaging producer Hu...,negative
5838,0.000,1.000,0.000,0.0000,5838,Nordic Walking was first used as a summer trai...,neutral
5839,0.000,0.870,0.130,0.2023,5839,"According shipping company Viking Line , the E...",neutral
5840,0.000,0.824,0.176,0.4588,5840,"In the building and home improvement trade , s...",neutral


# Determining words with intense negative polarity within the data

In [40]:
# Keep only the sentences whose compound score is -0.4 or lower, and renumber
# the surviving rows from zero.
is_strongly_negative = scores_df['compound'] <= -.4
neg_df = scores_df[is_strongly_negative].reset_index(drop=True)
neg_df

Unnamed: 0,neg,neu,pos,compound,id,Sentence,Sentiment
0,0.231,0.655,0.114,-0.4019,11,$SAP Q1 disappoints as #software licenses down...,negative
1,0.147,0.853,0.000,-0.4767,27,"The five-storey , eco-efficient building will ...",neutral
2,0.319,0.563,0.117,-0.6486,39,$AAPL afternoon selloff as usual will be bruta...,negative
3,0.097,0.903,0.000,-0.4019,41,Dolce & Gabbana has asked the European Union t...,negative
4,0.408,0.592,0.000,-0.7351,45,L&G still paying price for dividend cut during...,negative
...,...,...,...,...,...,...,...
300,0.220,0.780,0.000,-0.4767,5732,$NIHD insiders got this one wrong. Looking for...,negative
301,0.150,0.850,0.000,-0.4389,5778,Still short $LNG from $11.70 area...next stop ...,negative
302,0.311,0.547,0.142,-0.4754,5801,"At this growth rate , paying off the national ...",negative
303,0.320,0.593,0.087,-0.7096,5802,"Reuters: Green Mountain revenue misses, shares...",negative


In [46]:
# Collect the distinct whitespace-separated tokens of the negative sentences,
# in first-seen order. Tokens are raw `str.split()` pieces, so punctuation and
# casing are preserved (e.g. 'weak', 'weak,' and 'Weak' are separate keys).
unique_tokens = dict.fromkeys(
    token
    for sentence in neg_df['Sentence']
    for token in sentence.split()
)

# Score each distinct token once (instead of once per occurrence) and keep
# those whose own compound score is -0.4 or lower.
neg_polarity_words = {}
for token in unique_tokens:
    polarity = sia.polarity_scores(token)['compound']
    if polarity <= -.4:
        neg_polarity_words[token] = polarity

neg_polarity_words

{'problem?': -0.4019,
 'gross': -0.4767,
 'brutal.': -0.6249,
 'lose': -0.4019,
 'dispute': -0.4019,
 'crisis,': -0.6249,
 'block': -0.4404,
 '187': -0.6249,
 'negative': -0.5719,
 'jeopardy': -0.4767,
 'disappointment': -0.5106,
 'infected': -0.4939,
 'casualty': -0.5267,
 'badly': -0.4767,
 'depressed': -0.5106,
 'Weak': -0.4404,
 'weak': -0.4404,
 'rejection.': -0.5423,
 'struggling': -0.4215,
 'violated': -0.5267,
 'die': -0.5994,
 'destroyed': -0.4939,
 'attack': -0.4767,
 'waste': -0.4215,
 'threatening': -0.5267,
 'stress': -0.4215,
 'burdened': -0.4019,
 'criticised': -0.4215,
 'dumps': -0.4019,
 'forced': -0.4588,
 'fears': -0.4215,
 'Broken': -0.4767,
 'problems': -0.4019,
 'Denies': -0.4215,
 'poorest': -0.5423,
 'cancer': -0.6597,
 'damage': -0.4939,
 'broke': -0.4215,
 ':(': -0.4404,
 'accident': -0.4767,
 'weak,': -0.4404,
 'losses': -0.4019,
 'damaged': -0.4404,
 'criticising': -0.4019,
 'cry': -0.4767,
 'threat': -0.5267,
 "avoidance'": -0.4019,
 'bad': -0.5423,
 'weake

In [57]:
# Tabulate the collected words, then order them from the score closest to
# zero down to the most negative one, with a fresh 0..n-1 index.
negative_df = pd.DataFrame(
    list(neg_polarity_words.items()),
    columns=['Word', 'Score'],
)
ndf_sorted = (
    negative_df
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)

ndf_sorted

Unnamed: 0,Word,Score
0,problem?,-0.4019
1,Losses,-0.4019
2,disappoint,-0.4019
3,problems,-0.4019
4,problem,-0.4019
...,...,...
124,dead,-0.6486
125,Evil,-0.6597
126,killing,-0.6597
127,Cancer,-0.6597


In [58]:
# Print the words in the first 20 rows of ndf_sorted, one per line.
# `.head(20)` simply yields fewer rows when the table is shorter than 20,
# whereas indexing labels 0..19 one by one would raise KeyError.
for word in ndf_sorted['Word'].head(20):
    print(word)

problem?
Losses
disappoint
problems
problem
dumps
burdened
crash
losses
criticising
avoidance'
hurting
trouble
boycotting
lose
dispute
reject
worries
Fears
fails


In [60]:
# Print the words in the last 20 rows of ndf_sorted, one per line.
# `.tail(20)` copes with tables shorter than 20 rows, where computing
# `len(ndf_sorted) - 20` would produce negative labels and a KeyError.
for word in ndf_sorted['Word'].tail(20):
    print(word)

crude
ruin
die
war
bullying
War
187
brutal.
worst
crisis
crisis,
disaster
Kia
doomed
hated,
dead
Evil
killing
Cancer
cancer


# Determining words with intense positive polarity within the data

In [61]:
# Keep only the sentences whose compound score is 0.4 or higher, and renumber
# the surviving rows from zero.
is_strongly_positive = scores_df['compound'] >= .4
pos_df = scores_df[is_strongly_positive].reset_index(drop=True)
pos_df

Unnamed: 0,neg,neu,pos,compound,id,Sentence,Sentiment
0,0.0,0.847,0.153,0.5423,0,The GeoSolutions technology will leverage Bene...,positive
1,0.0,0.896,0.104,0.4404,12,The subdivision made sales revenues last year ...,positive
2,0.0,0.647,0.353,0.5994,21,"Aviva, Friends Life top forecasts ahead of 5.6...",positive
3,0.0,0.804,0.196,0.5267,22,"In stead of being based on a soft drink , as i...",neutral
4,0.0,0.906,0.094,0.4588,28,The first installment of the Cinema Series con...,neutral
...,...,...,...,...,...,...,...
1520,0.0,0.927,0.073,0.4404,5827,"( ADPnews ) - Oct 21 , 2009 - Finland-based IT...",negative
1521,0.0,0.695,0.305,0.7096,5828,Since the registration of the shares subscribe...,neutral
1522,0.0,0.805,0.195,0.4404,5832,Operating profit fell to EUR 38.1 mn from EUR ...,negative
1523,0.0,0.824,0.176,0.4588,5840,"In the building and home improvement trade , s...",neutral


In [64]:
# Collect the distinct whitespace-separated tokens of the positive sentences,
# in first-seen order. Tokens are raw `str.split()` pieces, so punctuation and
# casing are preserved (e.g. 'great' and 'great,' are separate keys).
unique_tokens = dict.fromkeys(
    token
    for sentence in pos_df['Sentence']
    for token in sentence.split()
)

# Score each distinct token once (instead of once per occurrence) and keep
# those whose own compound score is 0.4 or higher.
pos_polarity_words = {}
for token in unique_tokens:
    polarity = sia.polarity_scores(token)['compound']
    if polarity >= .4:
        pos_polarity_words[token] = polarity

pos_polarity_words

{'powerful': 0.4215,
 'profits': 0.4404,
 'Friends': 0.4767,
 'awards': 0.4588,
 'welcome': 0.4588,
 'agreement': 0.4939,
 'pleased': 0.4404,
 'Profit': 0.4404,
 'profit': 0.4404,
 'pretty': 0.4939,
 'strengthened': 0.4215,
 'clearly': 0.4019,
 'improved': 0.4767,
 'DeLight': 0.5994,
 'favorable': 0.4767,
 'engaged': 0.4019,
 'kind': 0.5267,
 'treat': 0.4019,
 'perfect': 0.5719,
 'strengthening': 0.4939,
 'Loyal': 0.4767,
 'great': 0.6249,
 'good,': 0.4404,
 'great,': 0.6249,
 'good.': 0.4404,
 'confidence.': 0.5106,
 'glad': 0.4588,
 'loyal': 0.4767,
 'positive': 0.5574,
 'special': 0.4019,
 'nice': 0.4215,
 'good': 0.4404,
 'happy': 0.5719,
 'improvement': 0.4588,
 'thanks': 0.4404,
 'efficient': 0.4215,
 'successful': 0.5859,
 'gains!': 0.4003,
 'succeeded': 0.4215,
 'hand': 0.4939,
 'popular': 0.4215,
 'interest': 0.4588,
 'boost': 0.4019,
 'strong': 0.5106,
 'outstanding': 0.6124,
 'awarded': 0.4019,
 'honored': 0.5859,
 'supporting': 0.4404,
 'Positive': 0.5574,
 'Strongest': 0.4

In [66]:
# Tabulate the collected words, then order them from the lowest score to the
# highest one, with a fresh 0..n-1 index.
positive_df = pd.DataFrame(
    list(pos_polarity_words.items()),
    columns=['Word', 'Score'],
)
pdf_sorted = (
    positive_df
    .sort_values(by='Score', ascending=True)
    .reset_index(drop=True)
)

pdf_sorted

Unnamed: 0,Word,Score
0,gains!,0.4003
1,special,0.4019
2,Active,0.4019
3,justified,0.4019
4,warmly,0.4019
...,...,...
265,Love,0.6369
266,love,0.6369
267,best,0.6369
268,greatest,0.6369


In [67]:
# Print the words in the last 20 rows of pdf_sorted, one per line.
# `.tail(20)` copes with tables shorter than 20 rows, where computing
# `len(pdf_sorted) - 20` would produce negative labels and a KeyError.
for word in pdf_sorted['Word'].tail(20):
    print(word)

successful
winner
amazing
miracle.
amazing.
DeLight
SUPER
outstanding
excellently
great
Excellence
Great
great,
excellence
Best
Love
love
best
greatest
'best
