# Sentiment Analysis of Financial Text

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import nltk

import matplotlib.pyplot as plt
plt.style.use('ggplot')

df = pd.read_csv(Path('input/fin-sent-data.csv'))
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [2]:
# General understanding of data distributino
df["Sentiment"].value_counts()

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64

# Creating a dataframe providing polarity scores for sentences

In [3]:
#nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

In [4]:
# Run the polarity score on the entire dataset
df_length = len(df)
result = {}

for i in range(0, df_length):
    sentence = df['Sentence'][i]
    result[i] = sia.polarity_scores(sentence)

#adding scores to a dataframe (tranposing so it can be read properly)
scores_df = pd.DataFrame(result).T
scores_df

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.847,0.153,0.5423
1,0.167,0.833,0.000,-0.2023
2,0.064,0.856,0.080,0.1531
3,0.000,1.000,0.000,0.0000
4,0.000,1.000,0.000,0.0000
...,...,...,...,...
5837,0.239,0.761,0.000,-0.5267
5838,0.000,1.000,0.000,0.0000
5839,0.000,0.870,0.130,0.2023
5840,0.000,0.824,0.176,0.4588


In [5]:
#adding a mergable column
scores_df['id'] = range(0, len(df))
df['id'] = range(0, len(df))
scores_df = scores_df.merge(df, how='left')
scores_df

Unnamed: 0,neg,neu,pos,compound,id,Sentence,Sentiment
0,0.000,0.847,0.153,0.5423,0,The GeoSolutions technology will leverage Bene...,positive
1,0.167,0.833,0.000,-0.2023,1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,0.064,0.856,0.080,0.1531,2,"For the last quarter of 2010 , Componenta 's n...",positive
3,0.000,1.000,0.000,0.0000,3,According to the Finnish-Russian Chamber of Co...,neutral
4,0.000,1.000,0.000,0.0000,4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...,...,...,...,...,...
5837,0.239,0.761,0.000,-0.5267,5837,RISING costs have forced packaging producer Hu...,negative
5838,0.000,1.000,0.000,0.0000,5838,Nordic Walking was first used as a summer trai...,neutral
5839,0.000,0.870,0.130,0.2023,5839,"According shipping company Viking Line , the E...",neutral
5840,0.000,0.824,0.176,0.4588,5840,"In the building and home improvement trade , s...",neutral


# Determining potential subjects of the sentence

In [6]:
all_nouns = {}

for i in range(0, len(scores_df)):
    sentence = scores_df['Sentence'][i]
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    
    nouns = []
    for token, pos in tagged:
        if pos.startswith('NN'):  # Check if POS tag starts with 'NN'
            nouns.append(token)
    all_nouns[i] = nouns

In [7]:
nouns_df = pd.DataFrame(list(all_nouns.items()), columns=['id', 'Subjects'])
scores_df = scores_df.merge(nouns_df, how='left')
scores_df

Unnamed: 0,neg,neu,pos,compound,id,Sentence,Sentiment,Subjects
0,0.000,0.847,0.153,0.5423,0,The GeoSolutions technology will leverage Bene...,positive,"[GeoSolutions, technology, Benefon, GPS, solut..."
1,0.167,0.833,0.000,-0.2023,1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,"[ESI, lows, BK, possibility]"
2,0.064,0.856,0.080,0.1531,2,"For the last quarter of 2010 , Componenta 's n...",positive,"[quarter, Componenta, sales, EUR131m, EUR76m, ..."
3,0.000,1.000,0.000,0.0000,3,According to the Finnish-Russian Chamber of Co...,neutral,"[Chamber, Commerce, construction, companies, F..."
4,0.000,1.000,0.000,0.0000,4,The Swedish buyout firm has sold its remaining...,neutral,"[buyout, firm, stake, months, company, public,..."
...,...,...,...,...,...,...,...,...
5837,0.239,0.761,0.000,-0.5267,5837,RISING costs have forced packaging producer Hu...,negative,"[RISING, costs, packaging, producer, Huhtamaki..."
5838,0.000,1.000,0.000,0.0000,5838,Nordic Walking was first used as a summer trai...,neutral,"[Nordic, Walking, summer, training, method, cr..."
5839,0.000,0.870,0.130,0.2023,5839,"According shipping company Viking Line , the E...",neutral,"[company, Viking, Line, EU, decision, impact]"
5840,0.000,0.824,0.176,0.4588,5840,"In the building and home improvement trade , s...",neutral,"[building, home, improvement, trade, sales, %,..."


# Determining words with intense negative polarity within the data

In [8]:
#exlusively looking at polarities below -.4 to find the negatively intense words
neg_df = scores_df.loc[scores_df['compound'] <= -.4, :]
neg_df = neg_df.reset_index(drop=True)
neg_df

Unnamed: 0,neg,neu,pos,compound,id,Sentence,Sentiment,Subjects
0,0.231,0.655,0.114,-0.4019,11,$SAP Q1 disappoints as #software licenses down...,negative,"[SAP, Q1, disappoints, software, problem, Clou..."
1,0.147,0.853,0.000,-0.4767,27,"The five-storey , eco-efficient building will ...",neutral,"[building, floor, area, sq, m., apartments]"
2,0.319,0.563,0.117,-0.6486,39,$AAPL afternoon selloff as usual will be bruta...,negative,"[AAPL, afternoon, selloff, ton, money]"
3,0.097,0.903,0.000,-0.4019,41,Dolce & Gabbana has asked the European Union t...,negative,"[Dolce, Gabbana, European, Union, Marimekko, C..."
4,0.408,0.592,0.000,-0.7351,45,L&G still paying price for dividend cut during...,negative,"[L, G, price, dividend, cut, crisis, chief]"
...,...,...,...,...,...,...,...,...
300,0.220,0.780,0.000,-0.4767,5732,$NIHD insiders got this one wrong. Looking for...,negative,"[NIHD, insiders, wrong, bottom, Rsi]"
301,0.150,0.850,0.000,-0.4389,5778,Still short $LNG from $11.70 area...next stop ...,negative,"[LNG, area, stop, Someone, shs]"
302,0.311,0.547,0.142,-0.4754,5801,"At this growth rate , paying off the national ...",negative,"[growth, rate, debt]"
303,0.320,0.593,0.087,-0.7096,5802,"Reuters: Green Mountain revenue misses, shares...",negative,"[Reuters, Green, Mountain, revenue, misses, sh..."


In [9]:
neg_polarity_words = {}

for i in range(0, len(neg_df)):
    sentence = neg_df['Sentence'][i]
    words = sentence.split() #tokenizing
    
    for word in words:
        word_polarity = sia.polarity_scores(word)
        polarity = word_polarity['compound'] 

        if(polarity <= -.4):
            neg_polarity_words[word] = polarity
            


In [10]:
negative_df = pd.DataFrame(list(neg_polarity_words.items()), columns=['Word', 'Score'])
ndf_sorted = negative_df.sort_values(by='Score', ascending=False)
ndf_sorted = ndf_sorted.reset_index(drop=True)

ndf_sorted

Unnamed: 0,Word,Score
0,problem?,-0.4019
1,Losses,-0.4019
2,disappoint,-0.4019
3,problems,-0.4019
4,problem,-0.4019
...,...,...
124,dead,-0.6486
125,Evil,-0.6597
126,killing,-0.6597
127,Cancer,-0.6597


In [11]:
#phrases with relatively weaker polarity
for i in range(0,20):
    print(ndf_sorted['Word'][i])

problem?
Losses
disappoint
problems
problem
dumps
burdened
crash
losses
criticising
avoidance'
hurting
trouble
boycotting
lose
dispute
reject
worries
Fears
fails


In [12]:
#phrases with relatively Stronger polarity
for i in range(len(ndf_sorted)-20,len(ndf_sorted)):
    print(ndf_sorted['Word'][i])

crude
ruin
die
war
bullying
War
187
brutal.
worst
crisis
crisis,
disaster
Kia
doomed
hated,
dead
Evil
killing
Cancer
cancer


# Determining words with intense Positive polarity within the data

In [13]:
#exlusively looking at polarities above .4 to find the positively intense words
pos_df = scores_df.loc[scores_df['compound'] >= .4, :]
pos_df = pos_df.reset_index(drop=True)
pos_df

Unnamed: 0,neg,neu,pos,compound,id,Sentence,Sentiment,Subjects
0,0.0,0.847,0.153,0.5423,0,The GeoSolutions technology will leverage Bene...,positive,"[GeoSolutions, technology, Benefon, GPS, solut..."
1,0.0,0.896,0.104,0.4404,12,The subdivision made sales revenues last year ...,positive,"[subdivision, sales, revenues, year, EUR, EUR,..."
2,0.0,0.647,0.353,0.5994,21,"Aviva, Friends Life top forecasts ahead of 5.6...",positive,"[Aviva, Friends, Life, forecasts, pound, merger]"
3,0.0,0.804,0.196,0.5267,22,"In stead of being based on a soft drink , as i...",neutral,"[stead, drink, Teho, energy, drink, water]"
4,0.0,0.906,0.094,0.4588,28,The first installment of the Cinema Series con...,neutral,"[installment, Cinema, Series, profile, invento..."
...,...,...,...,...,...,...,...,...
1520,0.0,0.927,0.073,0.4404,5827,"( ADPnews ) - Oct 21 , 2009 - Finland-based IT...",negative,"[ADPnews, IT, consultancy, Tieto, Oyj, HEL, TI..."
1521,0.0,0.695,0.305,0.7096,5828,Since the registration of the shares subscribe...,neutral,"[registration, shares, share, issue, number, P..."
1522,0.0,0.805,0.195,0.4404,5832,Operating profit fell to EUR 38.1 mn from EUR ...,negative,"[Operating, profit, EUR, mn, EUR, mn]"
1523,0.0,0.824,0.176,0.4588,5840,"In the building and home improvement trade , s...",neutral,"[building, home, improvement, trade, sales, %,..."


In [14]:
pos_polarity_words = {}

for i in range(0, len(pos_df)):
    sentence = pos_df['Sentence'][i]
    words = sentence.split() #tokenizing
    
    for word in words:
        word_polarity = sia.polarity_scores(word)
        polarity = word_polarity['compound'] 
        if(polarity >= .4): 
            pos_polarity_words[word] = polarity
            


In [15]:
positive_df = pd.DataFrame(list(pos_polarity_words.items()), columns=['Word', 'Score'])
pdf_sorted = positive_df.sort_values(by='Score', ascending=True)
pdf_sorted = pdf_sorted.reset_index(drop=True)

pdf_sorted

Unnamed: 0,Word,Score
0,gains!,0.4003
1,special,0.4019
2,Active,0.4019
3,justified,0.4019
4,warmly,0.4019
...,...,...
265,Love,0.6369
266,love,0.6369
267,best,0.6369
268,greatest,0.6369


In [16]:
#phrases with relatively Stronger polarity
for i in range(len(pdf_sorted)-20,len(pdf_sorted)):
    print(pdf_sorted['Word'][i])

successful
winner
amazing
miracle.
amazing.
DeLight
SUPER
outstanding
excellently
great
Excellence
Great
great,
excellence
Best
Love
love
best
greatest
'best


In [17]:
#phrases with relatively weaker polarity
for i in range(0,20):
    print(pdf_sorted['Word'][i])

gains!
special
Active
justified
warmly
boost
awarded
promising.
wish
approves
liking
parties
support.
values
support
secured
Special
support,
Support
clarity
