# Sentiment Analysis (textacy)

Textacy Notebook

## Import Packages

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as notebook
import textacy
from collections import defaultdict
import re
from sklearn.feature_extraction import DictVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# scale up seaborn's default font sizes for any plots drawn in this notebook
sns.set(font_scale=1.5)
# render inline matplotlib figures in the 'retina' (high-resolution) format
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## Load the GGP data

In [2]:
# location of the scraped comments, relative to the notebook's working directory
SCRAPE_CSV = '../../../GA/Capstone/scrapes/large_scrape_29-Mar-22_.csv'
df = pd.read_csv(SCRAPE_CSV)

In [3]:
# preview the first rows to confirm the columns loaded as expected
df.head()

Unnamed: 0,date,user,title,comment,opinion,price,comment_clean
0,7 Dec '21,hydrogen,RE: Hydro,Yes Sir.,No Opinion,14.05,yes sir
1,7 Dec '21,Sycho,Hydro,Lets keep your 12:35 post to ourselves and kee...,No Opinion,14.05,lets keep your post ourselves and keep realyou...
2,7 Dec '21,TomE,GGPSP,"So what happened to the ""Expect a sudden jump ...",No Opinion,14.05,what happened the expect sudden jump price its...
3,7 Dec '21,Sycho,RE: Personally in my opinion,"LOL, now that's more like it.",No Opinion,14.05,lol now thats more like
4,7 Dec '21,Shady69,RE: Personally in my opinion,Hydro it's so well put it could bring a tear t...,No Opinion,14.05,hydro its well put could bring tear glass eye ...


In [4]:
# show every row that has a null in at least one column
df[df.isnull().any(axis=1)]

Unnamed: 0,date,user,title,comment,opinion,price,comment_clean
219,5 Dec '21,mickey1122,RE: WOH !,https://twitter.com/trader****ney/status/14676...,No Opinion,14.15,
220,5 Dec '21,mickey1122,WOH !,https://twitter.com/trader****ney/status/14676...,No Opinion,14.15,
312,4 Dec '21,TallChapJG,Gold shortage,https://kingworldnews.com/alert-we-are-now-see...,No Opinion,14.15,
360,3 Dec '21,Bamps21,RE: AMEC AWARDS,https://twitter.com/kristiebatten/status/14663...,Strong Buy,14.50,
375,3 Dec '21,TimberTrader,RE: Greatland Gold plc 38.5% potential upside ...,https://www.***************************/greatl...,No Opinion,14.60,
...,...,...,...,...,...,...,...
132348,27 Jan '22,Bamps21,Drill results,https://www.google.co.uk/search?q=jan+22+newcr...,Strong Buy,13.70,
132388,27 Jan '22,spoon_key,:),https://www.youtube.com/watch?v=z5OXON8vIaA,No Opinion,13.70,
132508,27 Jan '22,geejay13,RE: Live price please as stuck at work,1368,No Opinion,13.80,
132549,27 Jan '22,Hopefullygold,RE: Pivot point,https://globalarbitrationreview.com/guide/the-...,Strong Buy,13.90,


In [5]:
# drop every row that contains a null in any column; the original index labels
# are kept (no reset_index), so later label lookups still refer to the same rows
df.dropna(inplace=True)

## Pre-Processing

In [44]:
# create function to process comments, removing stopwords, punctuation and filters on word type tags
# function must take a list of strings (comments) as input
# output processed comment and tokenised comment

def process_comments(comments, nlp=None, keep_pos=('NOUN', 'ADJ', 'VERB', 'ADV'), batch_size=200):
    """Reduce each comment to its non-stop-word content tokens.

    Parameters
    ----------
    comments : iterable of str
        Raw (already cleaned) comment texts.
    nlp : optional
        Object exposing ``pipe(texts, batch_size=...)`` that yields one
        iterable of tokens per text. When omitted, the ``en_core_web_sm``
        pipeline is loaded through textacy, as before.
    keep_pos : iterable of str, optional
        ``token.pos_`` values to keep. The default whitelist contains no
        punctuation tag, so punctuation is dropped without a separate check.
    batch_size : int, optional
        Batch size forwarded to ``nlp.pipe``.

    Returns
    -------
    (list of str, list of list)
        For every comment, in input order: the kept tokens joined by single
        spaces, and the list of kept token objects themselves.
    """
    if nlp is None:
        nlp = textacy.load_spacy_lang('en_core_web_sm')

    keep_pos = frozenset(keep_pos)
    processed_words = []
    tokenised_words = []

    for doc in nlp.pipe(comments, batch_size=batch_size):
        tokens = [token
                  for token in doc
                  if not token.is_stop and token.pos_ in keep_pos]

        # join + strip gives the same text as appending "<token> " repeatedly
        # and stripping the result, without rebuilding the string each time
        processed_words.append(' '.join(str(token) for token in tokens).strip())
        tokenised_words.append(tokens)

    return processed_words, tokenised_words

In [63]:
# example: run the function on two comments (positions 10 and 11)
process_comments(df['comment_clean'].iloc[10:12])

(['boughtbought value', 'lies challenged right based think'],
 [[boughtbought, value], [lies, challenged, right, based, think]])

In [45]:
# run function on every cleaned comment; returns two lists in the same order
# as the rows of df (joined text, and the kept token objects)

processed_comment, tokenised_comment = process_comments(df['comment_clean'])

In [47]:
# put results into separate columns in dataframe
# (plain lists are assigned by position, so they line up with the rows in the
# same order the comments were passed to process_comments)

df['processed_comment'] = processed_comment
df['tokenised_comment'] = tokenised_comment

In [49]:
# check the two new columns against the cleaned comment they came from
df.head(3)

Unnamed: 0,date,user,title,comment,opinion,price,comment_clean,processed_comment,tokenised_comment
0,7 Dec '21,hydrogen,RE: Hydro,Yes Sir.,No Opinion,14.05,yes sir,sir,[sir]
1,7 Dec '21,Sycho,Hydro,Lets keep your 12:35 post to ourselves and kee...,No Opinion,14.05,lets keep your post ourselves and keep realyou...,lets post realyour knowledge research good kno...,"[lets, post, realyour, knowledge, research, go..."
2,7 Dec '21,TomE,GGPSP,"So what happened to the ""Expect a sudden jump ...",No Opinion,14.05,what happened the expect sudden jump price its...,happened expect sudden jump price fallen start...,"[happened, expect, sudden, jump, price, fallen..."


## Sentiment Analysis using Sentiment Words csv

In [65]:
# load sentiment words .csv file: one row per (part of speech, word) pair with
# a positive and a negative score
sen = pd.read_csv('resources/sentiment_words.csv')
sen.head(2)

Unnamed: 0,pos,word,pos_score,neg_score
0,adj,.22-caliber,0.0,0.0
1,adj,.22-calibre,0.0,0.0


In [66]:
# upper-case the part-of-speech labels so they match the token.pos_ values
# ('NOUN', 'ADJ', 'VERB', 'ADV') that the tokens are filtered and looked up by;
# the vectorised .str accessor also leaves missing values untouched instead of raising
sen['pos'] = sen['pos'].str.upper()
sen['pos'].unique()

array(['ADJ', 'NOUN', 'ADV', 'VERB'], dtype=object)

In [56]:
# nested lookup: sen_dict[POS tag][word] -> {'pos_score': ..., 'neg_score': ...}
sen_dict = defaultdict(dict)

# walk the four relevant columns in parallel and file each word under its POS tag
for tag, word, positive, negative in zip(sen['pos'], sen['word'], sen['pos_score'], sen['neg_score']):
    sen_dict[tag][word] = {'pos_score': positive, 'neg_score': negative}

In [58]:
# create function to take tokenised comment and derive an average positive and negative sentiment score

def scorer(tokens, lexicon=None):
    """Average the lexicon's positive and negative scores over a comment's tokens.

    Parameters
    ----------
    tokens : iterable
        Token objects exposing ``pos_`` and ``lemma_`` attributes.
    lexicon : mapping, optional
        Nested mapping ``lexicon[pos][lemma] -> {'pos_score': ..., 'neg_score': ...}``.
        Defaults to the notebook-level ``sen_dict``.

    Returns
    -------
    list
        ``[mean positive score, mean negative score]``. Tokens that are not in
        the lexicon are ignored; if none are found both means are 0.0.
    """
    if lexicon is None:
        lexicon = sen_dict

    pos_scores = []
    neg_scores = []

    for token in tokens:
        # .get() skips unknown tags/lemmas explicitly: nothing is swallowed by a
        # bare except, and a defaultdict lexicon is not filled with empty
        # entries for every unseen POS tag
        entry = lexicon.get(token.pos_, {}).get(token.lemma_)
        if entry is None:
            continue
        pos_scores.append(entry['pos_score'])
        neg_scores.append(entry['neg_score'])

    # set default value if no token found
    return [np.mean(pos_scores or [0.]), np.mean(neg_scores or [0.])]

In [60]:
# apply scorer function to tokenised comment column; each element of the
# result is the two-item list [mean positive, mean negative] for that comment

scores = df['tokenised_comment'].map(scorer)

In [61]:
# populate dataframe columns with average pos and neg scores
# (scorer returns [mean positive, mean negative], hence items 0 and 1)

df['pos_score'] = scores.map(lambda x: x[0])
df['neg_score'] = scores.map(lambda x: x[1])

In [84]:
# final sentiment measure: average positive score minus average negative score
# (above zero leans positive, below zero leans negative)

df['pos-neg_score'] = df['pos_score'].sub(df['neg_score'])

In [86]:
# inspect the rows ordered from highest to lowest average positive score
df.sort_values('pos_score', ascending=False)

Unnamed: 0,date,user,title,comment,opinion,price,comment_clean,processed_comment,tokenised_comment,pos_score,neg_score,pos-neg_score
128114,18 Feb '22,JONNO100,RE: Take comfort,Zoros A VERY excellent post ATB Jonno,No Opinion,13.750,zoros very excellent post atb jonno,zoros excellent,"[zoros, excellent]",1.0,0.0,1.0
11950,7 Oct '21,GoldenI,RE: Nice to see a bit of levity,"19, excellent",No Opinion,18.150,excellent,excellent,[excellent],1.0,0.0,1.0
16000,9 Sep '21,Lyndon69,Mr burns quote,âââââEXCELLENTââââ,No Opinion,18.800,excellent,excellent,[excellent],1.0,0.0,1.0
78199,4 Nov '20,Soundman1,RE: Havieron to restore Newcrest's Telfer grea...,Excellent,No Opinion,21.375,excellent,excellent,[excellent],1.0,0.0,1.0
112759,27 Jun '20,mickey1122,RE: GGPHelp.co.uk FAQ,Excellent BR,No Opinion,12.000,excellent,excellent,[excellent],1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
123837,19 Mar '22,Aurora1,RE: New To Board,More GGP,No Opinion,15.740,more ggp,ggp,[ggp],0.0,0.0,0.0
71950,1 Dec '20,Stout,RE: GGPSP,MM's hunting for sellers!,Strong Buy,27.250,mms hunting for sellers,mms hunting sellers,"[mms, hunting, sellers]",0.0,0.0,0.0
71935,1 Dec '20,seen_it_done_it,RE: A blue finish is quite possible...,Yay!! hooray!!!!,No Opinion,27.250,yay hooray,,[],0.0,0.0,0.0
71929,1 Dec '20,Bancal,RE: Was yesterday’s RNS the first revelation o...,Thanks GGPThruandtru,No Opinion,27.500,thanks ggpthruandtru,thanks,[thanks],0.0,0.0,0.0


In [80]:
# print the 10 comments with the highest average positive score

top_positive = df.sort_values('pos_score', ascending=False)['comment_clean'].head(10)
for comment in top_positive:
    print(comment)
    print('============================================================\n')

zoros very excellent post atb jonno

excellent

excellent

excellent

excellent

excellent reposte jbgla

here you are none researchers have

this awesome

yes that was awesome

you were lucky bargin well done



In [83]:
# print the 10 comments with the highest average negative score

top_negative = df.sort_values('neg_score', ascending=False)['comment_clean'].head(10)
for comment in top_negative:
    print(comment)
    print('============================================================\n')

indeed doggy and could get very messy lol

bads hithanks very much

abusive

unlucky unfortunately

stinks

outrageous

what are you worried about

stinks

the latter but have been chastised for askingsaying this since january

incorrect



In [85]:
# print the 10 comments with the highest combined (positive - negative) score

top_sentiment = df.sort_values('pos-neg_score', ascending=False)['comment_clean'].head(10)
for comment in top_sentiment:
    print(comment)
    print('============================================================\n')

excellent

zoros very excellent post atb jonno

excellent

excellent

excellent

excellent reposte jbgla

here you are none researchers have

you lucky lucky bggers

you were lucky bargin well done

your lucky



Observation: The results are skewed by one-word comments. Filtering out shorter comments may yield better results.

In [87]:
# comment length = number of tokens kept after pre-processing

df['comment_length'] = df['tokenised_comment'].map(len)

In [90]:
# keep only comments with more than 10 processed tokens
# (comment_length counts the tokens left after pre-processing, not raw words,
# and the comparison is strict, so comments of exactly 10 tokens are excluded)

df_filtered = df[df.comment_length > 10]

In [95]:
# print the 10 highest combined-score comments among the longer comments

top_filtered = df_filtered.sort_values('pos-neg_score', ascending=False)['comment_clean'].head(10)
for comment in top_filtered:
    print(comment)
    print('============================================================\n')

buy buy buy buy buy buy buy buy buy buy buy buy buy goodbye sorters dont cryfor those that dont remember mary hopkins lol atb

many was thinking another hav discovery scally etc would transformational for ggp when all along hav its own think ghs superlative amazing just perfect happy days gla

this has been example good manners good banter and excellent research thanks allmerry christmas all the ggp family and happy prosperous safe and healthy new yearcheers stellabob

are totally agree our assets are the best the business the moment mms are taking the but will come good time

bamps you deserve all the thanks you receive mate excellent poster whos posts always enjoy looking out for

patience koffee will see you holding very good investment indeedfundamentals are course excellent and augurs wellgood luckviking

excellent was the one trying stop her putting more lol wish hadnt told her more cautious now good luck all mums out there

beautiful hydro just beautiful all know how important f

In [94]:
# print the 10 lowest combined-score comments among the longer comments

bottom_filtered = df_filtered.sort_values('pos-neg_score', ascending=True)['comment_clean'].head(10)
for comment in bottom_filtered:
    print(comment)
    print('============================================================\n')

exactly ttb they are clearly nasty gloating individuals who wants like thatwaste space and should just reported for the disruptive gloating attempts

tymers you have lot anger life cant that bad can laughter and light hearted banter cant bad thing

long term shareholder very disappointed you lot these emails are embarrassing and factually incorrect what were you thinking dreadful just dreadful

marky you deal incorrect statements guestimates doom misleading investorstruth something which not associated withhave nice dayviking

banging your head against brick wall unfortunately redirons with the low investor mentality means blame anyone they can its pathetic stupid naive yet understandable degree but inevitable embarrassing unfortunately

tymers need for your abusive post then making excuses and have walk need for another one your abusive replies thankstime log off the grumpies are awakesee tomorrow call that mondayatbtom

sorry pain can someone please provide link ggp telegram having p

### Using Vader Sentiment Analyzer

In [97]:
# instantiate the Vader Sentiment Analyzer once; the same instance is reused
# for every comment scored below

vader = SentimentIntensityAnalyzer()

VADER (Valence Aware Dictionary and sEntiment Reasoner), an open-source tool released under the MIT License, is a self-contained, lexicon- and rule-based sentiment analyser that is "specifically attuned to sentiments expressed in social media". The library's built-in functions simplify the steps used above by taking a processed bit of text as input and outputting negative, neutral, positive and compound scores as a dictionary object.

Reference https://github.com/cjhutto/vaderSentiment

- The pos, neu, and neg scores are ratios for proportions of text that fall in each category (so these should all add up to be 1... or close to it with float operation).

- The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive).

The GitHub README also gives the following thresholds, which can be used for classification:
- positive sentiment: compound score >= 0.05
- neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
- negative sentiment: compound score <= -0.05

In [116]:
# look up one example comment by its index label (fixed, so the output is reproducible)
i = 105428
example = df.loc[i]
print('Cleaned Comment:')
print(example['comment_clean'], '\n')
print('Processed Comment:')
print(example['processed_comment'], '\n')

# and score the processed text with VADER:
print('VADER Polarity Scores:')
print(vader.polarity_scores(example['processed_comment']))

Cleaned Comment:
elise yes know what you mean and agree but just ignore them and try not let upset you 

Processed Comment:
elise know mean agree ignore try let upset 

VADER Polarity Scores:
{'neg': 0.405, 'neu': 0.397, 'pos': 0.198, 'compound': -0.3818}


In [130]:
# create a series holding, for each processed comment, the dictionary of
# scores returned by vader.polarity_scores

vader_scores = df['processed_comment'].map(vader.polarity_scores)

In [140]:
# fit to dictionary vectorizer
dvec = DictVectorizer()

# NOTE: this rebinds vader_scores from the per-comment score dicts to the
# vectorizer's matrix output (one column per dict key, in dvec.feature_names_
# order), so this cell should not be re-run on its own: a second run would
# feed the matrix, not the dicts, back into fit_transform.
vader_scores = dvec.fit_transform(vader_scores)

In [147]:
# densify the vectorised scores once, then copy each feature column into the
# main dataframe under a 'vader_' prefix

dense_scores = vader_scores.toarray()
for position, feature in enumerate(dvec.feature_names_):
    df['vader_{}'.format(feature)] = dense_scores[:, position]

In [150]:
# confirm the four vader_* columns were added alongside the lexicon scores
df.head(2)

Unnamed: 0,date,user,title,comment,opinion,price,comment_clean,processed_comment,tokenised_comment,pos_score,neg_score,pos-neg_score,comment_length,vader_compound,vader_neg,vader_neu,vader_pos
0,7 Dec '21,hydrogen,RE: Hydro,Yes Sir.,No Opinion,14.05,yes sir,sir,[sir],0.0,0.0,0.0,1,0.0,0.0,1.0,0.0
1,7 Dec '21,Sycho,Hydro,Lets keep your 12:35 post to ourselves and kee...,No Opinion,14.05,lets keep your post ourselves and keep realyou...,lets post realyour knowledge research good kno...,"[lets, post, realyour, knowledge, research, go...",0.175012,0.036076,0.138936,21,0.926,0.067,0.426,0.508


### Output to csv...

In [149]:
# df.to_csv('../../../GA/Capstone/scrapes/sentiment_scores.csv', index=False)