# Analysis of Reddit's Sentiment towards Organizations

In [7]:
import spacy
import pandas as pd

In [2]:
FILE_NAME = "reddit_investing.csv"

In [9]:
# download the small English spaCy pipeline (shell command run through IPython's `!`)
# and load it; `nlp` is the pipeline object used for entity extraction below
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [16]:
# read the dataset; fields in this file are separated by "|" rather than ","
df = pd.read_csv(FILE_NAME, sep="|")
# preview the first rows
df.head()

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio
0,1614290000.0,0.0,t3_lshtjn,10.0,Bloomberg article: [https://www.bloomberg.com/...,investing,Fed Views Rising Yields as Bullish Sign Reflec...,10.0,0.86
1,1614286000.0,0.0,t3_lsgahw,56.0,Given the recent downturn in stocks especially...,investing,ARK ETFs implosion risk ------------------------,56.0,0.83
2,1614283000.0,0.0,t3_lsf8td,1.0,[https://twitter.com/desogames/status/13649710...,investing,The Counter-Party Risk Bubble,1.0,0.53
3,1614282000.0,0.0,t3_lsf3nh,6.0,"When you think of futures, what comes to your ...",investing,Futures were made for days like these,6.0,0.62
4,1614278000.0,0.0,t3_lsdcib,3.0,I've been on this sub for quite some time and ...,investing,Let's talk about liquidity premiums,3.0,0.67


## Extract organization entities from each record

In [15]:
def get_orgs(txt):
    '''Extract the distinct ORG entities mentioned in a piece of text.

    Parameters
    ----------
    txt : str
        Raw text to run through the spaCy pipeline ``nlp``.

    Returns
    -------
    list of str
        Text of every entity labelled ``ORG``, without duplicates, in order
        of first appearance. Non-string input (e.g. the NaN pandas uses for
        an empty field) yields an empty list.
    '''
    # missing text has no entities; avoid handing a non-string to the pipeline
    if not isinstance(txt, str):
        return []
    doc = nlp(txt)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is the same on every run (iteration order of a set of strings is not)
    return list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == 'ORG'))

In [17]:
# apply the function to all rows in the DataFrame
df['organizations'] = df['selftext'].apply(get_orgs)

In [18]:
df.head()

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio,organizations
0,1614290000.0,0.0,t3_lshtjn,10.0,Bloomberg article: [https://www.bloomberg.com/...,investing,Fed Views Rising Yields as Bullish Sign Reflec...,10.0,0.86,"[the Atlanta Fed, Raphael Bostic, Bostic, Stro..."
1,1614286000.0,0.0,t3_lsgahw,56.0,Given the recent downturn in stocks especially...,investing,ARK ETFs implosion risk ------------------------,56.0,0.83,"[Bear, ARK]"
2,1614283000.0,0.0,t3_lsf8td,1.0,[https://twitter.com/desogames/status/13649710...,investing,The Counter-Party Risk Bubble,1.0,0.53,"[ITM, Citadel, RH]"
3,1614282000.0,0.0,t3_lsf3nh,6.0,"When you think of futures, what comes to your ...",investing,Futures were made for days like these,6.0,0.62,[NQ]
4,1614278000.0,0.0,t3_lsdcib,3.0,I've been on this sub for quite some time and ...,investing,Let's talk about liquidity premiums,3.0,0.67,[]


### Let's find out which organizations were most frequently mentioned

In [30]:
from collections import Counter

In [31]:
def frequency_orgs(organizations):
    '''Count in how many records each organization appears.

    Parameters
    ----------
    organizations : iterable of list of str
        One list of organization names per record, e.g. the
        ``df['organizations']`` column.

    Returns
    -------
    collections.Counter
        Mapping of organization name to the number of times it occurs
        across all the given lists.
    '''
    # count from the argument itself rather than from the global DataFrame,
    # so the function works on any collection of organization lists
    return Counter(org for orgs in organizations for org in orgs)

In [32]:
# let's get the 10 most frequently mentioned organizations
org_freq = frequency_orgs(df['organizations'])
org_freq.most_common(10)

[('EV', 48),
 ('ETF', 45),
 ('COVID', 41),
 ('Apple', 27),
 ('GME', 26),
 ('Amazon', 26),
 ('NYSE', 23),
 ('EPS', 23),
 ('SEC', 23),
 ('Tesla', 22)]

The list contains entries such as EV, ETF, COVID, EPS, NYSE and SEC, which are not the kind of organizations we're looking for. We can put them in a blacklist and remove them.

### Entity Blacklist

In [33]:
# lower-cased entity texts that were tagged as ORG but are not organizations
# we want to track; more may have to be added later on
blacklist = ['ev', 'etf', 'covid', 'nyse', 'eps', 'sec']

In [34]:
# let's modify the get_orgs function to remove blacklisted entities
def get_orgs(txt):
    '''Extract the distinct, non-blacklisted ORG entities mentioned in a text.

    Parameters
    ----------
    txt : str
        Raw text to run through the spaCy pipeline ``nlp``.

    Returns
    -------
    list of str
        Text of every entity labelled ``ORG`` whose lower-cased text is not
        in ``blacklist``, without duplicates, in order of first appearance.
        Non-string input (e.g. the NaN pandas uses for an empty field)
        yields an empty list.
    '''
    # missing text has no entities; avoid handing a non-string to the pipeline
    if not isinstance(txt, str):
        return []
    doc = nlp(txt)
    kept = (
        ent.text
        for ent in doc.ents
        # the blacklist holds lower-case entries, so compare case-insensitively
        if ent.label_ == 'ORG' and ent.text.lower() not in blacklist
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is the same on every run (iteration order of a set of strings is not)
    return list(dict.fromkeys(kept))

In [35]:
# apply the function to all rows in the DataFrame
df['organizations'] = df['selftext'].apply(get_orgs)

In [36]:
# let's get the 10 most frequently mentioned organizations
org_freq = frequency_orgs(df['organizations'])
org_freq.most_common(10)

[('Apple', 27),
 ('GME', 26),
 ('Amazon', 26),
 ('Tesla', 22),
 ('FDA', 19),
 ('NASDAQ', 17),
 ('CNBC', 16),
 ('Google', 15),
 ('TSLA', 15),
 ('EU', 15)]

## Let's get the sentiment of each organization

### First, get the sentiment of each row

In [38]:
try:
    import flair
except:
    !pip install flair
    import flair

Collecting flair
  Downloading flair-0.10-py3-none-any.whl (322 kB)
[K     |████████████████████████████████| 322 kB 5.1 MB/s 
[?25hCollecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[K     |████████████████████████████████| 788 kB 42.8 MB/s 
[?25hCollecting more-itertools~=8.8.0
  Downloading more_itertools-8.8.0-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 4.2 MB/s 
Collecting sentencepiece==0.1.95
  Downloading sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 38.5 MB/s 
[?25hCollecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting transformers>=4.0.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 42.8 MB/s 
[?25hCollecting deprecated>=1.2.4
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting gdown==3.12.2
  Downloading gdown-3.12.2.tar.gz (8.2 kB)
  In

In [39]:
# load the pre-trained 'en-sentiment' text classifier; the model file is
# downloaded and cached when it is not already present
model = flair.models.TextClassifier.load('en-sentiment')

2021-12-02 18:22:59,563 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmp4sigbps5


100%|██████████| 265512723/265512723 [00:13<00:00, 19215682.53B/s]

2021-12-02 18:23:13,730 copying /tmp/tmp4sigbps5 to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2021-12-02 18:23:14,732 removing temp file /tmp/tmp4sigbps5
2021-12-02 18:23:14,774 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [40]:
def get_sentiment(text):
    '''Classify `text` with the loaded flair `model` and return its first label.'''
    flair_sentence = flair.data.Sentence(text)
    # predict() attaches the predicted label(s) to the sentence in place
    model.predict(flair_sentence)
    return flair_sentence.labels[0]

In [41]:
# get sentiment for all rows
df['sentiment'] = df['selftext'].apply(get_sentiment)
df.head()

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio,organizations,sentiment
0,1614290000.0,0.0,t3_lshtjn,10.0,Bloomberg article: [https://www.bloomberg.com/...,investing,Fed Views Rising Yields as Bullish Sign Reflec...,10.0,0.86,"[the Atlanta Fed, Raphael Bostic, Bostic, Stro...",NEGATIVE (0.9916)
1,1614286000.0,0.0,t3_lsgahw,56.0,Given the recent downturn in stocks especially...,investing,ARK ETFs implosion risk ------------------------,56.0,0.83,"[Bear, ARK]",NEGATIVE (0.9975)
2,1614283000.0,0.0,t3_lsf8td,1.0,[https://twitter.com/desogames/status/13649710...,investing,The Counter-Party Risk Bubble,1.0,0.53,"[ITM, Citadel, RH]",NEGATIVE (0.9996)
3,1614282000.0,0.0,t3_lsf3nh,6.0,"When you think of futures, what comes to your ...",investing,Futures were made for days like these,6.0,0.62,[NQ],NEGATIVE (0.9999)
4,1614278000.0,0.0,t3_lsdcib,3.0,I've been on this sub for quite some time and ...,investing,Let's talk about liquidity premiums,3.0,0.67,[],NEGATIVE (0.9893)


In [53]:
# map each organization to the sentiment scores of the records that mention it,
# split by predicted category
sentiment = {}

for label, orgs in zip(df['sentiment'], df['organizations']):
    # record-level prediction: category name and its confidence score
    category, score = label.value, label.score
    # every organization in the record receives the record's score
    for org in orgs:
        # setdefault creates the empty entry the first time an organization is seen
        per_category = sentiment.setdefault(org, {'POSITIVE': [], 'NEGATIVE': []})
        per_category[category].append(score)

In [54]:
sentiment['Tesla']

{'NEGATIVE': [0.9996938705444336,
  0.9964684247970581,
  0.9997310042381287,
  0.9901899695396423,
  0.9991249442100525,
  0.9975064396858215,
  0.9982183575630188,
  0.9990321397781372,
  0.9620267748832703,
  0.9994062185287476,
  0.6798821687698364,
  0.9977540373802185,
  0.9998772144317627,
  0.9982232451438904,
  0.9976763129234314],
 'POSITIVE': [0.9905288815498352,
  0.9774701595306396,
  0.9964744448661804,
  0.9707769751548767,
  0.9869654178619385,
  0.8813272714614868,
  0.9996768236160278]}

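Let's calculate the total positive score, the total negative score, and the overall score for each organization. The overall score is the total positive score minus the total negative score, divided by the number of records that mention the organization.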

In [55]:
avg_sentiment = []

# summarize the scores of each organization
for org, by_category in sentiment.items():
    # totals are kept in local variables: writing them back into `sentiment`
    # would replace the score lists with floats and make this cell fail when
    # it is run a second time
    positive_total = sum(by_category['POSITIVE'], 0.0)
    negative_total = sum(by_category['NEGATIVE'], 0.0)
    # number of scores recorded for the organization across both categories
    num_occurrence = len(by_category['POSITIVE']) + len(by_category['NEGATIVE'])
    # append results to list
    avg_sentiment.append(
        {
            'organization': org,
            'positive': positive_total,
            'negative': negative_total,
            'num_occurrence': num_occurrence,
            # net score per occurrence: positive scores count for, negative against
            'score': (positive_total - negative_total) / num_occurrence,
        }
    )

In [57]:
avg_sentiment[0]

{'negative': 0.9916453957557678,
 'num_occurrence': 1,
 'organization': 'the Atlanta Fed',
 'positive': 0.0,
 'score': -0.9916453957557678}

In [58]:
# convert to df
sentiment_df = pd.DataFrame(avg_sentiment)

In [59]:
sentiment_df.head()

Unnamed: 0,organization,positive,negative,num_occurrence,score
0,the Atlanta Fed,0.0,0.991645,1,-0.991645
1,Raphael Bostic,0.0,0.991645,1,-0.991645
2,Bostic,0.0,0.991645,1,-0.991645
3,Strong Rebound \n&gt,0.0,0.991645,1,-0.991645
4,Bullard,0.0,0.991645,1,-0.991645


In [60]:
# let's sort the dataframe based on best scores
sentiment_df.sort_values('score', ascending=False).head(10)

Unnamed: 0,organization,positive,negative,num_occurrence,score
948,yahoo,0.999867,0.0,1,0.999867
2497,Stochastic,0.99983,0.0,1,0.99983
2493,UNIT,0.99983,0.0,1,0.99983
2498,Vortex,0.99983,0.0,1,0.99983
2496,https://tos.mx/VPuILrE](https://tos.mx/VPuILrE,0.99983,0.0,1,0.99983
2495,VVNT,0.99983,0.0,1,0.99983
2494,NUGT,0.99983,0.0,1,0.99983
2274,OpenFiber,0.99976,0.0,1,0.99976
2273,Telecom Italia,0.99976,0.0,1,0.99976
2276,FTTH,0.99976,0.0,1,0.99976


In [61]:
# keep only organizations with at least 4 occurrences (drop those that appear fewer than 4 times)
sentiment_df = sentiment_df[sentiment_df['num_occurrence'] >= 4]

In [62]:
sentiment_df.sort_values('score', ascending=False).head(10)

Unnamed: 0,organization,positive,negative,num_occurrence,score
2457,Samsung,3.466472,0.0,4,0.866618
1371,Coronavirus,3.152049,0.0,4,0.788012
1464,IBM,2.965104,0.88397,4,0.520283
533,Company,3.487737,0.999457,5,0.497656
358,TAM,6.161797,1.88031,9,0.475721
2199,Sony,4.888052,1.970413,7,0.416805
531,Intel,4.226026,1.95394,7,0.324584
218,Google,7.704343,5.516068,15,0.145885
1562,Yahoo Finance,2.310286,1.608116,5,0.140434
1012,Verizon,3.618952,2.80027,7,0.116955
