In [864]:
# Importing important modules and packages

import pandas as pd
import json
import numpy as np
import re
import warnings
from flair.models import TextClassifier
from flair.data import Sentence
import plotly.express as px
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
warnings.filterwarnings("ignore")
tqdm_notebook.pandas()

<b>1. Data Collection</b>

In [751]:
# read data from json file
f = open('result.json', encoding='utf8')
data = json.load(f)

In [752]:
# load the messages into a data frame
msgs = data['messages']
df = pd.DataFrame(msgs)

In [753]:
df.head()

Unnamed: 0,id,type,date,from,from_id,text,reply_to_message_id,forwarded_from,actor,actor_id,...,message_id,file,thumbnail,media_type,sticker_emoji,contact_information,contact_vcard,mime_type,duration_seconds,via_bot
0,1903819,message,2021-05-01T00:00:12,,user1650688285,hi,,,,,...,,,,,,,,,,
1,1903842,message,2021-05-01T00:01:13,,user1650688285,do indicators work?,,,,,...,,,,,,,,,,
2,1903855,message,2021-05-01T00:01:50,Social Ch4in,user484605980,If you trade true gbp for gbp Fiat or vice Ver...,,,,,...,,,,,,,,,,
3,1903856,message,2021-05-01T00:02:05,Social Ch4in,user484605980,And how much is that fee?,,,,,...,,,,,,,,,,
4,1903857,message,2021-05-01T00:02:12,Social Ch4in,user484605980,If you’re a silver card holder,,,,,...,,,,,,,,,,


In [754]:
df.shape

(49436, 26)

- Number of text message: 49436
- Number of attributes/columns: 26

In [861]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49436 entries, 0 to 49435
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   49436 non-null  int64  
 1   type                 49436 non-null  object 
 2   date                 49436 non-null  object 
 3   from                 44854 non-null  object 
 4   from_id              49369 non-null  object 
 5   text                 49436 non-null  object 
 6   reply_to_message_id  24570 non-null  float64
 7   forwarded_from       155 non-null    object 
 8   actor                62 non-null     object 
 9   actor_id             67 non-null     object 
 10  action               67 non-null     object 
 11  members              43 non-null     object 
 12  edited               781 non-null    object 
 13  photo                91 non-null     object 
 14  width                141 non-null    float64
 15  height               141 non-null   

<b>2. Data Preprocessing</b>

In [755]:
# filter text and date column from the original dataframe
df_text = df.filter(items=['text', 'date'])

In [756]:
df_text.head()

Unnamed: 0,text,date
0,hi,2021-05-01T00:00:12
1,do indicators work?,2021-05-01T00:01:13
2,If you trade true gbp for gbp Fiat or vice Ver...,2021-05-01T00:01:50
3,And how much is that fee?,2021-05-01T00:02:05
4,If you’re a silver card holder,2021-05-01T00:02:12


In [757]:
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49436 entries, 0 to 49435
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    49436 non-null  object
 1   date    49436 non-null  object
dtypes: object(2)
memory usage: 772.6+ KB


In [758]:
# checking if the df has any null values
df_text.isnull().sum()

text    0
date    0
dtype: int64

In [759]:
# standardize datetime column
df_text['date'] = pd.to_datetime(df_text['date'])

In [760]:
# reference: https://stackoverflow.com/a/47091490/4084039
# decontracting some english words (won't -> will not, can't -> cannot)
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

<b>2.1 Remove non-English messages and perform some basic text preprocessing steps</b>

In [761]:
# some basic text preprocessing on the text data
# 1. filter non-english messages from the text messages
# 2. convert all words to lower case
# 3. decontraction

preprocessed_text = []
for i in tqdm(range(df_text.shape[0])):
    message = re.sub('[^A-Za-z]+', ' ', str(df_text['text'][i]))
    message = ''.join(message.lower())
    message = decontracted(message)
    preprocessed_text.append(message.strip())

100%|██████████| 49436/49436 [00:00<00:00, 63911.83it/s]


In [762]:
df_text['text'] = preprocessed_text

In [764]:
df_text.head()

Unnamed: 0,text,date
0,hi,2021-05-01 00:00:12
1,do indicators work,2021-05-01 00:01:13
2,if you trade true gbp for gbp fiat or vice ver...,2021-05-01 00:01:50
3,and how much is that fee,2021-05-01 00:02:05
4,if you re a silver card holder,2021-05-01 00:02:12


In [782]:
df_prep1 = df_text.copy()

In [783]:
# replace blank space with nan values
df_prep1.replace('', np.nan, inplace=True)

In [785]:
# check null values
df_prep1.isnull().sum()

text    1583
date       0
dtype: int64

In [786]:
# drop na values
df_prep1 = df_prep1.dropna()

In [787]:
df_prep1.shape

(47853, 2)

In [788]:
df_prep1.isnull().sum()

text    0
date    0
dtype: int64

In [789]:
df_prep1.isna().sum()

text    0
date    0
dtype: int64

<b>2.2 Keep only messages that mention either “SHIB” or “DOGE.</b>

In [791]:
def include_words(s):
    s_l = s['text']
    if 'shib' in s_l or 'doge' in s_l:
        return True
    else:
        return False

In [792]:
s_l = []
for i in tqdm(range(df_prep1.shape[0])):
    s_l.append(include_words(df_prep1.iloc[i]))

100%|██████████| 47853/47853 [00:03<00:00, 14871.32it/s]


In [793]:
df_prep1 = df_prep1.loc[s_l, ['text', 'date']]

In [794]:
df_prep1.head()

Unnamed: 0,text,date
63,doge is going cray,2021-05-01 00:56:05
66,sell target of doge,2021-05-01 00:57:07
68,doge,2021-05-01 00:58:11
72,dogecoin que hago,2021-05-01 00:59:29
74,anyway is doge a good crypto for long term inv...,2021-05-01 01:00:32


In [803]:
df_prep1.shape

(3061, 2)

In [804]:
df_sent = df_prep1.copy()

In [805]:
df_sent.head()

Unnamed: 0,text,date
63,doge is going cray,2021-05-01 00:56:05
66,sell target of doge,2021-05-01 00:57:07
68,doge,2021-05-01 00:58:11
72,dogecoin que hago,2021-05-01 00:59:29
74,anyway is doge a good crypto for long term inv...,2021-05-01 01:00:32


In [806]:
df_sent.shape

(3061, 2)

<b>3. Sentiment Analysis</b>

In [800]:
# load the pre-trained flair model
flair_sa = TextClassifier.load('en-sentiment')

2021-12-18 08:40:33,023 loading file /Users/amitsingh/.flair/models/sentiment-en-mix-distillbert_4.pt


In [807]:
# extracting sentiment score and the corresponding sentiment value for each message

s_score = []
s_score_norm = []
s_sentiment = []

for sentence in tqdm(df_sent["text"]):
    sentence = Sentence(sentence)
    flair_sa.predict(sentence)
    if sentence.labels[0].value.startswith("POS"):
        sentence.labels[0].score *= 1
        s_score_norm.append(1)
    else:
        sentence.labels[0].score *= -1
        s_score_norm.append(-1)
    s_score.append(sentence.labels[0].score)
    s_sentiment.append(sentence.labels[0].value)

100%|██████████| 3061/3061 [03:13<00:00, 15.80it/s]


In [808]:
df_sent['s_score'] = s_score
df_sent['s_score_norm'] = s_score_norm
df_sent['s_sentiment'] = s_sentiment

In [809]:
df_sent

Unnamed: 0,text,date,s_score,s_score_norm,s_sentiment
63,doge is going cray,2021-05-01 00:56:05,-0.998401,-1,NEGATIVE
66,sell target of doge,2021-05-01 00:57:07,-0.987667,-1,NEGATIVE
68,doge,2021-05-01 00:58:11,-0.712615,-1,NEGATIVE
72,dogecoin que hago,2021-05-01 00:59:29,-0.602185,-1,NEGATIVE
74,anyway is doge a good crypto for long term inv...,2021-05-01 01:00:32,-0.535006,-1,NEGATIVE
...,...,...,...,...,...
49376,how is shib,2021-05-15 23:28:18,-0.804514,-1,NEGATIVE
49384,how is shib,2021-05-15 23:31:14,-0.804514,-1,NEGATIVE
49390,shiba swap is gonna launch soon it will defini...,2021-05-15 23:32:55,0.794794,1,POSITIVE
49408,whole crypto market is being dominated by bear...,2021-05-15 23:38:34,-0.999967,-1,NEGATIVE


In [811]:
df_sent.isnull().sum()

text            0
date            0
s_score         0
s_score_norm    0
s_sentiment     0
dtype: int64

In [812]:
df_sent.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3061 entries, 63 to 49428
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   text          3061 non-null   object        
 1   date          3061 non-null   datetime64[ns]
 2   s_score       3061 non-null   float64       
 3   s_score_norm  3061 non-null   int64         
 4   s_sentiment   3061 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 143.5+ KB


In [813]:
df_sent.describe()

Unnamed: 0,s_score,s_score_norm
count,3061.0,3061.0
mean,-0.300625,-0.31199
std,0.880799,0.950241
min,-0.999998,-1.0
25%,-0.998227,-1.0
50%,-0.922572,-1.0
75%,0.868488,1.0
max,0.999954,1.0


In [815]:
df_sent['s_score'].describe()

count    3061.000000
mean       -0.300625
std         0.880799
min        -0.999998
25%        -0.998227
50%        -0.922572
75%         0.868488
max         0.999954
Name: s_score, dtype: float64

In [863]:
df_sent['s_sentiment'].value_counts()

NEGATIVE    2008
POSITIVE    1053
Name: s_sentiment, dtype: int64

- Number of positive messages: 2008
- Number of negative messages: 1053

In [819]:
df_viz = df_sent.copy()

<b>4. Visualization</b>

In [820]:
df_viz.head()

Unnamed: 0,text,date,s_score,s_score_norm,s_sentiment
63,doge is going cray,2021-05-01 00:56:05,-0.998401,-1,NEGATIVE
66,sell target of doge,2021-05-01 00:57:07,-0.987667,-1,NEGATIVE
68,doge,2021-05-01 00:58:11,-0.712615,-1,NEGATIVE
72,dogecoin que hago,2021-05-01 00:59:29,-0.602185,-1,NEGATIVE
74,anyway is doge a good crypto for long term inv...,2021-05-01 01:00:32,-0.535006,-1,NEGATIVE


In [821]:
# adding a new column to contain the absolute sentiment score
df_viz['s_score_abs'] = df_viz['s_score'].abs()

In [823]:
# adding a new column to contain day
df_viz['day'] = df_viz['date'].dt.floor('d').astype('str')

In [824]:
df_viz.head()

Unnamed: 0,text,date,s_score,s_score_norm,s_sentiment,s_score_abs,day
63,doge is going cray,2021-05-01 00:56:05,-0.998401,-1,NEGATIVE,0.998401,2021-05-01
66,sell target of doge,2021-05-01 00:57:07,-0.987667,-1,NEGATIVE,0.987667,2021-05-01
68,doge,2021-05-01 00:58:11,-0.712615,-1,NEGATIVE,0.712615,2021-05-01
72,dogecoin que hago,2021-05-01 00:59:29,-0.602185,-1,NEGATIVE,0.602185,2021-05-01
74,anyway is doge a good crypto for long term inv...,2021-05-01 01:00:32,-0.535006,-1,NEGATIVE,0.535006,2021-05-01


In [825]:
df_viz['day'].unique()

array(['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04',
       '2021-05-05', '2021-05-06', '2021-05-07', '2021-05-08',
       '2021-05-09', '2021-05-10', '2021-05-11', '2021-05-12',
       '2021-05-13', '2021-05-14', '2021-05-15'], dtype=object)

<b> 4.1 Plot-1: Number of messages per day

In [860]:
fig = px.bar(df_viz, x='day',
             labels={
                     "day": "Day",
                     "count": "Number of messages"
             },
             title="Number of messages per day", height=600, width=800)
fig.update_traces(marker_color='green')
fig.show()

In [834]:
# number of message per day
df_msg_day = df_viz.groupby(df_viz['date'].
                            dt.floor('d')).size().reset_index(name='count')
print(df_msg_day)

         date  count
0  2021-05-01     48
1  2021-05-02     13
2  2021-05-03     29
3  2021-05-04    120
4  2021-05-05     96
5  2021-05-06     72
6  2021-05-07    135
7  2021-05-08    709
8  2021-05-09    274
9  2021-05-10    773
10 2021-05-11    249
11 2021-05-12    159
12 2021-05-13    198
13 2021-05-14    109
14 2021-05-15     77


<b> 4.2 Plot-2: Average sentiment per day

In [854]:
# group by day and sentiment value
df_avg = df_viz.groupby(['day', 's_sentiment']).s_score.mean()

In [855]:
df_avg

day         s_sentiment
2021-05-01  NEGATIVE      -0.888683
            POSITIVE       0.905570
2021-05-02  NEGATIVE      -0.863493
            POSITIVE       0.881267
2021-05-03  NEGATIVE      -0.939170
            POSITIVE       0.800103
2021-05-04  NEGATIVE      -0.882520
            POSITIVE       0.921779
2021-05-05  NEGATIVE      -0.881150
            POSITIVE       0.849102
2021-05-06  NEGATIVE      -0.928971
            POSITIVE       0.867700
2021-05-07  NEGATIVE      -0.906430
            POSITIVE       0.894220
2021-05-08  NEGATIVE      -0.942209
            POSITIVE       0.905271
2021-05-09  NEGATIVE      -0.938643
            POSITIVE       0.891866
2021-05-10  NEGATIVE      -0.953251
            POSITIVE       0.913839
2021-05-11  NEGATIVE      -0.922029
            POSITIVE       0.903521
2021-05-12  NEGATIVE      -0.912773
            POSITIVE       0.902480
2021-05-13  NEGATIVE      -0.913261
            POSITIVE       0.891222
2021-05-14  NEGATIVE      -0.932655
    

In [856]:
df_avg = df_avg.to_frame().reset_index()

In [858]:
fig = px.bar(df_avg, x="day", y='s_score', color='s_sentiment',
             labels={
                     "day": "Day",
                     "s_score": "Sentiment Score",
                     "s_sentiment": "Sentiment"
                 },
             title=" Average sentiment per day",
             barmode='group', height=400, width=1000)
fig.show()