In [1]:
%autosave 80

Autosaving every 80 seconds


In [52]:
import json
import string
import pandas as pd
from flair.models import TextClassifier
from flair.data import Sentence
from tqdm import tqdm
import re
import plotly.express as px

In [3]:
filename = 'result.json'
f = open(filename, encoding='utf-8')
raw_data = json.load(f)

In [4]:
# checking if message is in English or not
def isEnglish(s):
    return s.isascii()

In [42]:
date_list = []
messages = []
for i in tqdm(range(len(raw_data['messages']))):
    msg = str(raw_data['messages'][i]['text']).lower()
    if (isEnglish(msg)):
        if (('shib' in msg) or ('doge' in msg)):
            date_list.append(raw_data['messages'][i]['date'])
            messages.append(msg)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49436/49436 [00:00<00:00, 760874.13it/s]


In [8]:
dates = []
for date in date_list:
    dates.append(date.split('T')[0])

In [10]:
# Pandas dataframe of date and corresponding message
df = pd.DataFrame(list(zip(dates, messages)), columns=['Date', 'Message'])

In [12]:
print(df.sample(10))

            Date                                            Message
152   2021-05-04                                      wil doge fall
207   2021-05-05  what is expected price for doge by end of this...
2399  2021-05-15  anybody suggest in shiba\nwhos the best in inv...
783   2021-05-08  [{'type': 'link', 'text': 'https://mobile.twit...
1806  2021-05-11             what i expect from dogecoin in 1 month
397   2021-05-07           dogelon mars will make me multimillionar
255   2021-05-06                                holding doge or not
650   2021-05-08         is shiba listed on app but you can't buy??
789   2021-05-08  still cant login to my account.....its so saad...
148   2021-05-04                          dogecoin will rise to 1$?


Preprocessing data by converting to lowercase, expanding contractions, removing non-alphabet characters

In [14]:
contractions = {"aint": "am not / are not", "arent": "am not", "cant": "cannot", "cantve": "cannot have", "cause": "because", "couldve": "could have", "couldnt": "could not", "couldntve": "could not have", "didnt": "did not", "doesnt": "does not", "dont": "do not", "hadnt": "had not", "hadntve": "had not have", "hasnt": "has not", "havent": "have not", "howd": "how did", "howdy": "how do you", "howll": "how will", "hows": "how is", "id": "I would", "idve": "I would have", "ill": "I will", "illve": "I will have", "im": "I am", "ive": "I have", "isnt": "is not", "itd": "it had", "itdve": "it would have", "itll": "it will", "itllve": "it will have", "its": "it is", "lets": "let us", "mightve": "might have", "mightnt": "might not", "mightntve": "might not have", "mustve": "must have", "mustnt": "must not", "mustntve": "must not have", "neednt": "need not", "needntve": "need not have", "shouldve": "should have", "shouldnt": "should not", "shouldntve": "should not have", "thatd": "that would",
"thatdve": "that would have", "thats": "that is", "thered": "there would", "theredve": "there would have", "theres": "there is", "theyd": "they had", "theydve": "they would have", "theyll": "they will", "theyllve": "they will have", "theyre": "they are", "theyve": "they have", "wasnt": "was not", "wed": "we had / we would", "wedve": "we would have", "weve": "we have", "werent": "were not", "whatll": "what will", "whatllve": "what will have","whatre": "what are", "whats": "what is", "whatve": "what have", "whens": "when is", "whered": "where did", "wheres": "where is", "whereve": "where have", "wholl": "who will", "whollve": "who will have", "whos": "who is", "whove": "who have", "whys": "why is", "whyve": "why have", "willve": "will have", "wont": "will not", "wontve": "will not have", "wouldve": "would have", "wouldnt": "would not", "wouldnt've": "would not have", "yall": "you all", "youd": "you would", "youdve": "you would have", "youll": "you will", "youlve": "you will have", "youre": "you are", "youve": "you have"}

In [15]:
def contractionfunction(row):
    text = row['message_3']
    for word in text.split():
        if word in contractions:
            text = text.replace(word, contractions[word])
    return text

In [16]:
df['message_2'] = df.Message.str.replace('[^a-zA-Z ]', '')
df['message_3'] = df['message_2'].replace('\s+', ' ', regex=True)
df['message_clean'] = df.apply(lambda row: contractionfunction(row), axis=1)
# stop = stopwords.words('english')
# df['review_clean_stopwords'] = data['review_clean'].apply(lambda s: ' '.join([word for word in s.split() if word not in (stop)]))

  df['message_2'] = df.Message.str.replace('[^a-zA-Z ]', '')


I will be using the off-the-shelf library Flair to predict sentiment of each message. It has a pre-trained Sentiment Analysis NLP model. It gives us a Positive/Negative label along with a corresponding confidence score of the prediction.

In [18]:
# loading pretrained model
sia = TextClassifier.load('en-sentiment')

2021-12-17 21:18:00,819 loading file C:\Users\jishn\.flair\models\sentiment-en-mix-distillbert_4.pt


In [20]:
# Predicting using the flair model
def flair_prediction(x):
    sentence = Sentence(x)
    sia.predict(sentence)
    score = sentence.labels[0]
    if "POSITIVE" in str(score):
        num = str(score).split()[1]
        num = re.sub('[()]', '', num)
        return (num, "pos")
    elif "NEGATIVE" in str(score):
        num = str(score).split()[1]
        num = re.sub('[()]', '', num)
        return (num, "neg")
    else:
        return "neu"

In [21]:
df["sentiment"] = df["message_clean"].apply(flair_prediction)

In [25]:
print(df[['Message', 'sentiment']].sample(10))

                                                Message      sentiment
358                                     will doge go up  (0.9945, pos)
2169                                 can shiba reach 1$  (0.9888, pos)
1468                      shiba crashing the app again?  (0.8462, neg)
1095  hi 17 hours ago i've buy 1500000 shiba inu but...  (0.9995, neg)
1522  hello i bought shib on saturday snd today and ...  (0.9996, neg)
485                     guys what you think about doge?  (0.9998, pos)
1279                          is shiba inu on exchange.  (0.8839, pos)
1200            same bought shiba inu no asset no money  (0.9985, neg)
650          is shiba listed on app but you can't buy??  (0.9963, neg)
2325                       is there any future for shib   (0.701, pos)


Generating plots of number of messages and average sentiment per day using Plotly 

In [29]:
counts = df.groupby('Date').count()['sentiment']

In [30]:
df_counts = pd.DataFrame({'Date':counts.index, 'Number_of_Messages':counts.values})

In [34]:
fig = px.line(df_counts, x=df_counts.Date, y=df_counts.Number_of_Messages, title='Number of Messages per day from May 1st to May 15th:')
fig.show()

In [35]:
df[['score', 'sentiment']] = pd.DataFrame(df['sentiment'].tolist(), index=df.index)

In [36]:
df = df.astype({'score':float})

In [37]:
df_grouped = df.groupby('Date')

In [38]:
daily_dates = []
avg_sentiment = []
for name, group in df_grouped:
    average = 0
    for row_index, row in group.iterrows():
        if row['sentiment'] == 'neg':
            average = average - row['score']
        else:
            average = average + row['score']
    daily_dates.append(name)
    avg_sentiment.append(average/len(group))

In [39]:
df_sentiment = pd.DataFrame(list(zip(daily_dates, avg_sentiment)), columns=['Date', 'Sentiment_Average'])

In [43]:
fig = px.line(df_sentiment, x=df_sentiment.Date, y=df_sentiment.Sentiment_Average, title='Average Sentiment per day from May 1st to May 15th:')
fig.show()