In [1]:
import glob
import json
import pandas as pd
import numpy as np
import re
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
import warnings
import os
warnings.filterwarnings('ignore') # NOTE: no category is given, so this silences every warning, not only deprecation warnings

In [3]:
def clean_mentions(x):
    """Return ``x`` with every ``@``-token removed.

    A token is the ``@`` sign together with any word characters directly
    before and after it, so ``"a@b.com"`` becomes ``".com"``. Surrounding
    whitespace is left untouched.
    """
    mention = re.compile(r'\w*@\w*')
    return mention.sub('', x)

In [13]:
tweets_dict = {}

# Read every *.json file in ./tweet_jsons_initial/ (one JSON record per line) into its own
# pandas DataFrame, keyed by a name cut from the file name.
for filepath in glob.iglob('./tweet_jsons_initial/*.json'):
    tweets_df = pd.read_json(filepath, lines=True)

    if not tweets_df.empty:
        # Force the text column to str, then strip @-mentions from it.
        tweets_df["content"] = tweets_df["content"].astype(str).apply(clean_mentions)

    # Take the key from the file name itself rather than from a hard-coded directory-prefix
    # length, so it no longer depends on the exact glob pattern above.
    # The last 13 characters of the file name (extension included) are dropped.
    # NOTE(review): the actual suffix text is not visible here -- confirm against the files on disk.
    account = os.path.basename(filepath)[:-13]
    tweets_dict[account] = tweets_df

print(list(tweets_dict.keys()))

['barrysilbert', 'erikvoorhees', 'mskvsk', 'michael_saylor', 'Nickszabo4']


In [14]:
# Account names in case-insensitive alphabetical order.
nodes = sorted(tweets_dict, key=str.casefold)
print(nodes)

['barrysilbert', 'erikvoorhees', 'michael_saylor', 'mskvsk', 'Nickszabo4']


In [15]:
# Number of account names held in `nodes`.
len(nodes)

5

In [16]:
# Keyword filtering: keep only tweets whose text matches one of the keywords
# (case-insensitive), then remove accounts that are left without any tweets.

keywords = ["btc", "bitcoin", "XBT", "satoshi"]
keyword_pattern = '|'.join(keywords)  # alternation of all keywords, built once

# Iterate over a snapshot of the keys because entries are deleted inside the loop.
for key in list(tweets_dict):
    frame = tweets_dict[key]
    if not frame.empty:
        matches = frame["content"].str.contains(keyword_pattern, na = False, case = False)
        frame = frame[matches]
        tweets_dict[key] = frame
    if frame.empty:
        del tweets_dict[key]

In [132]:
%store nodes
%store tweets_dict

Stored 'nodes' (list)
Stored 'tweets_dict' (dict)


In [17]:
# Preview the first rows stored for one account.
tweets_dict['barrysilbert'].head()

Unnamed: 0,url,date,content,renderedContent,id,user,outlinks,tcooutlinks,replyCount,retweetCount,...,lang,source,sourceUrl,sourceLabel,media,retweetedTweet,quotedTweet,mentionedUsers,coordinates,place
1,https://twitter.com/BarrySilbert/status/129908...,2020-08-27 20:34:31+00:00,Digital Currency Group Enters the Bitcoin Mini...,Digital Currency Group Enters the Bitcoin Mini...,1299082889968132099,"{'username': 'BarrySilbert', 'displayname': 'B...",[https://decrypt.co/39953/digital-currency-gro...,[https://t.co/jhK2aDCMwZ],25,94,...,en,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,,,,"[{'username': 'decryptmedia', 'displayname': '...",,
3,https://twitter.com/BarrySilbert/status/129901...,2020-08-27 15:56:35+00:00,it would be a high class problem for us if a...,@adam3us @100trillionUSD it would be a high cl...,1299012944928419840,"{'username': 'BarrySilbert', 'displayname': 'B...",[],[],7,12,...,en,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,"[{'username': 'adam3us', 'displayname': 'Adam ...",,
4,https://twitter.com/BarrySilbert/status/129897...,2020-08-27 13:15:01+00:00,There’s (digital) gold in them thar hills: Cry...,There’s (digital) gold in them thar hills: Cry...,1298972285680877573,"{'username': 'BarrySilbert', 'displayname': 'B...",[https://fortune.com/2020/08/27/bitcoin-mining...,[https://t.co/JpnhHR2KKv],13,71,...,en,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,,,,"[{'username': 'FortuneMagazine', 'displayname'...",,
6,https://twitter.com/BarrySilbert/status/129897...,2020-08-27 13:09:58+00:00,"Unveiling our 4th subsidiary, Foundry, a bitco...","Unveiling our 4th subsidiary, Foundry, a bitco...",1298971013821222912,"{'username': 'BarrySilbert', 'displayname': 'B...",[https://twitter.com/DCGco/status/129897037301...,[https://t.co/NJzfJdKNv1],63,130,...,en,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,,,{'url': 'https://twitter.com/DCGco/status/1298...,,,
17,https://twitter.com/BarrySilbert/status/129394...,2020-08-13 16:18:48+00:00,Interest in Grayscale Crypto Products Not Easi...,Interest in Grayscale Crypto Products Not Easi...,1293945106937020417,"{'username': 'BarrySilbert', 'displayname': 'B...",[https://cointelegraph.com/news/interest-in-gr...,[https://t.co/6eq91DVeJx],20,57,...,en,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,,,,"[{'username': 'Cointelegraph', 'displayname': ...",,


In [18]:
def clean(raw):
    """Strip hyperlinks, selected HTML entities/tags and newlines from ``raw``.

    Anything starting with ``http`` up to the next whitespace is removed first.
    The literal replacements below are then applied one after another, in the
    listed order, and the result is returned with leading/trailing whitespace
    trimmed.
    """
    # (text to find, text to put in its place) -- order is significant.
    substitutions = (
        ('&gt;', ""),
        ('&#x27;', "'"),
        ('&quot;', '"'),
        ('&#x2F;', ' '),
        ('<p>', ' '),
        ('</i>', ''),
        ('&#62;', ''),
        ('<i>', ' '),
        ('\n', ''),
    )

    text = re.sub(r'http\S+', '', raw)
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)

    return text.strip()

In [19]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()


def nltk(text):
    """Return the shared analyzer's polarity scores for ``text``.

    NOTE: defining this function rebinds the name ``nltk``, which hides the
    ``nltk`` module imported earlier in this cell.
    """
    return analyzer.polarity_scores(text)

In [20]:
from textblob import TextBlob
from textblob.sentiments import PatternAnalyzer # pattern analyzer is the default
from textblob.sentiments import NaiveBayesAnalyzer

def textblob(text):
    """Return the ``sentiment`` attribute of a ``TextBlob`` built from ``text``."""
    return TextBlob(text).sentiment

In [29]:
def date_to_week(date, start_week=20, weeks_per_year=52):
    """Map a date to a week index counted from ``start_week``.

    Parameters
    ----------
    date : object with an ``isocalendar()`` method (e.g. ``datetime.date``,
        ``datetime.datetime`` or ``pandas.Timestamp``).
    start_week : int, default 20
        ISO week number that maps to index 0.
    weeks_per_year : int, default 52
        Modulus used to wrap week numbers that fall before ``start_week``.

    Returns
    -------
    int
        ``(iso_week - start_week)`` wrapped into ``[0, weeks_per_year)``.

    Notes
    -----
    The year is ignored, so the same ISO week in different years gets the
    same index. With the default modulus an ISO week 53 wraps onto the same
    index as ISO week 1.
    """
    iso_week = date.isocalendar()[1]
    # Add the modulus before subtracting so the intermediate value stays non-negative.
    return (iso_week + weeks_per_year - start_week) % weeks_per_year

In [30]:
# Score every account's tweets with both sentiment analyzers and write one CSV per account
# to processed_initial/<name>_processed.csv.
# NOTE(review): `nodes` was captured before the keyword filter deleted empty accounts from
# tweets_dict, so a name removed there raises KeyError on the lookup below.
os.makedirs('processed_initial', exist_ok=True)  # writing the CSV does not create the directory

for name in nodes:
    data = tweets_dict[name]
    tweets = pd.DataFrame()
    # Keep only the first 10 characters of each timestamp's string form (the calendar date).
    tweets['date'] = pd.to_datetime(data['date'].apply(lambda x: str(x)[:10]))
    tweets['week'] = tweets['date'].apply(date_to_week)
    tweets['tweets'] = data['content'].astype(str)
    tweets['processed'] = tweets['tweets'].apply(clean)

    # nlp packages
    # NOTE(review): both analyzers score the raw 'tweets' text; the cleaned 'processed'
    # column is only stored, never scored -- confirm this is intended.
    vader_scores = tweets['tweets'].apply(nltk)
    blob_scores = tweets['tweets'].apply(textblob)

    tweets['nltk-compound'] = vader_scores.apply(lambda x: x['compound'])
    tweets['nltk-pos'] = vader_scores.apply(lambda x: x['pos'])
    tweets['nltk-neg'] = vader_scores.apply(lambda x: x['neg'])
    tweets['tb-polarity'] = blob_scores.apply(lambda x: x[0])
    tweets['tb-subjectivity'] = blob_scores.apply(lambda x: x[1])

    # Reverse the row order before saving.
    tweets = tweets.iloc[::-1]

    # Hand pandas the path instead of a text-mode handle: pandas then writes UTF-8 and
    # controls line endings itself, instead of inheriting the platform's default encoding
    # and newline translation (which can break on non-ASCII tweet text or add blank lines).
    tweets.to_csv('processed_initial/{}.csv'.format(name + '_processed'), index = False)

In [36]:
# Empty table that will receive one (name, opinion) row per account.
initial_op = pd.DataFrame(columns = ['name', 'opinion'])

In [47]:
# For each account, average the processed scores per week and store the 'nltk-compound'
# mean of the last row (groupby sorts by week, so that is the largest week index) as the
# account's opinion.
for i, name in enumerate(nodes):
    data = pd.read_csv('processed_initial/{}.csv'.format(name + '_processed'))
    data = data.drop(columns = ['tweets', 'processed'])
    # numeric_only=True: 'date' is read back from the CSV as text, and averaging a text
    # column raises TypeError on pandas >= 2.0 instead of being dropped silently.
    data = data.groupby('week').mean(numeric_only=True)
    initial_op.loc[i] = [name, data.iloc[-1]['nltk-compound']]

In [43]:
# Inspect the weekly-mean table currently bound to `data`.
data

Unnamed: 0_level_0,nltk-compound,nltk-pos,nltk-neg,tb-polarity,tb-subjectivity
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,0.614433,0.129,0.0,0.217323,0.60404
11,0.2942,0.318,0.0,0.0,0.0
12,0.03555,0.0695,0.055,0.214286,0.578571
13,-0.3612,0.0,0.185,0.08,0.32
14,-0.0535,0.157667,0.213333,-0.034444,0.507963
15,0.0,0.0,0.0,-0.026786,0.126786


In [54]:
# Save the per-account opinions to initial_na.csv without the index column.
initial_op.to_csv('initial_na.csv', index = False)

In [55]:
# Display the per-account opinion table.
initial_op

Unnamed: 0,name,opinion
0,barrysilbert,0.046
1,erikvoorhees,0.4658
2,michael_saylor,0.6124
3,mskvsk,0.34
4,Nickszabo4,0.0
