In [3]:
from src.spark import Spark
import pandas as pd
import src.tweet_volume as funcs
import matplotlib.pyplot as plt
from pyspark.sql import functions as fs
from src.plotting import double_plot
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel
import src.nlp.clean as clean
from pyspark.sql.types import StringType, IntegerType, StructField, StructType
from src.nlp.sentiment import SentimentAnalyser


In [4]:
spark = Spark('load', 'local[4]')
sess = spark.session()

In [5]:
df = funcs.load_dataframe(sess, '/cs/unique/ls99-kf39-cs5052/data/tweets/*.json', funcs.schema)
df2 = funcs.parse_timestamp(df)


In [5]:
sent_schema = StructType([
    StructField("target", IntegerType(), True),
    StructField("text", StringType(), True)
])

train_df = sess.read.csv("/cs/unique/ls99-kf39-cs5052/train/train.csv", header=True, schema=sent_schema)

train_clean = clean.clean_tweets(train_df, "text")
train_clean.first()

Row(target=0, text=u'is so sad for my apl friend')

In [6]:
train, evaluation = train_clean.randomSplit([0.99, 0.1], seed=42)
print("Number of tweets in train: {:,}".format(train.count()))

Number of tweets in train: 1,434,200


In [6]:
pipeline = "/cs/unique/ls99-kf39-cs5052/models/pipeline"
classifier = "/cs/unique/ls99-kf39-cs5052/models/linreg"

sentiment_model = SentimentAnalyser()
sentiment_model.load(pipeline, classifier)

In [7]:
alts = ['BTC', 'XMR', 'DASH', 'LTC', 'ETC', 'BCH']  # 'ETH'
hashes = {
        'ETH': ['%ethereum%', '%ether%', '%eth%'],
        'BTC': ['%bitcoin%', '%btc%', '%bitcoin%'],
        'XMR': ["%monero%", "%xmr%", "%monero%"], 
        'DASH': ["%digital cash%", "%dash%", "%dash%"], 
        'LTC': ["%litecoin%", "%ltc%", "%litecoin%"], 
        'ETC': ["%ethereum classic%", "%etc%", "%eth classic%"], 
        'BCH': ["%bitcoincash%", "%bch%", "%bitcoin cash%"]
    }

crypto_data = funcs.load_crypto()


Loading KRAKEN_DASH_EUR.csv
Loading KRAKEN_LTC_EUR.csv
Loading KRAKEN_BTC_EUR.csv


Loading KRAKEN_ETC_EUR.csv
Loading KRAKEN_BCH_EUR.csv
Loading KRAKEN_XMR_EUR.csv
Loading KRAKEN_ETH_EUR.csv


In [8]:

for coin in alts:
    hash = hashes[coin]
    coin_df = df2.filter(fs.lower(df['text']).like(hash[0]) | fs.lower(df['text']).like(hash[1]) | fs.lower(df['text']).like(hash[2]))
    coin_clean = clean.clean_tweets(coin_df, "text")
    print("Looking at %s" % coin)
    print(coin_clean.first().text)
    
    coin_pred = sentiment_model.predict(coin_clean)
    pos, neg = sentiment_model.count_sentiments(coin_pred)
    
    daily_pos = funcs.aggregate_by_day(pos)
    daily_neg = funcs.aggregate_by_day(neg)
    daily_pos.set_index("date", inplace=True)
    daily_neg.set_index("date", inplace=True)
    daily_pos.index = pd.to_datetime(daily_pos.index)
    daily_neg.index = pd.to_datetime(daily_neg.index)
    
    daily_sent = pd.DataFrame(data={'pos':daily_pos['count'], 'neg':daily_neg['count']}, index=daily_pos.index)
    daily_sent['pos'] = daily_sent['pos'].astype(float)
    daily_sent['neg'] = daily_sent['neg'].astype(float)
    daily_sent['sentiment'] = (daily_sent['pos'] - daily_sent['neg']) / (daily_sent['pos'] + daily_sent['neg'])
    
    # Fetch coin price data, combine with sentiment score and plot.
    coin_df = crypto_data[coin]
    sent_price = coin_df
    sent_price['price'] = sent_price.weightedAverage
    sent_price['return'] = sent_price.price.pct_change()
    sent_price['sentiment'] = daily_sent['sentiment']
    sent_price.dropna(inplace=True)
    
    double_plot([sent_price.price, sent_price.sentiment], 
                ['%s Price' % coin, 'Sentiment'], 
                ['Date', 'Price', 'Sentiment Score'], 
                "%s Price vs Tweet Sentiment" % coin, 
                sent_price.index.tolist())
    double_plot([sent_price['return'], sent_price.sentiment], 
                ['%s Return' % coin, 'Sentiment'], 
                ['Date', 'Price', 'Sentiment Score'], 
                "%s Return vs Tweet Sentiment" % coin, 
                sent_price.index.tolist())
    print("Sentiment-Price correlation: %.4f" % sent_price.corr().sentiment.price)
    print("Sentiment-Return correlation: %.4f" % sent_price.corr()['sentiment']['return'])
    


Looking at BTC


rt @vinnylingham2x #bitcoin needs less developers and more incumbents and intermediaries 


Tweets positive: 2,753,006, negative: 403,501


Sentiment-Price correlation: -0.1724
Sentiment-Return correlation: -0.5240
Looking at XMR


rt @andrew0hayes genesis mining discountnpj8st#btc #ltc #eth #xmr #dash #str #gnt #dgb #stratis #xlm #eos #iota…


Tweets positive: 34,201, negative: 8,696


Sentiment-Price correlation: 0.8958
Sentiment-Return correlation: 0.2364
Looking at DASH


rt @ken2020n 1027 あさイチ二宮和也1027 ロンドンハーツ2時間sp二宮和也1028 ラストレシピ公開直前！絶品グルメ打ち上げツアー二宮和也1029 鉄腕dash二宮和也112 アメトーーク二宮和也1112 相葉マナ…


Tweets positive: 451,318, negative: 116,419


Sentiment-Price correlation: -0.3727
Sentiment-Return correlation: -0.6164
Looking at LTC


rt @andrew0hayes genesis mining discountnpj8st#btc #ltc #eth #xmr #dash #str #gnt #dgb #stratis #xlm #eos #iota…


Tweets positive: 214,838, negative: 37,729


Sentiment-Price correlation: 0.4713
Sentiment-Return correlation: -0.0315
Looking at ETC


rt @andrew0hayes #genesismining code npj8st $etc $xmr #litecoin $ltc $steem $zec $dash $btc $eth $rep #eth… 


Tweets positive: 46,498, negative: 3,611


Sentiment-Price correlation: -0.2842
Sentiment-Return correlation: -0.5320
Looking at BCH


rt @lisaciarlone what the fork bitcoin cash community preps hard fork slated for november 13 


Tweets positive: 196,321, negative: 61,832


Sentiment-Price correlation: 0.5710
Sentiment-Return correlation: -0.1822


In [18]:
daily_sent = pd.DataFrame(data={'pos':daily_pos['count'], 'neg':daily_neg['count']}, index=daily_pos.index)
daily_sent['pos'] = daily_sent['pos'].astype(float)
daily_sent['neg'] = daily_sent['neg'].astype(float)
daily_sent

Sentiment-Price correlation: -0.0452


In [24]:

daily_sent['sentiment'] = (daily_sent['pos'] - daily_sent['neg']) / (daily_sent['pos'] + daily_sent['neg'])
daily_sent

Sentiment-Price correlation: -0.0452


In [26]:

# Fetch coin price data, combine with sentiment score and plot.
coin_df = crypto_data[coin]
sent_price = coin_df
sent_price['price'] = sent_price.weightedAverage
sent_price['return'] = sent_price.price.pct_change()
sent_price['sentiment'] = daily_sent['sentiment']
sent_price.dropna(inplace=True)

double_plot([sent_price['return'], sent_price.sentiment], 
            ['%s Price' % coin, 'Sentiment'], 
            ['Date', 'Price', 'Sentiment Score'], 
            "%s Price vs Tweet Sentiment" % coin, 
            sent_price.index.tolist())
print("Sentiment-Price correlation: %.4f" % sent_price.corr()['sentiment']['return'])


Sentiment-Price correlation: -0.0452
