In [108]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, array, explode, sum, count, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from datetime import datetime

# Symbols the analysis is restricted to, written the way they appear in the
# messages' `symbols` field.
SYMBOLS_LIST = ['BTC.X', 'BSV.X', 'BCH.X', 'LTC.X', 'ETH.X', 'DOGE.X']

# NOTE: The environment needs to have scala installed for this to work
# The session reads the `messages` collection of the local `cryptoracle`
# database; the MongoDB connector is requested through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/cryptoracle")
    .config("spark.mongodb.input.collection", "messages")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.0')
    .getOrCreate()
)

# Load the whole collection as a DataFrame via the connector's data source.
mongo_reader = spark.read.format("com.mongodb.spark.sql.DefaultSource")
messages_df = mongo_reader.load()

In [109]:
# Sanity check: number of message documents loaded from the MongoDB collection.
messages_df.count()

300

In [110]:
# Peek at one raw record to see the nested document structure.
messages_df.first()

Row(_id=Row(oid='5e85e12a2dc1515fd0260a61'), body='$BTC.X BTC will hit 100k one day, no doubt about it.  The real question is how much purchasing power will the dollar lose by then', conversation=None, created_at='2020-03-25T00:31:19Z', entities=Row(chart=None, sentiment=Row(basic='Bullish')), filters=Row(day_counts=1, official_api=True), id=202506508, likes=None, links=None, mentioned_users=[], reshares=None, source=Row(id=2095, title='StockTwits For Android ', url='http://www.stocktwits.com/mobile'), symbols=[Row(aliases=['BTCUSD'], id=11418, is_following=False, symbol='BTC.X', title='Bitcoin BTC/USD', watchlist_count=157424)], user=Row(avatar_url='http://avatars.stocktwits.com/images/default_avatar_thumb.jpg', avatar_url_ssl='https://s3.amazonaws.com/st-avatars/images/default_avatar_thumb.jpg', classification=[], followers=1, following=0, id=3162884, ideas=86, identity='User', join_date='2020-03-08', like_count=42, name='gufyvyvyvyvyvtv', official=False, plus_tier='', premium_room='

### Create new dataframe with only required columns and filtered rows

In [111]:
# Flatten the nested message documents, keeping only the fields used below.
# Nested values are pulled up to top-level columns and renamed via `alias`.
required_columns = [
    messages_df['_id']['oid'].alias('_id'),
    messages_df['body'],
    messages_df['created_at'],
    messages_df['entities']['sentiment']['basic'].alias('sentiment'),
    messages_df['symbols']['symbol'].alias('symbols'),
    messages_df['likes']['total'].alias('likes'),
    messages_df['reshares']['reshared_count'].alias('reshares'),
]
messages_df = messages_df.select(*required_columns)

def sum_interactions(likes, reshares):
    """Return the interaction weight of a message.

    The weight starts at 1 and each of `likes` and `reshares` is added to it
    when truthy, so missing (None) or zero values contribute nothing.
    """
    total = 1
    for amount in (likes, reshares):
        if amount:
            total += amount
    return total

def convert_sentiment(sentiment):
    """Encode a sentiment label as an integer.

    "Bullish" maps to 2 and "Bearish" to 1; any other value (including None)
    maps to 0.
    """
    for label, code in (("Bullish", 2), ("Bearish", 1)):
        if sentiment == label:
            return code
    return 0

def convert_date(dt):
    """Truncate an ISO-8601 UTC timestamp string to an hourly bucket key.

    Args:
        dt: Timestamp formatted like '2020-03-25T00:31:19Z', or None.

    Returns:
        The bucket key formatted as 'YYYY-MM-DD-HH' (e.g. '2020-03-25-00'),
        or None when `dt` is None.

    Raises:
        ValueError: If `dt` is a string that does not match the expected format.
    """
    # Documents without a timestamp arrive as None; pass the null through
    # instead of letting strptime raise TypeError and abort the Spark job.
    if dt is None:
        return None
    return datetime.strptime(dt, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d-%H')

# Wrap the helper functions as Spark UDFs, declaring the SQL type each returns.
interactionUdf = udf(sum_interactions, IntegerType())
sentimentUdf = udf(convert_sentiment, IntegerType())
dateUdf = udf(convert_date, StringType())

# Add `interaction_count`, then overwrite `sentiment` and `created_at` with
# the values produced by their UDFs.
messages_df = (
    messages_df
    .withColumn('interaction_count', interactionUdf('likes', 'reshares'))
    .withColumn('sentiment', sentimentUdf('sentiment'))
    .withColumn('created_at', dateUdf('created_at'))
)

In [112]:
# Inspect one row after the column selection and the UDF-derived columns.
messages_df.first()

Row(_id='5e85e12a2dc1515fd0260a61', body='$BTC.X BTC will hit 100k one day, no doubt about it.  The real question is how much purchasing power will the dollar lose by then', created_at='2020-03-25-00', sentiment=2, symbols=['BTC.X'], likes=None, reshares=None, interaction_count=1)

#### Distribution of sentiments

In [113]:
# Count messages per encoded sentiment value (2 = Bullish, 1 = Bearish, 0 = anything else).
messages_df.groupBy('sentiment').count().collect()

[Row(sentiment=1, count=16),
 Row(sentiment=2, count=120),
 Row(sentiment=0, count=164)]

### Windowing


#### Unwrap array of symbols to new rows

In [114]:
# Since a message can be tagged with more than one symbol, we explode the list of symbols into one row per symbol.
# This is okay because the final calculation is grouped by symbol (among other things).

def weight_sentiment(sentiment, count):
    """Return the sentiment code multiplied by `count`.

    A sentiment of 0 always yields 0, without touching `count`.
    """
    return 0 if sentiment == 0 else sentiment * count

def handle_neutral_sentiment(sentiment, count):
    """Return `count` unchanged, except that a sentiment of 0 yields 0."""
    is_neutral = sentiment == 0
    if not is_neutral:
        return count
    return 0

weightedSentimentUdf = udf(weight_sentiment, IntegerType())
neutralSentimentUdf = udf(handle_neutral_sentiment, IntegerType())

# Compute the weighted sentiment first (it needs the original interaction
# count), then emit one row per entry of the `symbols` array.
messages_df = (
    messages_df
    .withColumn('weighted_sentiment', weightedSentimentUdf('sentiment', 'interaction_count'))
    .withColumn('symbol', explode(messages_df['symbols']))
)

# Rows with sentiment 0 get an interaction count of 0.
neutral_adjusted_count = neutralSentimentUdf('sentiment', 'interaction_count')
messages_df = messages_df.withColumn('interaction_count', neutral_adjusted_count)

# Keep only the rows whose symbol is listed in SYMBOLS_LIST.
is_tracked_symbol = messages_df['symbol'].isin(SYMBOLS_LIST)
messages_df = messages_df.where(is_tracked_symbol)

In [115]:
# Number of exploded (message, symbol) rows per tracked symbol.
messages_df.groupby('symbol').count().collect()

[Row(symbol='BCH.X', count=46),
 Row(symbol='BTC.X', count=122),
 Row(symbol='ETH.X', count=68),
 Row(symbol='DOGE.X', count=30),
 Row(symbol='BSV.X', count=38),
 Row(symbol='LTC.X', count=45)]

In [116]:
# Aggregate per (hour bucket, symbol): total interaction weight, total
# weighted sentiment and the number of messages in the bucket.
hourly_groups = messages_df.groupby(['created_at', 'symbol'])
grouped_df = hourly_groups.agg(
    sum('interaction_count').alias('sum_interaction_count'),
    sum('weighted_sentiment').alias('sum_weighted_sentiment'),
    count('_id').alias('volume_tweets'),
)

# Overall sentiment is the weighted sentiment per unit of interaction weight.
# NOTE(review): the denominator can be 0 for a bucket that holds only
# sentiment-0 rows -- confirm how that case should be reported.
sentiment_ratio = grouped_df['sum_weighted_sentiment'] / grouped_df['sum_interaction_count']
grouped_df = grouped_df.withColumn('overall_sentiment', sentiment_ratio)

In [117]:
# Collect the aggregated result into a pandas DataFrame for inspection.
gdf = grouped_df.toPandas()

In [118]:
# Show the per-symbol aggregates for a single hourly bucket (key format: YYYY-MM-DD-HH).
gdf[gdf['created_at'] == '2020-03-24-23']

Unnamed: 0,created_at,symbol,sum_interaction_count,sum_weighted_sentiment,volume_tweets,overall_sentiment
33,2020-03-24-23,BSV.X,2,4,1,2.0
84,2020-03-24-23,BCH.X,25,26,5,1.04
138,2020-03-24-23,ETH.X,31,38,10,1.225806
143,2020-03-24-23,LTC.X,24,24,5,1.0
163,2020-03-24-23,BTC.X,32,40,17,1.25


In [119]:
# Number of (hour bucket, symbol) groups per symbol, i.e. hourly buckets with at least one message.
grouped_df.groupby('symbol').count().collect()

[Row(symbol='BCH.X', count=28),
 Row(symbol='BTC.X', count=48),
 Row(symbol='ETH.X', count=29),
 Row(symbol='DOGE.X', count=22),
 Row(symbol='BSV.X', count=21),
 Row(symbol='LTC.X', count=22)]

## Historical Price Data

In [120]:
import pandas as pd 
from datetime import datetime
def read_func(x):
    """Load the CSV file at path `x` into a Spark DataFrame.

    The first line is treated as the header and column types are inferred
    from the data.
    """
    return spark.read.format('csv').load(x, header=True, inferSchema=True)

def format_date(date, sym):
    """Convert a price-file date string to the 'YYYY-MM-DD-HH' bucket key.

    Dates for 'BCH.X' and 'DOGE.X' are parsed as '%Y-%m-%d %I-%p' (12-hour
    clock with AM/PM); dates for every other symbol as '%m/%d/%y %H:%M'.
    """
    source_format = '%Y-%m-%d %I-%p' if sym in ('BCH.X', 'DOGE.X') else '%m/%d/%y %H:%M'
    parsed = datetime.strptime(date, source_format)
    return parsed.strftime("%Y-%m-%d-%H")
    

formatDateUdf = udf(format_date, StringType())

# One hourly price file per symbol.
BTC_prices = read_func("data/gemini_BTCUSD_1hr.csv")
ETH_prices = read_func("data/gemini_ETHUSD_1hr.csv")
LTC_prices = read_func("data/gemini_LTCUSD_1hr.csv")
BCH_prices = read_func("data/Bitbay_BCHUSD_1h.csv")
DOGE_prices = read_func("data/Yobit_DOGERUR_1h.csv")

# (symbol, DataFrame) pairs. Order matters: later cells address DOGE as the
# last entry ([-1]) and BCH as the second-to-last entry ([-2]).
price_df_lists = list(zip(
    ['BTC.X', 'ETH.X', 'LTC.X', 'BCH.X', 'DOGE.X'],
    [BTC_prices, ETH_prices, LTC_prices, BCH_prices, DOGE_prices],
))

#### 1. Datetime and Symbols

In [121]:
# Normalize every price frame: rename `Symbol` to `symbol` and overwrite it
# with the symbol literal, reformat `Date`, and drop `Unix Timestamp`.
new_dfs = []
for sym, price_df in price_df_lists:
    normalized_df = (
        price_df
        .withColumnRenamed('Symbol', 'symbol')
        .withColumn('symbol', lit(sym))
        .withColumn('Date', formatDateUdf('Date', 'symbol'))
        .drop('Unix Timestamp')
    )
    new_dfs.append(normalized_df)

# From here on `price_df_lists` holds bare DataFrames instead of
# (symbol, DataFrame) pairs, so this cell cannot be re-run on its own output.
price_df_lists = new_dfs

#### 2. Doge (RUR to USD)

In [122]:
# Fixed conversion factor applied to prices quoted in RUR to express them in USD.
RUR_USD = 0.013

def convert_doge(price):
    """Return `price` multiplied by the fixed RUR_USD factor."""
    usd_price = price * RUR_USD
    return usd_price

convertPriceUdf = udf(convert_doge, DoubleType())

# DOGE is the last entry: convert its four price columns, then keep a single
# volume column named `Volume`.
doge_prices = price_df_lists[-1]
for col in ('Open', 'High', 'Low', 'Close'):
    doge_prices = doge_prices.withColumn(col, convertPriceUdf(col))
price_df_lists[-1] = doge_prices.drop('Volume ERUR').withColumnRenamed('Volume DOG', 'Volume')

# BCH is the second-to-last entry: keep a single volume column named `Volume`.
bch_prices = price_df_lists[-2]
price_df_lists[-2] = bch_prices.drop('Volume USD').withColumnRenamed('Volume BCH', 'Volume')

In [123]:
# Print the first row of each price DataFrame to compare their columns side by side.
for df in price_df_lists:
    print(df.first())

Row(Date='2020-04-18-00', symbol='BTC.X', Open=7036.26, High=7064.99, Low=7028.23, Close=7064.99, Volume=4.22790759)
Row(Date='2020-04-18-00', symbol='ETH.X', Open=170.8, High=170.82, Low=170.8, Close=170.82, Volume=0.003956)
Row(Date='2020-04-18-00', symbol='LTC.X', Open=42.23, High=42.36, Low=42.16, Close=42.36, Volume=119.65145)
Row(Date='2020-04-01-11', symbol='BCH.X', Open=210.85, High=210.85, Low=210.85, Close=210.85, Volume=0.0)
Row(Date='2018-08-22-23', symbol='DOGE.X', Open=0.0021333, High=0.0021333, Low=0.0021294, Close=0.0021294, Volume=0.0)


In [124]:
from functools import reduce
from pyspark.sql import DataFrame

def unionAll(*dfs):
    """Stack any number of Spark DataFrames into one.

    Args:
        *dfs: One or more DataFrames. They are combined left to right with
            `DataFrame.union`, which matches columns by position, so every
            frame is expected to list its columns in the same order.

    Returns:
        A single DataFrame containing the rows of every input (duplicates kept).

    Raises:
        TypeError: If called with no DataFrames (reduce over an empty sequence).
    """
    # `DataFrame.unionAll` is deprecated on Spark 2.x and only forwards to
    # `union`; call `union` directly to avoid the deprecation warning.
    return reduce(DataFrame.union, dfs)

# Total number of price rows across all symbols once the frames are stacked.
unionAll(*price_df_lists).count()

115820

In [104]:
# Shut down the Spark session; run this only after every cell that uses `spark` or its DataFrames.
spark.stop()
