In [1]:
!pip install findspark

[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import findspark
# findspark.init() is called before `import pyspark` below; presumably it is what
# makes the local Spark installation importable — confirm against the findspark docs.
findspark.init()
# NOTE(review): the value returned by find() is neither assigned nor displayed here.
findspark.find()
import pyspark

In [3]:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf, array, col
from pyspark.sql.types import StructType, StructField, StringType
import pandas as pd
import random

#### LOAD SPARK SESSION

In [4]:
# Connection strings are read from the environment so that database credentials
# are never stored in the notebook. Export both variables before starting Jupyter:
#   export MONGODB_INPUT_URI="mongodb://<user>:<password>@<host>:<port>/<db>.runs"
#   export MONGO_OUTPUT_URI="mongodb://<user>:<password>@<host>:<port>/<db>.dummy_write"
# A missing variable raises KeyError here instead of failing later inside Spark.
import os

MONGODB_INPUT_URI = os.environ["MONGODB_INPUT_URI"]
MONGO_OUTPUT_URI = os.environ["MONGO_OUTPUT_URI"]

# Build (or reuse) the Spark session with the MongoDB connector on the classpath.
spark = (
    SparkSession.builder
    .config("spark.mongodb.input.uri", MONGODB_INPUT_URI)
    .config("spark.mongodb.output.uri", MONGO_OUTPUT_URI)
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.0')
    .getOrCreate()
)

# messages_df is a Spark DataFrame backed by the input collection.
messages_df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

# Pandas copy of the full collection. For a single-host setup a pandas DataFrame is
# more convenient than a Spark DataFrame, and the Spark SQL filter commands were
# causing issues, hence the switch to pandas.
# The Spark cluster here is a single host (the local system), so no performance
# difference between pandas and Spark SQL DataFrames is expected.
# NOTE(review): the variable is named `pandas`, which is easy to confuse with the
# library (imported as `pd`); the name is kept so existing references keep working.
pandas = messages_df.toPandas()

In [5]:
# Print the schema Spark derived for the loaded collection, to see which nested
# fields (e.g. entities.sentiment, likes.total) are available for selection.
messages_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- body: string (nullable = true)
 |-- conversation: struct (nullable = true)
 |    |-- parent_message_id: integer (nullable = true)
 |    |-- in_reply_to_message_id: null (nullable = true)
 |    |-- parent: boolean (nullable = true)
 |    |-- replies: integer (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- chart: struct (nullable = true)
 |    |    |-- thumb: string (nullable = true)
 |    |    |-- large: string (nullable = true)
 |    |    |-- original: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- sentiment: struct (nullable = true)
 |    |    |-- basic: string (nullable = true)
 |-- filters: struct (nullable = true)
 |    |-- day_counts: integer (nullable = true)
 |    |-- official_api: boolean (nullable = true)
 |-- id: integer (nullable = true)
 |-- likes: struct (nullable = true)
 |    |-- total: integer

### Create new dataframe with only required columns and filtered rows

In [6]:
# Select only the fields needed for the analysis, drop rows missing any of the
# mandatory ones, and bring the result into pandas. Nested fields are exposed
# under their leaf name (sentiment, symbol, total, reshared_count).
required = messages_df[['_id','body','created_at','entities.sentiment','symbols.symbol', 'likes.total', 'reshares.reshared_count']].dropna(subset=['_id','body','created_at','symbol']).toPandas()
required['created_at'] = pd.to_datetime(required['created_at'])

# Treat missing like / reshare counts as zero. The filled column is assigned back
# instead of calling fillna(inplace=True) on a column selection: that chained form
# is deprecated and does not modify the frame under pandas Copy-on-Write.
required['total'] = required['total'].fillna(value=0)
required['reshared_count'] = required['reshared_count'].fillna(value=0)
required['interaction_count'] = required['total'] + required['reshared_count']

# .copy() makes the subset an independent frame so the writes below do not
# trigger SettingWithCopyWarning.
required = required[['_id', 'body', 'created_at', 'sentiment', 'symbol', 'interaction_count']].copy()

# Encoding sentiment: 0 = no sentiment label, 1 = Bearish, 2 = Bullish
required['sentiment'] = required['sentiment'].fillna(value=0)

required.loc[required['sentiment'] == Row(basic="Bearish"),'sentiment']=1
required.loc[required['sentiment'] == Row(basic="Bullish"),'sentiment']=2
# Rows with a null sentiment are kept: neutral tweets could still contribute to the underlying signal

#### Count of valid tweets/day

In [7]:
# Number of retained tweets per calendar day, in chronological order.
(
    required['created_at']
    .dt.normalize()
    .value_counts()
    .sort_index()
)

2020-03-08 00:00:00+00:00       1
2020-03-09 00:00:00+00:00       1
2020-03-10 00:00:00+00:00       1
2020-03-11 00:00:00+00:00       1
2020-03-12 00:00:00+00:00       1
2020-03-13 00:00:00+00:00       3
2020-03-14 00:00:00+00:00       4
2020-03-15 00:00:00+00:00       2
2020-03-16 00:00:00+00:00       4
2020-03-17 00:00:00+00:00       7
2020-03-18 00:00:00+00:00       9
2020-03-19 00:00:00+00:00      25
2020-03-20 00:00:00+00:00      19
2020-03-21 00:00:00+00:00      29
2020-03-22 00:00:00+00:00     159
2020-03-23 00:00:00+00:00     400
2020-03-24 00:00:00+00:00     697
2020-03-25 00:00:00+00:00     956
2020-03-26 00:00:00+00:00    1074
2020-03-27 00:00:00+00:00    1722
2020-03-28 00:00:00+00:00    2626
2020-03-29 00:00:00+00:00    1709
2020-03-30 00:00:00+00:00    2099
2020-03-31 00:00:00+00:00    1539
2020-04-01 00:00:00+00:00    1831
2020-04-02 00:00:00+00:00    1912
2020-04-03 00:00:00+00:00    2032
2020-04-04 00:00:00+00:00    1223
2020-04-05 00:00:00+00:00    1041
2020-04-06 00:

#### Distribution of sentiments

In [8]:
# Tweet count per encoded sentiment value, ordered by the encoded value.
(
    required['sentiment']
    .value_counts()
    .sort_index()
)

0    18968
1     3878
2    14540
Name: sentiment, dtype: int64

In [9]:
#df.write.format("mongo").mode("append").option("database","DB_NAME").option("collection", "COLLECTION_NAME").save() 
# Issue: the write above fails without a defined schema, so the analysis is done in the notebook and the results are stored on localhost instead.

In [10]:
# Spot-check one row by position. The saved output for this row shows a tweet whose
# `symbol` field is a list of several symbols.
# NOTE(review): 5701 is a hard-coded position; it only points at the same tweet
# while the loaded data is unchanged.
required.iloc[5701]

_id                                        (5e8039f998273b042b919a41,)
body                 $BTC.X $ETH.X $XRP.X $BCH.X $EOS.X: Crypto Upd...
created_at                                   2020-03-26 13:10:29+00:00
sentiment                                                            0
symbol                             [BTC.X, ETH.X, XRP.X, EOS.X, BCH.X]
interaction_count                                                    0
Name: 5701, dtype: object

### Windowing


In [11]:
# TODO: groupby (windowing) should be done over symbols and timestamp!
# But what to do for tweets tagged with several symbols? They cannot be grouped
# under more than one symbol, so one is picked at random for now.
# Note that the tweets are already duplicated in the source data.
# The list is also replaced by a plain string because lists are unhashable and
# cannot be used as a groupby key.

# Fixed seed so that Restart & Run All assigns the same symbol to every tweet.
SYMBOL_PICK_SEED = 42
_symbol_rng = random.Random(SYMBOL_PICK_SEED)


def _pick_symbol(symbols):
    """Reduce a tweet's symbols to a single symbol.

    Parameters
    ----------
    symbols : sequence of str, or str
        The symbols attached to one tweet. A plain string means the value was
        already reduced by an earlier run of this cell.

    Returns
    -------
    str
        The only symbol if there is exactly one, otherwise one chosen at random
        with the seeded generator.

    Raises
    ------
    ValueError
        If `symbols` is an empty sequence.
    """
    # Re-running the cell must not index into an already-picked symbol string.
    if isinstance(symbols, str):
        return symbols
    if len(symbols) == 1:
        return symbols[0]
    return symbols[_symbol_rng.randrange(len(symbols))]


required['symbol'] = required['symbol'].apply(_pick_symbol)

In [12]:
# Use tweets' likes/reshares to weigh grouped sentiments

# Some tweets have 0 interaction count; give them weight 1 so that multiplying
# does not wipe out their sentiment.
required.loc[required['interaction_count'] == 0, 'interaction_count'] = 1

# Ignore unlabelled tweets in the weighting: if sentiment is 0, the interaction
# count is set to 0, which makes the weighted sentiment 0 as well.
required.loc[required['sentiment'] == 0, 'interaction_count'] = 0

required['weighted_sentiment'] = required['sentiment'] * required['interaction_count']

# One group per (calendar day, hour of day, symbol); built once and reused below.
hourly_symbol_keys = [
    required.created_at.dt.date,
    required.created_at.dt.hour,
    required.symbol,
]

required['sum_group_interaction'] = (
    required['interaction_count'].groupby(hourly_symbol_keys).transform('sum')
)

required['sum_group_weighted_sentiment'] = (
    required['weighted_sentiment'].groupby(hourly_symbol_keys).transform('sum')
)

# Interaction-weighted mean sentiment of each group.
required['overall_group_sentiment'] = required['sum_group_weighted_sentiment'] / required['sum_group_interaction']

# Groups whose ratio is undefined get 0. The filled column is assigned back rather
# than using fillna(inplace=True) on a column selection, which is deprecated and
# has no effect on the frame under pandas Copy-on-Write.
required['overall_group_sentiment'] = required['overall_group_sentiment'].fillna(value=0)

In [14]:
# Keep only the columns needed downstream. .copy() makes the subset an independent
# frame, so adding a column below does not raise SettingWithCopyWarning.
required = required[['_id','symbol', 'created_at', 'overall_group_sentiment']].copy()

# Number of tweets in each (calendar day, hour of day, symbol) group, repeated on
# every row of the group.
required['volume_tweets'] = required['_id'].groupby([required.created_at.dt.date,
                                                     required.created_at.dt.hour,
                                                     required.symbol]).transform('count')

required

Unnamed: 0,_id,symbol,created_at,overall_group_sentiment,volume_tweets
0,"(5e7ad06b80f71592bb059da0,)",BTC.X,2020-03-25 03:29:55+00:00,1.666667,13
1,"(5e7ad06b80f71592bb059da1,)",BTC.X,2020-03-25 03:29:20+00:00,1.666667,13
2,"(5e7ad06b80f71592bb059da2,)",BTC.X,2020-03-25 03:26:57+00:00,1.666667,13
3,"(5e7ad06b80f71592bb059da3,)",BTC.X,2020-03-25 03:22:51+00:00,1.666667,13
4,"(5e7ad06b80f71592bb059da4,)",BTC.X,2020-03-25 03:17:26+00:00,1.666667,13
...,...,...,...,...,...
37381,"(5e974c33acad2dbd709bddaf,)",LTC.X,2020-04-13 17:57:20+00:00,2.000000,47
37382,"(5e974c33acad2dbd709bddb0,)",LTC.X,2020-04-13 17:25:18+00:00,2.000000,47
37383,"(5e974c33acad2dbd709bddb1,)",LTC.X,2020-04-13 16:23:32+00:00,0.000000,22
37384,"(5e974c33acad2dbd709bddb2,)",XTZ.X,2020-04-13 15:05:08+00:00,0.000000,14


In [53]:
# Group the tweets by calendar day, hour of day and symbol.
grouped = required.groupby(
    by=[
        required['created_at'].dt.date,
        required['created_at'].dt.hour,
        required['symbol'],
    ]
)

#### Get each group

In [76]:
# One row per group: take the first entry of every group, label the three index
# levels, then flatten the index back into ordinary columns.
# Note: `created_at` here is the timestamp of each group's first row, not the
# start of the hour.
first_row_groups = grouped.first()
groups_df = (
    pd.DataFrame(first_row_groups)
    .rename_axis(index=['day', 'hour', 'symbol'])
    .reset_index()
    [['symbol', 'created_at', 'overall_group_sentiment', 'volume_tweets']]
)
groups_df

Unnamed: 0,symbol,created_at,overall_group_sentiment,volume_tweets
0,XMR.X,2020-03-08 02:50:19+00:00,0.000000,1
1,XMR.X,2020-03-09 12:21:04+00:00,0.000000,1
2,XMR.X,2020-03-10 22:12:04+00:00,0.000000,1
3,XMR.X,2020-03-11 08:30:03+00:00,0.000000,1
4,XMR.X,2020-03-12 08:30:30+00:00,0.000000,1
...,...,...,...,...
2985,BNB.X,2020-04-15 17:31:27+00:00,0.000000,1
2986,BTC.X,2020-04-15 17:59:43+00:00,1.909091,25
2987,ETH.X,2020-04-15 17:01:56+00:00,0.000000,1
2988,GBTC,2020-04-15 17:20:29+00:00,0.000000,2


### Historical Price Data

In [39]:
# Hourly price history, one CSV per asset.
BTC_prices = pd.read_csv("data/gemini_BTCUSD_1hr.csv")
# Fixed: ETH was previously loaded from the BTC file (copy-paste slip).
# NOTE(review): the ETH file name follows the naming of the other Gemini files —
# confirm it matches the file in data/.
ETH_prices = pd.read_csv("data/gemini_ETHUSD_1hr.csv")
LTC_prices = pd.read_csv("data/gemini_LTCUSD_1hr.csv")
BCH_prices = pd.read_csv("data/Bitbay_BCHUSD_1h.csv")
DOGE_prices = pd.read_csv("data/Yobit_DOGERUR_1h.csv")


# The Gemini files carry a millisecond Unix timestamp column.
BTC_prices['Unix Timestamp'] = pd.to_datetime(BTC_prices['Unix Timestamp'],unit='ms')
ETH_prices['Unix Timestamp'] = pd.to_datetime(ETH_prices['Unix Timestamp'],unit='ms')
LTC_prices['Unix Timestamp'] = pd.to_datetime(LTC_prices['Unix Timestamp'],unit='ms')
# The Bitbay and Yobit files carry a text 'Date' column with a 12-hour clock and
# an AM/PM marker, parsed with the explicit format below.
BCH_prices['Date'] = pd.to_datetime(BCH_prices['Date'], format='%Y-%m-%d %I-%p')
DOGE_prices['Date'] = pd.to_datetime(DOGE_prices['Date'], format='%Y-%m-%d %I-%p')

#### Data Cleaning: DOGE (RUR to USD)

In [43]:
# Fixed conversion rate applied to every row.
# NOTE(review): presumably USD per one RUR; a single constant ignores exchange-rate
# movement over the period covered by the file — confirm this is acceptable.
RUR_USD = 0.013

DOGE_PRICE_COLUMNS = ['Open', 'High', 'Low', 'Close']

# Convert only if the frame has not been converted yet, so that re-running this
# cell does not multiply the prices by the rate a second time.
if not (DOGE_prices['Symbol'] == 'DOGEUSD').all():
    DOGE_prices[DOGE_PRICE_COLUMNS] = DOGE_prices[DOGE_PRICE_COLUMNS] * RUR_USD
    DOGE_prices['Symbol'] = 'DOGEUSD'

In [44]:
# Display the frame after conversion to check the relabelled symbol and the
# rescaled price columns.
# NOTE(review): consider DOGE_prices.head() to keep the saved output small.
DOGE_prices

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume DOG,Volume ERUR
0,2018-08-22 23:00:00,DOGEUSD,0.002133,0.002133,0.002129,0.002129,0.00,0.00
1,2018-08-22 22:00:00,DOGEUSD,0.002145,0.002145,0.002133,0.002133,94441.50,15518.09
2,2018-08-22 21:00:00,DOGEUSD,0.002139,0.002150,0.002136,0.002145,53362.95,8771.07
3,2018-08-22 20:00:00,DOGEUSD,0.002153,0.002172,0.002139,0.002139,102170.60,16907.12
4,2018-08-22 19:00:00,DOGEUSD,0.002153,0.002174,0.002153,0.002153,164307.95,27223.80
...,...,...,...,...,...,...,...,...
7176,2017-10-26 22:00:00,DOGEUSD,0.000771,0.000771,0.000771,0.000771,0.00,0.00
7177,2017-10-26 21:00:00,DOGEUSD,0.000771,0.000771,0.000771,0.000771,0.00,0.00
7178,2017-10-26 20:00:00,DOGEUSD,0.000771,0.000771,0.000771,0.000771,0.00,0.00
7179,2017-10-26 19:00:00,DOGEUSD,0.000771,0.000771,0.000771,0.000771,0.00,0.00


#### Join with price data

## PERFORM AT END OF SESSION

In [18]:
# Shut down the Spark session created at the top of the notebook; run this last.
# NOTE(review): cells that use `spark` or `messages_df` presumably need the session
# to be recreated after this call — confirm before re-running them.
spark.stop()