In [1]:
!pip install findspark

[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import findspark
findspark.init()
findspark.find()
import pyspark

In [3]:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf, array, col
from pyspark.sql.types import StructType, StructField, StringType
import pandas as pd

#### LOAD SPARK SESSION

In [4]:
MONGODB_INPUT_URI = "mongodb://heroku_kvptfcm8:vbekldoic9poi92kkp810rvk7@ds141185.mlab.com:41185/heroku_kvptfcm8.runs"
MONGO_OUTPUT_URI = "mongodb://heroku_kvptfcm8:vbekldoic9poi92kkp810rvk7@ds141185.mlab.com:41185/heroku_kvptfcm8.dummy_write"
spark = SparkSession.builder.config("spark.mongodb.input.uri", MONGODB_INPUT_URI).config("spark.mongodb.output.uri", MONGO_OUTPUT_URI).config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.0').getOrCreate()
messages_df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
pandas = messages_df.toPandas() 
#messages_df is a spark df whereas pandas is a pandas dataframe. For a single host functionality, pandas df has more utility than spark df
#Issue with sql filter commands, hence resorting to pandas.
#Our spark cluster contains of a single host [local system], hence we will not notice any performance difference between a pandas dataframe and a spark sql dataframe 

In [5]:
messages_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- body: string (nullable = true)
 |-- conversation: struct (nullable = true)
 |    |-- parent_message_id: integer (nullable = true)
 |    |-- in_reply_to_message_id: null (nullable = true)
 |    |-- parent: boolean (nullable = true)
 |    |-- replies: integer (nullable = true)
 |-- created_at: string (nullable = true)
 |-- cursor: struct (nullable = true)
 |    |-- more: boolean (nullable = true)
 |    |-- since: integer (nullable = true)
 |    |-- max: integer (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- chart: struct (nullable = true)
 |    |    |-- thumb: string (nullable = true)
 |    |    |-- large: string (nullable = true)
 |    |    |-- original: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- sentiment: struct (nullable = true)
 |    |    |-- basic: string (nullable = true)
 |-- filters: struct (nullable = true)
 |    |-- day_counts: integer 

### Create new dataframe with only required columns and filtered rows

In [6]:
required = messages_df[['_id','body','created_at','entities.sentiment','symbols.symbol', 'likes.total', 'reshares.reshared_count']].dropna(subset=['_id','body','created_at','symbol']).toPandas()
required['created_at'] = pd.to_datetime(required['created_at'])

required['total'].fillna(value=0, inplace=True)
required['reshared_count'].fillna(value=0, inplace=True)
required['interaction_count'] = required['total'] + required['reshared_count']

required = required[['_id', 'body', 'created_at', 'sentiment', 'symbol', 'interaction_count']]

#Encoding sentiment
required['sentiment'].fillna(value=0, inplace=True)

required.loc[required['sentiment'] == Row(basic="Bearish"),'sentiment']=1
required.loc[required['sentiment'] == Row(basic="Bullish"),'sentiment']=2
#Not removing null values for sentiment. Neutral tweets could still contribute to the underlying signal

#### Count of valid tweets/day

In [7]:
required['created_at'].dt.normalize().value_counts().sort_index()

2020-03-08 00:00:00+00:00       1
2020-03-09 00:00:00+00:00       1
2020-03-10 00:00:00+00:00       1
2020-03-11 00:00:00+00:00       1
2020-03-12 00:00:00+00:00       1
2020-03-13 00:00:00+00:00       3
2020-03-14 00:00:00+00:00       4
2020-03-15 00:00:00+00:00       2
2020-03-16 00:00:00+00:00       4
2020-03-17 00:00:00+00:00       7
2020-03-18 00:00:00+00:00       9
2020-03-19 00:00:00+00:00      25
2020-03-20 00:00:00+00:00      19
2020-03-21 00:00:00+00:00      29
2020-03-22 00:00:00+00:00     159
2020-03-23 00:00:00+00:00     400
2020-03-24 00:00:00+00:00     697
2020-03-25 00:00:00+00:00     956
2020-03-26 00:00:00+00:00    1074
2020-03-27 00:00:00+00:00    1722
2020-03-28 00:00:00+00:00    2626
2020-03-29 00:00:00+00:00    1709
2020-03-30 00:00:00+00:00    2099
2020-03-31 00:00:00+00:00    1535
2020-04-01 00:00:00+00:00    1798
2020-04-02 00:00:00+00:00    1830
2020-04-03 00:00:00+00:00    1900
2020-04-04 00:00:00+00:00    1000
2020-04-05 00:00:00+00:00     743
2020-04-06 00:

#### Distribution of sentiments

In [8]:
required['sentiment'].value_counts().sort_index()

0    9768
1    2244
2    8454
Name: sentiment, dtype: int64

In [9]:
#df.write.format("mongo").mode("append").option("database","DB_NAME").option("collection", "COLLECTION_NAME").save() 
#issue here without a defined schema : prefer to perform analysis on notebook and store on localhost

In [10]:
required.iloc[10:50]

Unnamed: 0,_id,body,created_at,sentiment,symbol,interaction_count
10,"(5e7ad06b80f71592bb059daa,)",$BTC.X H&amp;S hourly,2020-03-25 03:02:24+00:00,1,[BTC.X],0.0
11,"(5e7ad06b80f71592bb059dab,)","$BTC.X bitcoin, gold and silver = finite\nStoc...",2020-03-25 03:00:19+00:00,2,[BTC.X],0.0
12,"(5e7ad06b80f71592bb059dac,)",$BTC.X.,2020-03-25 03:00:17+00:00,0,[BTC.X],1.0
13,"(5e7ad06b80f71592bb059dad,)",$BTC.X Why they can’t inject $1T to bitty sh...,2020-03-25 02:57:22+00:00,0,[BTC.X],3.0
14,"(5e7ad06b80f71592bb059dae,)",$BTC.X if we soon inject 6T in the market is i...,2020-03-25 02:54:35+00:00,0,[BTC.X],0.0
15,"(5e7ad06b80f71592bb059daf,)",$BTC.X dip,2020-03-25 02:52:17+00:00,2,[BTC.X],0.0
16,"(5e7ad06b80f71592bb059db0,)",$BTC.X Our current look.,2020-03-25 02:51:25+00:00,0,[BTC.X],1.0
17,"(5e7ad06b80f71592bb059db1,)",$BTC.X $6200ish buy,2020-03-25 02:50:12+00:00,2,[BTC.X],2.0
18,"(5e7ad06b80f71592bb059db2,)",$BTC.X cash out and come back.,2020-03-25 02:49:59+00:00,0,[BTC.X],0.0
19,"(5e7ad06b80f71592bb059db3,)",$BTC.X 100% of us are getting the Corona virus...,2020-03-25 02:48:28+00:00,1,[BTC.X],2.0


### Windowing

In [13]:
# # Set the datetime column as the index
# required.index = required['created_at'] 

required['weighted_sentiment'] = required['sentiment'] * required['interaction_count']
# some tweets may have 0 interaction count, and we don't want to obliterate the sentiment data
required.loc[required['weighted_sentiment'] ==0,'weighted_sentiment']= required['sentiment']

grouped = required.groupby([required.created_at.dt.date, required.created_at.dt.hour])

required['sum_group_interaction'] = required['interaction_count'].groupby([required.created_at.dt.date, 
                                                                    required.created_at.dt.hour]).transform('sum')

required['sum_group_weighted_sentiment'] = required['weighted_sentiment'].groupby([required.created_at.dt.date, 
                                                                    required.created_at.dt.hour]).transform('sum')

required['overall_group_sentiment'] = required['sum_group_weighted_sentiment'] / required['sum_group_interaction']



# for group in grouped:
#     print(group)
#     print("\n\n ============================================== \n\n")
required.loc[required['overall_group_sentiment'] > 2.0]

Unnamed: 0,_id,body,created_at,sentiment,symbol,interaction_count,weighted_sentiment,sum_group_interaction,sum_group_weighted_sentiment,overall_group_sentiment
46,"(5e7ad06b80f71592bb059dce,)","$BSV.X I’m out, tomorrow will be ugly",2020-03-23 04:57:23+00:00,1,[BSV.X],0.0,1,0.0,39.0,inf
50,"(5e7ad06b80f71592bb059dd2,)",$BSV.X $BTC.X all y’all’s need to grow a pair....,2020-03-22 22:19:59+00:00,2,"[BTC.X, BSV.X]",5.0,10,49.0,118.0,2.408163
51,"(5e7ad06b80f71592bb059dd3,)",$BSV.X held nicely...looks like it’s going up!...,2020-03-22 22:13:06+00:00,2,[BSV.X],2.0,4,49.0,118.0,2.408163
52,"(5e7ad06b80f71592bb059dd4,)",$BSV.X reversal pattern. Imo buy now and pros...,2020-03-22 20:24:22+00:00,2,[BSV.X],4.0,8,22.0,50.0,2.272727
53,"(5e7ad06b80f71592bb059dd5,)",$BSV.X $BTC.X 🌷🌷🌷\n\nhttps://coingeek.com/ehr-...,2020-03-22 20:22:50+00:00,2,"[BTC.X, BSV.X]",0.0,2,22.0,50.0,2.272727
...,...,...,...,...,...,...,...,...,...,...
20426,"(5e8abe9efaf9e773207ff654,)","After Trashing Bitcoin, McAfee Is Promoting Th...",2020-04-06 00:02:20+00:00,0,"[BTC.X, ETH.X, XMR.X]",0.0,0,0.0,8.0,inf
20439,"(5e8abe9efaf9e773207ff661,)",$LTC.X https://investorsq.com/2020/04/04/litec...,2020-04-06 03:07:57+00:00,0,[LTC.X],0.0,0,12.0,30.0,2.500000
20441,"(5e8abe9efaf9e773207ff663,)",$LTC.X one would think the closer we get to BT...,2020-04-06 01:46:55+00:00,2,[LTC.X],0.0,2,2.0,19.0,9.500000
20442,"(5e8abe9efaf9e773207ff664,)",$LTC.X https://investorsq.com/2020/04/04/litec...,2020-04-06 01:34:45+00:00,0,[LTC.X],0.0,0,2.0,19.0,9.500000


In [12]:
prices = pd.read_csv("./prices.csv") 
#prices['Unix Timestamp'] = pd.to_datetime(prices['Unix Timestamp'],unit='ms')

FileNotFoundError: [Errno 2] File ./prices.csv does not exist: './prices.csv'

In [None]:
prices

## PERFORM AT END OF SESSION

In [None]:
spark.stop()