In [235]:
# Bootstrap Spark for this kernel: findspark.init() is called before pyspark is imported.
# NOTE(review): presumably init() is what makes the local pyspark importable — keep this order.
import findspark
findspark.init()
# NOTE(review): the return value of find() is discarded here (it is not the cell's last
# expression), so this call has no visible effect in the notebook.
findspark.find()
import pyspark

In [236]:
import os
import sys

import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf, array, col
from pyspark.sql.types import StructType, StructField, StringType

#### LOAD SPARK SESSION

In [237]:
# MongoDB connection strings are read from the environment so that database
# credentials are never stored in (or committed with) the notebook.
# Export both variables before starting the kernel, each as a full
# "mongodb://<user>:<password>@<host>:<port>/<database>.<collection>" URI.
# NOTE(review): any credentials that were previously embedded in this cell
# should be considered leaked and rotated.
_missing_uris = [
    name for name in ("MONGODB_INPUT_URI", "MONGO_OUTPUT_URI")
    if not os.environ.get(name)
]
if _missing_uris:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_uris)
    )
MONGODB_INPUT_URI = os.environ["MONGODB_INPUT_URI"]
MONGO_OUTPUT_URI = os.environ["MONGO_OUTPUT_URI"]

# Build (or reuse) the Spark session with the MongoDB connector on the classpath.
spark = (
    SparkSession.builder
    .config("spark.mongodb.input.uri", MONGODB_INPUT_URI)
    .config("spark.mongodb.output.uri", MONGO_OUTPUT_URI)
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.0')
    .getOrCreate()
)

# messages_df is a Spark DataFrame over the input collection; `pandas` is the same
# data collected into a pandas DataFrame (the pandas module itself is imported as `pd`).
messages_df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
pandas = messages_df.toPandas()
# For single-host use a pandas DataFrame has more utility than a Spark DataFrame.
# There was an issue with the SQL filter commands, hence resorting to pandas.
# Our Spark cluster consists of a single host [local system], so we will not notice
# any performance difference between a pandas DataFrame and a Spark SQL DataFrame.

### Create new dataframe with only required columns and filtered rows

In [230]:
# Keep only the columns needed downstream. Selecting the nested fields
# 'entities.sentiment' and 'symbols.symbol' yields columns named 'sentiment'
# and 'symbol', which is why the dropna subset refers to 'symbol'.
required = (
    messages_df[['_id', 'body', 'created_at', 'entities.sentiment', 'symbols.symbol']]
    .dropna(subset=['_id', 'body', 'created_at', 'symbol'])
    .toPandas()
)
required['created_at'] = pd.to_datetime(required['created_at'])

# Encoding sentiment: 0 = no (or unrecognised) sentiment tag, 1 = Bearish, 2 = Bullish.
# The column is assigned back explicitly: calling fillna(..., inplace=True) on
# required['sentiment'] is a chained in-place update that does not modify the frame
# under pandas Copy-on-Write. Reading the `basic` field with getattr also handles
# missing values (None/NaN), which have no such attribute and fall through to 0.
SENTIMENT_CODES = {"Bearish": 1, "Bullish": 2}
required['sentiment'] = required['sentiment'].map(
    lambda tag: SENTIMENT_CODES.get(getattr(tag, 'basic', None), 0)
)
# Not removing null values for sentiment. Neutral tweets could still contribute to the underlying signal

#### Count of valid tweets/day

In [232]:
# Count the valid tweets on each calendar day (timestamps truncated to midnight),
# then show the counts in chronological order.
tweets_per_day = required['created_at'].dt.normalize().value_counts()
tweets_per_day.sort_index()

2020-03-08 00:00:00+00:00       1
2020-03-09 00:00:00+00:00       1
2020-03-10 00:00:00+00:00       1
2020-03-11 00:00:00+00:00       1
2020-03-12 00:00:00+00:00       1
2020-03-13 00:00:00+00:00       3
2020-03-14 00:00:00+00:00       4
2020-03-15 00:00:00+00:00       2
2020-03-16 00:00:00+00:00       4
2020-03-17 00:00:00+00:00       7
2020-03-18 00:00:00+00:00       9
2020-03-19 00:00:00+00:00      25
2020-03-20 00:00:00+00:00      19
2020-03-21 00:00:00+00:00      29
2020-03-22 00:00:00+00:00     159
2020-03-23 00:00:00+00:00     400
2020-03-24 00:00:00+00:00     697
2020-03-25 00:00:00+00:00     956
2020-03-26 00:00:00+00:00    1074
2020-03-27 00:00:00+00:00    1722
2020-03-28 00:00:00+00:00    2626
2020-03-29 00:00:00+00:00    1705
2020-03-30 00:00:00+00:00    2077
2020-03-31 00:00:00+00:00    1500
2020-04-01 00:00:00+00:00    1655
2020-04-02 00:00:00+00:00    1700
2020-04-03 00:00:00+00:00    1603
2020-04-04 00:00:00+00:00     596
2020-04-05 00:00:00+00:00      89
Name: created_

In [None]:
#df.write.format("mongo").mode("append").option("database","DB_NAME").option("collection", "COLLECTION_NAME").save() 
#Issue here: the write fails without a defined schema, so prefer to perform the analysis in the notebook and store on localhost

In [204]:
# Print the schema of the loaded Spark DataFrame; nested struct fields shown here are
# the ones addressed with dotted paths (e.g. 'entities.sentiment') when selecting columns.
messages_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- body: string (nullable = true)
 |-- conversation: struct (nullable = true)
 |    |-- parent_message_id: integer (nullable = true)
 |    |-- in_reply_to_message_id: null (nullable = true)
 |    |-- parent: boolean (nullable = true)
 |    |-- replies: integer (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- chart: struct (nullable = true)
 |    |    |-- thumb: string (nullable = true)
 |    |    |-- large: string (nullable = true)
 |    |    |-- original: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- sentiment: struct (nullable = true)
 |    |    |-- basic: string (nullable = true)
 |-- filters: struct (nullable = true)
 |    |-- day_counts: integer (nullable = true)
 |    |-- official_api: boolean (nullable = true)
 |-- id: integer (nullable = true)
 |-- likes: struct (nullable = true)
 |    |-- total: integer

In [231]:
# Display the filtered pandas frame (id, body, timestamp, encoded sentiment, symbols);
# the notebook renders only the first and last rows of a long frame.
required

Unnamed: 0,_id,body,created_at,sentiment,symbol
0,"(5e7ad06b80f71592bb059da0,)",$BTC.X When Stalin and the Red Army were closi...,2020-03-25 03:29:55+00:00,0,[BTC.X]
1,"(5e7ad06b80f71592bb059da1,)",$BTC.X Bitcoin&#39;s role in South Korea&#39;s...,2020-03-25 03:29:20+00:00,0,[BTC.X]
2,"(5e7ad06b80f71592bb059da2,)",$BTC.X Mike Novogratz Says Gold&#39;s Rally Is...,2020-03-25 03:26:57+00:00,0,[BTC.X]
3,"(5e7ad06b80f71592bb059da4,)",$BTC.X holding up nicely!,2020-03-25 03:17:26+00:00,2,[BTC.X]
4,"(5e7ad06b80f71592bb059da3,)",$BTC.X whats this &quot;digital dollar nonsens...,2020-03-25 03:22:51+00:00,0,[BTC.X]
...,...,...,...,...,...
18661,"(5e8950eace597485bd860077,)",$BTC.X $LTC.X $LINK.X Unchain Bitcoin and let ...,2020-04-03 09:01:37+00:00,2,"[BTC.X, LTC.X, LINK.X]"
18662,"(5e8950eace597485bd860078,)",$BTC.X $LTC.X $LINK.X Bears if Bitcoin goes to...,2020-04-03 08:54:55+00:00,1,"[BTC.X, LTC.X, LINK.X]"
18663,"(5e8950eace597485bd860079,)",$BTC.X $LTC.X $LINK.X Bulls if Bitcoin goes to...,2020-04-03 08:53:58+00:00,1,"[BTC.X, LTC.X, LINK.X]"
18664,"(5e8950eace597485bd86007a,)",$BTC.X $LTC.X $LINK.X Bulls if Bitcoin goes pa...,2020-04-03 08:50:17+00:00,2,"[BTC.X, LTC.X, LINK.X]"


In [243]:
# Load the BTCUSD price candles (Open/High/Low/Close/Volume per row) from the local CSV.
PRICES_CSV = "./prices.csv"
prices = pd.read_csv(PRICES_CSV)
# NOTE(review): the displayed 'Unix Timestamp' values look unreliable — rows dated 2015
# in 'Date' show 1970 timestamps and consecutive 2020 rows share the same timestamp —
# so confirm the column's unit/precision before enabling the conversion below; the
# 'Date' column appears to carry the usable hourly time.
#prices['Unix Timestamp'] = pd.to_datetime(prices['Unix Timestamp'],unit='ms')

In [251]:
prices

Unnamed: 0,Unix Timestamp,Date,Symbol,Open,High,Low,Close,Volume
0,2020-03-21 00:20:00.000,3/21/20 0:00,BTCUSD,6205.18,6223.78,6205.18,6214.69,0.171939
1,2020-03-21 00:20:00.000,3/20/20 23:00,BTCUSD,6163.95,6237.54,6078.50,6205.18,148.716812
2,2020-03-20 21:33:20.000,3/20/20 22:00,BTCUSD,6245.47,6370.61,6065.00,6163.95,465.573263
3,2020-03-20 21:33:20.000,3/20/20 21:00,BTCUSD,5973.35,6257.61,5924.45,6245.47,163.673391
4,2020-03-20 18:46:40.000,3/20/20 20:00,BTCUSD,6200.11,6200.11,5660.77,5973.35,958.239215
...,...,...,...,...,...,...,...,...
39004,1970-01-17 17:12:03.600,10/8/15 17:00,BTCUSD,244.25,244.99,244.02,244.99,3.920632
39005,1970-01-17 17:12:00.000,10/8/15 16:00,BTCUSD,244.92,244.92,244.25,244.25,3.895252
39006,1970-01-17 17:11:56.400,10/8/15 15:00,BTCUSD,245.00,245.00,244.92,244.92,3.016926
39007,1970-01-17 17:11:52.800,10/8/15 14:00,BTCUSD,245.00,245.00,244.50,245.00,4.453649


## PERFORM AT END OF SESSION

In [234]:
# Stop the SparkSession (`spark`) created in the load cell. Run this last:
# Spark DataFrames such as messages_df presumably cannot be evaluated once the
# session has been stopped — re-run the load cell to continue working.
spark.stop()