In [1]:
!pip install findspark

[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import findspark
findspark.init()
findspark.find()
import pyspark

In [3]:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf, array, col
from pyspark.sql.types import StructType, StructField, StringType
import pandas as pd
import random

#### LOAD SPARK SESSION

In [4]:
MONGODB_INPUT_URI = "mongodb://heroku_kvptfcm8:vbekldoic9poi92kkp810rvk7@ds141185.mlab.com:41185/heroku_kvptfcm8.runs"
MONGO_OUTPUT_URI = "mongodb://heroku_kvptfcm8:vbekldoic9poi92kkp810rvk7@ds141185.mlab.com:41185/heroku_kvptfcm8.dummy_write"
spark = SparkSession.builder.config("spark.mongodb.input.uri", MONGODB_INPUT_URI).config("spark.mongodb.output.uri", MONGO_OUTPUT_URI).config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.0').getOrCreate()
messages_df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
pandas = messages_df.toPandas() 
#messages_df is a spark df whereas pandas is a pandas dataframe. For a single host functionality, pandas df has more utility than spark df
#Issue with sql filter commands, hence resorting to pandas.
#Our spark cluster contains of a single host [local system], hence we will not notice any performance difference between a pandas dataframe and a spark sql dataframe 

In [5]:
SYMBOLS_LIST = ['BTC.X', 'BSV.X', 'BCH.X', 'LTC.X', 'ETH.X', 'DOGE.X']

In [6]:
messages_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- body: string (nullable = true)
 |-- conversation: struct (nullable = true)
 |    |-- parent_message_id: integer (nullable = true)
 |    |-- in_reply_to_message_id: null (nullable = true)
 |    |-- parent: boolean (nullable = true)
 |    |-- replies: integer (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- chart: struct (nullable = true)
 |    |    |-- thumb: string (nullable = true)
 |    |    |-- large: string (nullable = true)
 |    |    |-- original: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- sentiment: struct (nullable = true)
 |    |    |-- basic: string (nullable = true)
 |-- filters: struct (nullable = true)
 |    |-- day_counts: integer (nullable = true)
 |    |-- official_api: boolean (nullable = true)
 |-- id: integer (nullable = true)
 |-- likes: struct (nullable = true)
 |    |-- total: integer

### Create new dataframe with only required columns and filtered rows

In [7]:
required = messages_df[['_id','body','created_at','entities.sentiment','symbols.symbol', 'likes.total', 'reshares.reshared_count']].dropna(subset=['_id','body','created_at','symbol']).toPandas()
required['created_at'] = pd.to_datetime(required['created_at'])

required['total'].fillna(value=0, inplace=True)
required['reshared_count'].fillna(value=0, inplace=True)
required['interaction_count'] = required['total'] + required['reshared_count']

required = required[['_id', 'body', 'created_at', 'sentiment', 'symbol', 'interaction_count']]

#Encoding sentiment
required['sentiment'].fillna(value=0, inplace=True)

required.loc[required['sentiment'] == Row(basic="Bearish"),'sentiment']=1
required.loc[required['sentiment'] == Row(basic="Bullish"),'sentiment']=2
#Not removing null values for sentiment. Neutral tweets could still contribute to the underlying signal

#### Count of valid tweets/day

In [8]:
required['created_at'].dt.normalize().value_counts().sort_index()

2020-03-08 00:00:00+00:00       1
2020-03-09 00:00:00+00:00       1
2020-03-10 00:00:00+00:00       1
2020-03-11 00:00:00+00:00       1
2020-03-12 00:00:00+00:00       1
2020-03-13 00:00:00+00:00       3
2020-03-14 00:00:00+00:00       4
2020-03-15 00:00:00+00:00       2
2020-03-16 00:00:00+00:00       4
2020-03-17 00:00:00+00:00       7
2020-03-18 00:00:00+00:00       9
2020-03-19 00:00:00+00:00      25
2020-03-20 00:00:00+00:00      19
2020-03-21 00:00:00+00:00      29
2020-03-22 00:00:00+00:00     159
2020-03-23 00:00:00+00:00     400
2020-03-24 00:00:00+00:00     697
2020-03-25 00:00:00+00:00     956
2020-03-26 00:00:00+00:00    1074
2020-03-27 00:00:00+00:00    1722
2020-03-28 00:00:00+00:00    2626
2020-03-29 00:00:00+00:00    1709
2020-03-30 00:00:00+00:00    2099
2020-03-31 00:00:00+00:00    1539
2020-04-01 00:00:00+00:00    1831
2020-04-02 00:00:00+00:00    1912
2020-04-03 00:00:00+00:00    2032
2020-04-04 00:00:00+00:00    1223
2020-04-05 00:00:00+00:00    1041
2020-04-06 00:

#### Distribution of sentiments

In [9]:
required['sentiment'].value_counts().sort_index()

0    21618
1     4237
2    16391
Name: sentiment, dtype: int64

In [10]:
#df.write.format("mongo").mode("append").option("database","DB_NAME").option("collection", "COLLECTION_NAME").save() 
#issue here without a defined schema : prefer to perform analysis on notebook and store on localhost

### Windowing


#### Reduce symbols to just one symbol

In [11]:
# TODO: groupby (windowing) should be done over symbols and timestamp!!! 
# but what to do for those with 3 symbols? can't group with 3 symbols.
# randomly pick one? note that the tweets are duplicated alr
# also removed [] as list is unhashable when using groupby

def pick_one_symbol(symbols):
    index = 0
    while (index < len(symbols)):
        if (symbols[index] in SYMBOLS_LIST):
            return symbols[index]
        index += 1
    
    return None
        
required['symbol'] = required['symbol'].apply(lambda symbols: pick_one_symbol(symbols))
# required = required.dropna(subset=['symbol'])
required

Unnamed: 0,_id,body,created_at,sentiment,symbol,interaction_count
0,"(5e7ad06b80f71592bb059da0,)",$BTC.X When Stalin and the Red Army were closi...,2020-03-25 03:29:55+00:00,0,BTC.X,0.0
1,"(5e7ad06b80f71592bb059da1,)",$BTC.X Bitcoin&#39;s role in South Korea&#39;s...,2020-03-25 03:29:20+00:00,0,BTC.X,0.0
2,"(5e7ad06b80f71592bb059da2,)",$BTC.X Mike Novogratz Says Gold&#39;s Rally Is...,2020-03-25 03:26:57+00:00,0,BTC.X,0.0
3,"(5e7ad06b80f71592bb059da3,)",$BTC.X whats this &quot;digital dollar nonsens...,2020-03-25 03:22:51+00:00,0,BTC.X,0.0
4,"(5e7ad06b80f71592bb059da4,)",$BTC.X holding up nicely!,2020-03-25 03:17:26+00:00,2,BTC.X,0.0
...,...,...,...,...,...,...
42241,"(5e9a9eb7267ccecbb4c60e85,)",$LTC.X wow she running and many of u asleep lol,2020-04-16 07:42:05+00:00,0,LTC.X,1.0
42242,"(5e9a9eb7267ccecbb4c60e86,)",$LINK.X https://stocktwits.com/r/linklongtermh...,2020-04-16 07:28:39+00:00,2,BTC.X,1.0
42243,"(5e9a9eb7267ccecbb4c60e87,)",$BTC.X . \n \n$BSV.X $LTC.X $ETC.X,2020-04-16 07:19:20+00:00,0,BTC.X,9.0
42244,"(5e9a9eb7267ccecbb4c60e88,)",$BTC.X $LTC.X $XRP.X When you don’t qualify fo...,2020-04-16 05:47:58+00:00,2,BTC.X,2.0


#### Create new columns: interaction count, weighted sentiments

In [12]:
# Use tweets' likes/reshares to weigh grouped sentiments

# some tweets may have 0 interaction count, and we don't want to obliterate the sentiment data when multiplying
required.loc[required['interaction_count'] ==0,'interaction_count']= 1

# Ignore null sentiments in weighing: if sentiment is zero, put interaction count as 0, weighted sentiment as 0
required.loc[required['sentiment'] ==0,'interaction_count']= 0

required['weighted_sentiment'] = required['sentiment'] * required['interaction_count']

required['sum_group_interaction'] = required['interaction_count'].groupby([required.created_at.dt.date, 
                                                                    required.created_at.dt.hour, 
                                                                          required.symbol]).transform('sum')

required['sum_group_weighted_sentiment'] = required['weighted_sentiment'].groupby([required.created_at.dt.date, 
                                                                    required.created_at.dt.hour, 
                                                                                   required.symbol]).transform('sum')

required['overall_group_sentiment'] = required['sum_group_weighted_sentiment'] / required['sum_group_interaction']


required['overall_group_sentiment'].fillna(value=0, inplace=True)

In [13]:
required = required[['_id','symbol', 'created_at', 'overall_group_sentiment']]

required['volume_tweets'] = required['_id'].groupby([required.created_at.dt.date, 
                                                     required.created_at.dt.hour,
                                                    required.symbol]).transform('count')

required

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,_id,symbol,created_at,overall_group_sentiment,volume_tweets
0,"(5e7ad06b80f71592bb059da0,)",BTC.X,2020-03-25 03:29:55+00:00,1.666667,13.0
1,"(5e7ad06b80f71592bb059da1,)",BTC.X,2020-03-25 03:29:20+00:00,1.666667,13.0
2,"(5e7ad06b80f71592bb059da2,)",BTC.X,2020-03-25 03:26:57+00:00,1.666667,13.0
3,"(5e7ad06b80f71592bb059da3,)",BTC.X,2020-03-25 03:22:51+00:00,1.666667,13.0
4,"(5e7ad06b80f71592bb059da4,)",BTC.X,2020-03-25 03:17:26+00:00,1.666667,13.0
...,...,...,...,...,...
42241,"(5e9a9eb7267ccecbb4c60e85,)",LTC.X,2020-04-16 07:42:05+00:00,0.000000,22.0
42242,"(5e9a9eb7267ccecbb4c60e86,)",BTC.X,2020-04-16 07:28:39+00:00,1.905405,107.0
42243,"(5e9a9eb7267ccecbb4c60e87,)",BTC.X,2020-04-16 07:19:20+00:00,1.905405,107.0
42244,"(5e9a9eb7267ccecbb4c60e88,)",BTC.X,2020-04-16 05:47:58+00:00,2.000000,29.0


In [14]:
grouped = required.groupby([required.created_at.dt.date, required.created_at.dt.hour,required.symbol])

#### Get each group

In [15]:
first_row_groups = grouped.first()
groups_df = pd.DataFrame(first_row_groups)
groups_df.index = groups_df.index.set_names(['day', 'hour', 'symbol_a'])
groups_df = groups_df.reset_index()

def pad_zero(x):
    return '0' + str(x) if (x < 10) else str(x)

groups_df['hour'] = groups_df['hour'].apply(lambda x: pad_zero(x))
groups_df['timestamp'] = pd.to_datetime(groups_df['day'].astype(str) + ' ' +
                                  groups_df['hour'].astype(str), format = '%Y-%m-%d %H', utc = True)
                                  
groups_df = groups_df.reset_index()[['symbol', 'timestamp', 'overall_group_sentiment', 'volume_tweets']]
groups_df

Unnamed: 0,symbol,timestamp,overall_group_sentiment,volume_tweets
0,BTC.X,2020-03-14 17:00:00+00:00,0.000000,2.0
1,BTC.X,2020-03-18 09:00:00+00:00,0.000000,1.0
2,BTC.X,2020-03-18 13:00:00+00:00,0.000000,1.0
3,BTC.X,2020-03-19 00:00:00+00:00,2.000000,1.0
4,BTC.X,2020-03-19 01:00:00+00:00,0.000000,1.0
...,...,...,...,...
1735,LTC.X,2020-04-18 04:00:00+00:00,2.000000,1.0
1736,BTC.X,2020-04-18 05:00:00+00:00,1.892857,20.0
1737,DOGE.X,2020-04-18 05:00:00+00:00,2.000000,1.0
1738,BCH.X,2020-04-18 06:00:00+00:00,2.000000,2.0


#### Summary of groups

In [16]:
print("Summarizing number of groups per symbol")
print(groups_df['symbol'].value_counts().sort_index())

print("\nSummarizing timestamps")
print(groups_df['timestamp'].value_counts().sort_index())

Summarizing number of groups per symbol
BCH.X     132
BSV.X     238
BTC.X     591
DOGE.X    141
ETH.X     392
LTC.X     246
Name: symbol, dtype: int64

Summarizing timestamps
2020-03-14 17:00:00+00:00    1
2020-03-18 09:00:00+00:00    1
2020-03-18 13:00:00+00:00    1
2020-03-19 00:00:00+00:00    1
2020-03-19 01:00:00+00:00    1
                            ..
2020-04-18 02:00:00+00:00    3
2020-04-18 03:00:00+00:00    2
2020-04-18 04:00:00+00:00    4
2020-04-18 05:00:00+00:00    2
2020-04-18 06:00:00+00:00    2
Name: timestamp, Length: 646, dtype: int64


## Historical Price Data

In [17]:
BTC_prices = pd.read_csv("data/gemini_BTCUSD_1hr.csv") 
ETH_prices = pd.read_csv("data/gemini_ETHUSD_1hr.csv") 
LTC_prices = pd.read_csv("data/gemini_LTCUSD_1hr.csv") 
BCH_prices = pd.read_csv("data/Bitbay_BCHUSD_1h.csv") 
DOGE_prices = pd.read_csv("data/Yobit_DOGERUR_1h.csv") 

price_df_lists = [BTC_prices, ETH_prices, LTC_prices, BCH_prices, DOGE_prices]

#### Data Cleaning

#### 1. Datetime

In [18]:
# BCH_prices, DOGE_prices can't infer datetime format
for index in range(len(price_df_lists) - 2):
    price_df = price_df_lists[index]
    price_df['Date'] = pd.to_datetime(price_df['Date'], infer_datetime_format = True, utc = True)
    price_df['Date'] = price_df['Date'].fillna(pd.to_datetime(price_df['Unix Timestamp'], utc = True))
    price_df.drop(['Unix Timestamp'], axis=1, inplace=True)

BCH_prices['Date'] = pd.to_datetime(BCH_prices['Date'], format = '%Y-%m-%d %I-%p', utc = True)
DOGE_prices['Date'] = pd.to_datetime(DOGE_prices['Date'], format = '%Y-%m-%d %I-%p', utc = True)

#### 2. Doge (RUR to USD)

In [19]:
RUR_USD = 0.013

DOGE_prices['Open'] = DOGE_prices['Open'].apply(lambda x: x* RUR_USD )  
DOGE_prices['High'] = DOGE_prices['High'].apply(lambda x: x* RUR_USD )  
DOGE_prices['Low'] = DOGE_prices['Low'].apply(lambda x: x* RUR_USD )  
DOGE_prices['Close'] = DOGE_prices['Close'].apply(lambda x: x* RUR_USD)
DOGE_prices.drop(['Volume ERUR'], axis=1, inplace=True)
DOGE_prices.rename(columns={"Volume DOG": "Volume"}, errors="raise", inplace=True)

BCH_prices.drop(['Volume USD'], axis=1, inplace=True)
BCH_prices.rename(columns={"Volume BCH": "Volume"}, errors="raise", inplace=True)

#### 3. Symbols

In [20]:
for index in range(len(price_df_lists)):
    price_df = price_df_lists[index]
    price_df['Symbol'] = price_df['Symbol'].apply(lambda x: SYMBOLS_LIST[index]) 

In [21]:
prices_df = pd.concat(price_df_lists)

## Join with price data (tweets k-h earlier)

In [22]:
k_list = [2, 5, 12, 24, 48]
merged_df_k_list = []

In [23]:
for k in k_list:
    groups_df["modified_timestamp"] = groups_df["timestamp"] + pd.DateOffset(hours=k)
    merged_df = groups_df.merge(prices_df, how='left', left_on=["modified_timestamp", "symbol"], right_on=["Date","Symbol"])
    print("Number of unmatched tweet group for k = " + str(k) + ": "+ str(len(merged_df[merged_df["Date"].isnull()].index)))
    print("Number of matched tweet groups for k = " + str(k) + ": "+ str(len(merged_df.index) - len(merged_df[merged_df["Date"].isnull()].index)))
    print("\n")
    merged_df_k_list.append(merged_df)

merged_df_k_list[0]

Number of unmatched tweet group for k = 2: 698
Number of matched tweet groups for k = 2: 1042


Number of unmatched tweet group for k = 5: 703
Number of matched tweet groups for k = 5: 1037


Number of unmatched tweet group for k = 12: 715
Number of matched tweet groups for k = 12: 1025


Number of unmatched tweet group for k = 24: 732
Number of matched tweet groups for k = 24: 1008


Number of unmatched tweet group for k = 48: 780
Number of matched tweet groups for k = 48: 960




Unnamed: 0,symbol,timestamp,overall_group_sentiment,volume_tweets,modified_timestamp,Date,Symbol,Open,High,Low,Close,Volume
0,BTC.X,2020-03-14 17:00:00+00:00,0.000000,2.0,2020-03-14 19:00:00+00:00,2020-03-14 19:00:00+00:00,BTC.X,5465.84,5502.47,5390.00,5390.04,62.994055
1,BTC.X,2020-03-18 09:00:00+00:00,0.000000,1.0,2020-03-18 11:00:00+00:00,2020-03-18 11:00:00+00:00,BTC.X,5102.97,5181.20,5098.07,5165.00,121.411455
2,BTC.X,2020-03-18 13:00:00+00:00,0.000000,1.0,2020-03-18 15:00:00+00:00,2020-03-18 15:00:00+00:00,BTC.X,5325.71,5391.06,5239.85,5351.49,230.713793
3,BTC.X,2020-03-19 00:00:00+00:00,2.000000,1.0,2020-03-19 02:00:00+00:00,2020-03-19 02:00:00+00:00,BTC.X,5375.03,5383.30,5277.50,5304.75,113.569030
4,BTC.X,2020-03-19 01:00:00+00:00,0.000000,1.0,2020-03-19 03:00:00+00:00,2020-03-19 03:00:00+00:00,BTC.X,5304.75,5348.55,5270.00,5305.00,112.730393
...,...,...,...,...,...,...,...,...,...,...,...,...
1735,LTC.X,2020-04-18 04:00:00+00:00,2.000000,1.0,2020-04-18 06:00:00+00:00,NaT,,,,,,
1736,BTC.X,2020-04-18 05:00:00+00:00,1.892857,20.0,2020-04-18 07:00:00+00:00,NaT,,,,,,
1737,DOGE.X,2020-04-18 05:00:00+00:00,2.000000,1.0,2020-04-18 07:00:00+00:00,NaT,,,,,,
1738,BCH.X,2020-04-18 06:00:00+00:00,2.000000,2.0,2020-04-18 08:00:00+00:00,NaT,,,,,,


## Cryptocurrency-specific data

In [24]:
BTC_data = pd.read_csv("data/btc.csv") 
ETH_data = pd.read_csv("data/eth.csv") 
LTC_data = pd.read_csv("data/ltc.csv") 
BCH_data = pd.read_csv("data/bch.csv") 
DOGE_data = pd.read_csv("data/doge.csv") 

BTC_data["coin"] = "BTC.X"
ETH_data["coin"] = "ETH.X"
LTC_data["coin"] = "LTC.X"
BCH_data["coin"] = "BCH.X"
DOGE_data["coin"] = "DOGE.X"

data_df_lists = [BTC_data, ETH_data, LTC_data, BCH_data, DOGE_data]

for df in data_df_lists:
    df.dropna(inplace = True)
    df['date'] = pd.to_datetime(df['date'], utc = True)
    
data_df_combined = pd.concat(data_df_lists)
data_df_combined

Unnamed: 0,date,AdrActCnt,BlkCnt,BlkSizeByte,BlkSizeMeanByte,CapMVRVCur,CapMrktCurUSD,CapRealUSD,DiffMean,FeeMeanNtv,...,TxTfrValMeanNtv,TxTfrValMeanUSD,TxTfrValMedNtv,TxTfrValMedUSD,TxTfrValNtv,TxTfrValUSD,VtyDayRet180d,VtyDayRet30d,VtyDayRet60d,coin
926,2011-07-18 00:00:00+00:00,27778,170,4578379,26931.641176,1.942074,9.334931e+07,4.806681e+07,1.563028e+06,0.001486,...,139.134619,1896.477386,0.350000,4.770682,3.298186e+06,4.495600e+07,0.096840,0.039919,0.115691,BTC.X
927,2011-07-19 00:00:00+00:00,25356,142,4178498,29426.042254,1.964390,9.477795e+07,4.824804e+07,1.588241e+06,0.001642,...,110.410539,1526.403051,0.312340,4.318042,2.735642e+06,3.781969e+07,0.095685,0.039263,0.112180,BTC.X
928,2011-07-20 00:00:00+00:00,25461,149,3834661,25735.979866,1.962968,9.387575e+07,4.782336e+07,1.690896e+06,0.001202,...,22.315422,305.237722,0.200000,2.735666,5.411713e+05,7.402320e+06,0.095626,0.039237,0.111902,BTC.X
929,2011-07-21 00:00:00+00:00,25378,152,3672287,24159.782895,1.952017,9.352265e+07,4.791077e+07,1.690896e+06,0.001113,...,54.063834,735.906549,0.130000,1.769535,1.329538e+06,1.809741e+07,0.095621,0.039212,0.111371,BTC.X
930,2011-07-22 00:00:00+00:00,24428,151,3850125,25497.516556,1.963113,9.422203e+07,4.799623e+07,1.690896e+06,0.001555,...,65.213669,893.332526,0.500000,6.849274,1.554629e+06,2.129615e+07,0.095620,0.039276,0.111251,BTC.X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,2020-04-13 00:00:00+00:00,47826,1377,14669230,10653.035585,0.781697,2.410942e+08,3.084241e+08,2.186597e+06,1.226929,...,57750.247250,112.247974,500.000000,0.971840,3.528425e+09,6.858127e+06,0.045386,0.036191,0.064651,DOGE.X
2319,2020-04-14 00:00:00+00:00,45475,1354,14143603,10445.792467,0.795561,2.453593e+08,3.084106e+08,2.143355e+06,1.218111,...,36429.378485,72.051792,512.704826,1.014052,2.198549e+09,4.348398e+06,0.045155,0.036257,0.064405,DOGE.X
2320,2020-04-15 00:00:00+00:00,44544,1370,14160152,10335.877372,0.762922,2.352640e+08,3.083722e+08,2.248554e+06,1.229781,...,39681.331076,75.246150,530.000000,1.005018,2.356595e+09,4.468718e+06,0.045144,0.035399,0.063706,DOGE.X
2321,2020-04-16 00:00:00+00:00,45480,1379,14409501,10449.239304,0.818179,2.523657e+08,3.084481e+08,2.262279e+06,1.257628,...,31287.481268,63.634875,500.000000,1.016938,1.878532e+09,3.820702e+06,0.045401,0.036003,0.064250,DOGE.X


## Join merged data (tweets + price) with cryptocurrency-specific data

### Extract date from merged data

In [25]:
for merged_df in merged_df_k_list:
    merged_df["date"] = pd.to_datetime(merged_df['Date'].dt.date, utc = True)

### Merge

In [29]:
all_merged_list = []

for merged_df in merged_df_k_list:
    all_merged_df = merged_df.merge(data_df_combined, how='left', left_on=["date", "symbol"], right_on=["date", "coin"])
    print("Number of unmatched rows: "+ str(len(all_merged_df[all_merged_df["coin"].isnull()].index)))
    print("Number of matched rows: "+ str(len(all_merged_df.index) - len(all_merged_df[all_merged_df["coin"].isnull()].index)))
    print("\n")
    all_merged_list.append(all_merged_df)

all_merged_list[0]

Number of unmatched rows: 937
Number of matched rows: 803


Number of unmatched rows: 941
Number of matched rows: 799


Number of unmatched rows: 952
Number of matched rows: 788


Number of unmatched rows: 969
Number of matched rows: 771


Number of unmatched rows: 1009
Number of matched rows: 731




Unnamed: 0,symbol,timestamp,overall_group_sentiment,volume_tweets,modified_timestamp,Date,Symbol,Open,High,Low,...,TxTfrValMeanNtv,TxTfrValMeanUSD,TxTfrValMedNtv,TxTfrValMedUSD,TxTfrValNtv,TxTfrValUSD,VtyDayRet180d,VtyDayRet30d,VtyDayRet60d,coin
0,BTC.X,2020-03-14 17:00:00+00:00,0.000000,2.0,2020-03-14 19:00:00+00:00,2020-03-14 19:00:00+00:00,BTC.X,5465.84,5502.47,5390.00,...,1.088403,5600.021500,0.011144,57.338523,728156.883353,3.746493e+09,0.047852,0.094492,0.069323,BTC.X
1,BTC.X,2020-03-18 09:00:00+00:00,0.000000,1.0,2020-03-18 11:00:00+00:00,2020-03-18 11:00:00+00:00,BTC.X,5102.97,5181.20,5098.07,...,1.242803,6701.801085,0.015410,83.096404,740148.953272,3.991244e+09,0.048563,0.096938,0.070792,BTC.X
2,BTC.X,2020-03-18 13:00:00+00:00,0.000000,1.0,2020-03-18 15:00:00+00:00,2020-03-18 15:00:00+00:00,BTC.X,5325.71,5391.06,5239.85,...,1.242803,6701.801085,0.015410,83.096404,740148.953272,3.991244e+09,0.048563,0.096938,0.070792,BTC.X
3,BTC.X,2020-03-19 00:00:00+00:00,2.000000,1.0,2020-03-19 02:00:00+00:00,2020-03-19 02:00:00+00:00,BTC.X,5375.03,5383.30,5277.50,...,1.165243,7228.646293,0.016079,99.748520,750336.209937,4.654749e+09,0.049716,0.100462,0.073290,BTC.X
4,BTC.X,2020-03-19 01:00:00+00:00,0.000000,1.0,2020-03-19 03:00:00+00:00,2020-03-19 03:00:00+00:00,BTC.X,5304.75,5348.55,5270.00,...,1.165243,7228.646293,0.016079,99.748520,750336.209937,4.654749e+09,0.049716,0.100462,0.073290,BTC.X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1735,LTC.X,2020-04-18 04:00:00+00:00,2.000000,1.0,2020-04-18 06:00:00+00:00,NaT,,,,,...,,,,,,,,,,
1736,BTC.X,2020-04-18 05:00:00+00:00,1.892857,20.0,2020-04-18 07:00:00+00:00,NaT,,,,,...,,,,,,,,,,
1737,DOGE.X,2020-04-18 05:00:00+00:00,2.000000,1.0,2020-04-18 07:00:00+00:00,NaT,,,,,...,,,,,,,,,,
1738,BCH.X,2020-04-18 06:00:00+00:00,2.000000,2.0,2020-04-18 08:00:00+00:00,NaT,,,,,...,,,,,,,,,,


# Finish session

In [27]:
spark.stop()