In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, array, explode, sum, count, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from datetime import datetime

In [7]:
# Create the Spark session, or reuse the one already running in this kernel.
# `spark.jars.packages` asks Spark to resolve the MongoDB connector artifact
# when the session starts.
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.0')
    .getOrCreate()
)

In [4]:
def read_func(x):
    """Load the CSV at path ``x`` into a Spark DataFrame.

    The first row of the file is used as the header and column types are
    inferred from the data (``inferSchema=True``).

    Uses the module-level ``spark`` session, so that session must exist
    before this function is called.
    """
    return spark.read.format('csv').load(x, header=True, inferSchema=True)

# Intermediate files

In [2]:
# Window sizes k. Each value is used to build an "output/<k>h.csv" name when
# the merged tables are written out.
# NOTE(review): presumably hours, matching the intermediate/<k>h.csv inputs — confirm.
k_list = [2, 5, 12, 24, 48]

In [8]:
# Load one intermediate CSV per window size (2, 5, 12, 24 and 48).
# NOTE(review): presumably these hold tweets joined with price data, going by
# the variable names — confirm against the notebook that produced them.
twoH_data = read_func("intermediate/2h.csv")
fiveH_data = read_func("intermediate/5h.csv")
twelveH_data = read_func("intermediate/12h.csv")
twoFourH_data = read_func("intermediate/24h.csv")
foureightH_data = read_func("intermediate/48h.csv")

# Pair every frame with its window size: [(2, <df>), (5, <df>), ...].
tweetsprice_df_list = list(zip(
    (2, 5, 12, 24, 48),
    (twoH_data, fiveH_data, twelveH_data, twoFourH_data, foureightH_data),
))

# Cryptocurrency spec data

In [9]:
# Load the per-coin CSVs, one file per cryptocurrency.
BTC_data = read_func("data/btc.csv")
ETH_data = read_func("data/eth.csv")
LTC_data = read_func("data/ltc.csv")
BCH_data = read_func("data/bch.csv")
DOGE_data = read_func("data/doge.csv")

# Pair every frame with the ticker symbol it is tagged with further down:
# [('BTC.X', <df>), ('ETH.X', <df>), ...].
cryptospec_df_lists = list(zip(
    ('BTC.X', 'ETH.X', 'LTC.X', 'BCH.X', 'DOGE.X'),
    (BTC_data, ETH_data, LTC_data, BCH_data, DOGE_data),
))

In [10]:
# Columns we are interested in and that are commonly referred to in
# cryptocurrency data:
#   transaction count, volume, fees
#   network value, realized cap
#   active addresses (AdrActCnt)
#   payment count, average difficulty (DiffMean), hash rate
#   block size, block count, current supply

common_col = ['date', 'AdrActCnt', 'TxCnt', 'TxTfrValAdjUSD', 'ROI30d', 'HashRate', 
              'BlkCnt', 'VtyDayRet180d', 'CapMrktCurUSD', 'SplyCur', 'ROI1yr',
              'BlkSizeMeanByte','VtyDayRet60d','VtyDayRet30d', 'FeeTotUSD',
              'DiffMean']

# Keep only the columns that every crypto frame provides, so the frames can be
# unioned afterwards. The list is filtered in place of a set intersection so
# that the order written above is preserved and the result is the same on
# every run (set iteration order of strings changes between interpreter runs).
for sym, crypto_df in cryptospec_df_lists:
    available = set(crypto_df.columns)
    common_col = [name for name in common_col if name in available]

In [16]:
from functools import reduce
from pyspark.sql import DataFrame

def unionAll(*dfs):
    """Union any number of Spark DataFrames into one.

    The frames are folded left to right with ``DataFrame.unionAll``, which
    matches columns by position, not by name; callers must pass frames whose
    columns are in the same order. Calling it with no frames raises the
    ``TypeError`` that ``reduce`` raises for an empty sequence.
    """
    combine = DataFrame.unionAll
    return reduce(combine, dfs)

In [17]:
# Build one long table of per-coin metrics restricted to the shared columns.
#
# unionAll() matches columns by position, so every frame has to be projected
# in the same column order. Each CSV may order its columns differently, so the
# first frame's order is used as the reference for all of them; this is also
# the schema the union ends up with.
reference_cols = [col for col in cryptospec_df_lists[0][1].columns if col in common_col]

crypto_dfs = []
for sym, crypto_df in cryptospec_df_lists:
    crypto_dfs.append(
        crypto_df
        .select(reference_cols)
        # Keep only rows dated strictly after 2020-03-10.
        .where(crypto_df['date'] > lit("2020-03-10"))
        # No-op unless a 'coin' column was selected above; kept so that such a
        # column would be replaced by the tag below rather than sit next to it.
        .withColumnRenamed('coin', 'symbol')
        # Tag every row with the ticker this frame belongs to.
        .withColumn('symbol', lit(sym))
    )

cryptos_df = unionAll(*crypto_dfs)

In [18]:
def convert_symbol(symbol):
    """Map a ticker symbol to its integer code.

    'BTC.X' -> 0, 'BCH.X' -> 1, 'LTC.X' -> 2, 'ETH.X' -> 3, 'DOGE.X' -> 4.
    Any other value (including None) yields None. Matching is exact and
    case-sensitive.
    """
    codes = (
        ('BTC.X', 0),
        ('BCH.X', 1),
        ('LTC.X', 2),
        ('ETH.X', 3),
        ('DOGE.X', 4),
    )
    # Compare with == in table order, so any input type is accepted.
    for ticker, code in codes:
        if symbol == ticker:
            return code
    return None

# Spark UDF wrapper around convert_symbol; produces an integer column
# (null where convert_symbol returns None).
symbolUdf = udf(f=convert_symbol, returnType=IntegerType())

In [20]:
# Join every tweets/price table with the crypto metrics table and collect the
# results in the same order as tweetsprice_df_list.
tweetspricecrypto_df_list = []

for hour, tweetsprice in tweetsprice_df_list:  # `hour` is not used in the body
    # Rows match when both the symbol and the date agree.
    same_symbol = tweetsprice.Symbol == cryptos_df.symbol
    same_date = tweetsprice.Only_date == cryptos_df.date

    # Inner join: rows without a counterpart on the other side are dropped.
    joined = tweetsprice.join(cryptos_df, same_symbol & same_date, how='inner')

    # Remove the date keys from both sides and the crypto-side symbol; the
    # tweets-side 'Symbol' column is kept.
    for redundant in (tweetsprice.Only_date, cryptos_df.date, cryptos_df.symbol):
        joined = joined.drop(redundant)

    # Replace the ticker string with its integer code.
    merged_df = joined.withColumn('Symbol', symbolUdf('Symbol'))
    tweetspricecrypto_df_list.append(merged_df)

In [23]:
# Write each merged table to "output/<k>h.csv" with a header row.
# The i-th entry of k_list names the i-th entry of tweetspricecrypto_df_list,
# so the two lists are expected to be in the same order.
for index, window in enumerate(k_list):
    filename = 'output/' + str(window) + "h.csv"
    # Collapse to a single partition so the data ends up in one part file;
    # Spark creates `filename` as a directory holding that file.
    single_partition = tweetspricecrypto_df_list[index].coalesce(1)
    single_partition.write.option("header", "true").csv(filename)