In [None]:
%load_ext pycodestyle_magic
%pycodestyle_on
# %pycodestyle_off -to turn it off

In [1]:
from minio import Minio
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, TimestampNTZType, DecimalType
from binance import Client
import os
from dotenv import load_dotenv, dotenv_values
import logging
import sys
import numpy
sys.path.insert(0,'./pipelines/shared')
from utils import (
    parquet_to_df,
    data_cleaning,
    add_crypto_id,
    add_time_id,
    upload_time,
    upload_price,
)
import pyspark.pandas as ps
load_dotenv(dotenv_path="./pipelines/shared/main.env", override=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/21 13:03:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


True

In [2]:
API_KEY = os.getenv("API_KEY")
SECRET_KEY = os.getenv("SECRET_KEY")
MINIO_USER = os.getenv("MINIO_ROOT_USER")
MINIO_PASSWORD = os.getenv("MINIO_ROOT_PASSWORD")
client_binance = Client(API_KEY, SECRET_KEY)

In [3]:
spark = SparkSession.builder \
    .appName("CryptoETL") \
    .config("spark.jars", "/Users/hamza/Desktop/projects/postgresql-42.7.5.jar") \
    .getOrCreate()
# Get the SparkContext from the SparkSession
sc = spark.sparkContext
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", MINIO_USER)#turn into access key in the future 
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", MINIO_PASSWORD)
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://localhost:9000")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.attempts.maximum", "1")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.establish.timeout", "5000")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "10000")

25/05/21 13:03:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG) #set to debug to capture all levels
if logger.hasHandlers():
    logger.handlers.clear()
logger.propagate = False

In [5]:
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)

In [6]:
client_minio = Minio(
    "localhost:9000",  # Make sure you're using port 9000 for the S3 API
    # minio_url,
    access_key=MINIO_USER,
    secret_key=MINIO_PASSWORD,
    secure=False,  # Disable SSL if you're not using SSL certificates
)

schema = StructType(
    [
        StructField(name="datetime", dataType=TimestampNTZType(), nullable=False),
        StructField(name="Open Price", dataType=DecimalType(), nullable=False),
        StructField(name="Close Price", dataType=DecimalType(), nullable=False),
        StructField(name="Volume", dataType=DecimalType(), nullable=False),
    ]
)
# doesnt work for parquet files, schema inferred from that instead


read_sql = "SELECT * FROM crypto"
#localhost for local development and container ip for containers
df_crypto = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/crypto")
    .option("user", "postgres")
    .option("password", "postgres")
    .option("query", read_sql)
    .option("driver", "org.postgresql.Driver")
    .load()
)

25/05/21 13:03:47 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [61]:
df = parquet_to_df(client_minio=client_minio, timeframe="monthly", crypto="BTCUSDT", schema=schema)
df_cleaned = data_cleaning(df)

In [26]:
#standard deviation - 1 year volatility, all time volatility and rolling volatility
#moving averages
#percentage changes in price and volume
#drawdown
#correlation
#

In [62]:
ps.set_option("compute.ops_on_diff_frames", True)

In [63]:
psdf = df_cleaned.pandas_api()

In [64]:
monthly_pct_change = psdf['close'].pct_change()
monthly_pct_change = monthly_pct_change.round(3)

In [65]:
psdf['monthly_pct_change'] = monthly_pct_change

In [66]:
std = psdf['monthly_pct_change'].rolling(window=12).std()

In [67]:
psdf['rolling_std_12_months'] = std

In [68]:
sma = psdf['monthly_pct_change'].rolling(window=12).mean()

In [69]:
psdf['sma'] = sma

In [71]:
psdf

Unnamed: 0,datetime,open,close,volume,monthly_pct_change,rolling_std_12_months,sma
0,2019-01-01 00:00:00,3701.23,3434.1,908244.14054,,,
1,2019-10-01 01:00:00,8289.97,9140.85,1446763.02405,1.662,,0.1003333
2,2019-11-01 00:00:00,9140.86,7541.89,1499118.775,-0.175,,0.0485833
3,2019-12-01 00:00:00,7540.63,7195.23,1307033.02038,-0.046,,0.1825833
4,2019-02-01 00:00:00,3434.1,3813.69,861783.98673,-0.47,,-0.0261667
5,2019-03-01 00:00:00,3814.26,4103.95,787190.48925,0.076,,-0.0475833
6,2019-04-01 01:00:00,4102.44,5320.81,1126961.3151,0.297,,0.1199167
7,2019-05-01 01:00:00,5321.94,8555.0,1498410.02562,0.608,,
8,2019-06-01 01:00:00,8555.0,10854.1,1689489.64733,0.269,,0.1738333
9,2019-07-01 01:00:00,10854.1,10080.53,1886176.06592,-0.071,,0.0343333
