In [1]:
%load_ext pycodestyle_magic
%pycodestyle_on
# %pycodestyle_off -to turn it off

In [3]:
# import all libraries
#from minio import Minio
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, month, year
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType
import requests
import json
import pandas as pd
from binance import Client
from datetime import datetime, date
import os
from dotenv import load_dotenv, dotenv_values
from binance.helpers import date_to_milliseconds, interval_to_milliseconds
from dateutil.relativedelta import relativedelta
from dateutil.parser import parse
import time
load_dotenv()

True

2:1: E265 block comment should start with '# '
6:80: E501 line too long (87 > 79 characters)


In [11]:
import logging
from binance.exceptions import BinanceRequestException, BinanceAPIException

In [5]:
API_KEY = os.getenv("API_KEY")
SECRET_KEY = os.getenv("SECRET_KEY")
MINIO_USER = os.getenv("MINIO_ROOT_USER")
MINIO_PASSWORD = os.getenv("MINIO_ROOT_PASSWORD")
client_binance = Client(API_KEY, SECRET_KEY)

In [7]:
spark = SparkSession.builder \
    .appName("CryptoETL") \
    .getOrCreate()
# Get the SparkContext from the SparkSession
sc = spark.sparkContext
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", MINIO_USER)#turn into access key in the future 
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", MINIO_PASSWORD)
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://localhost:9000")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.attempts.maximum", "1")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.establish.timeout", "5000")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "10000")


'''
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")
sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")'''


'\nsc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")\nsc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")\nsc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\nsc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")'

6:67: E261 at least two spaces before inline comment
6:67: E262 inline comment should start with '# '
6:80: E501 line too long (101 > 79 characters)
6:102: W291 trailing whitespace
12:80: E501 line too long (80 > 79 characters)
19:80: E501 line too long (90 > 79 characters)
21:1: W391 blank line at end of file
25/04/15 16:26:51 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:
#read data for 1 crypto
#schema check/validation
#implement transformations
#add unique id for crypto,price,time
#split into 3 tables
#upload to postgres db

In [9]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG) #set to debug to capture all levels
if logger.hasHandlers():
    logger.handlers.clear()
logger.propagate = False


2:31: E261 at least two spaces before inline comment
2:32: E262 inline comment should start with '# '
6:1: W391 blank line at end of file


In [10]:
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)

In [14]:
cryptos = ['BTCUSDT','ETHUSDT','LTCUSDT','BNBUSDT','DOGEUSDT']

def get_data_monthly(symbol,interval,start_date,end_date):
    output = []
    try:
        while end_date > start_date:
            data = client_binance.get_historical_klines(
            symbol=symbol, interval=interval, start_str=start_date, end_str=end_date, limit=1000 
            )
            output +=data
            #timeframe = interval_to_milliseconds(interval)
            if not data:
                logger.error(f"No data returned for {symbol} from {start_date} to {end_date}")
                return None
            last_date = data[-1][0]
            #get month and year from last date and compare with end_date month and year
            last_datetime = datetime.utcfromtimestamp(last_date / 1000)
            end_datetime = datetime.strptime(end_date, '%d-%m-%Y')
            #the below is to check last data point
            if last_datetime.year == end_datetime.year and last_datetime.month == end_datetime.month:
                break
            start_datetime =  last_datetime + relativedelta(months=1)
            start_date = str(start_datetime)
            logger.info(client_binance.response.headers['x-mbx-used-weight'])
            #for daily add sleep function
            if client_binance.response.headers['x-mbx-used-weight'] == 5000:
                time.sleep(10)
                logger.info("sleeping due to hitting rate limit")
        return output
    except BinanceAPIException as e:
        logger.error(f"order issue:{e}", stack_info=True, exc_info=True)
        return None
    except BinanceRequestException as e:
        logger.error(f"Network issue: {e}", stack_info=True, exc_info=True)
        return None
    except Exception as e:
        logger.error(e,stack_info=True,exc_info = True)
        return None
columns = [
    "Kline open time",
    "Open Price",
    "High price",
    "Low price",
    "Close Price",
    "Volume",
    "Kline Close time",
    "Quote asset volume",
    "Number of trades",
    "Taker buy base asset volume",
    "Taker buy quote asset volume",
    "Ignore",
]

def spark_df_monthly(output,columns):
    #implement schema check
    try:
        dataframe = spark.createDataFrame(output, columns)
        df = dataframe.withColumn("Kline open time", to_timestamp(col("Kline open time") / 1000))
        df = df.withColumnRenamed("Kline open time","datetime")
        df = df.select('datetime','Open Price','Close Price','Volume')
        df = df.withColumn("year", year(df["datetime"])) \
           .withColumn("month", month(df["datetime"]))
        logger.info(df.head(5))
        return df
    except Exception as e:
    logger.error(e,stack_info = True, exc_info = True)
    return None

def upload_minio(minio_url,minio_user,minio_password,symbol,bucket_name,timeframe,df):
    try:
        client_minio = Minio(
        #"localhost:9000",  # Make sure you're using port 9000 for the S3 API
        minio_url,
        access_key = minio_user,
        secret_key = minio_password,
        secure=False  # Disable SSL if you're not using SSL certificates
        )
        df.write.mode("append") \
        .partitionBy("year", "month") \
        .parquet(f"s3a://{bucket_name}/{symbol}/{timeframe}")
        return f"Successfully {symbol} {timeframe} uploaded to minio"
    except Exception as e:
        logger.error(e,stack_info = True, exc_info = True)
        
    

def minio_pipeline_monthly(symbol):
    columns = [
    "Kline open time",
    "Open Price",
    "High price",
    "Low price",
    "Close Price",
    "Volume",
    "Kline Close time",
    "Quote asset volume",
    "Number of trades",
    "Taker buy base asset volume",
    "Taker buy quote asset volume",
    "Ignore",
    ]
    data = get_data_monthly(symbol = symbol,interval = '1M',start_date = "01-01-2019",end_date = "31-12-2024")
    df_monthly = spark_df_monthly(data,columns)
    upload_minio(minio_url = 'localhost:9000',minio_user = MINIO_USER ,minio_password = MINIO_PASSWORD ,symbol = symbol ,bucket_name = 'binancedata',timeframe='Monthly',df = df_monthly)
    return "Successfully ran monthly pipeline to minio"
    #implement try and except, api codes from minio and binance
    #implement schema check
def main():
    cryptos = ['BTCUSDT','ETHUSDT','LTCUSDT','BNBUSDT','DOGEUSDT']
    for symbol in cryptos:
        try:
            minio_pipeline_monthly(symbol)
        except Exception as e:
            print(f"Error processing {symbol}: {e}")
if __name__ == "__main__":
    main()

1:21: E231 missing whitespace after ','
1:31: E231 missing whitespace after ','
1:41: E231 missing whitespace after ','
1:51: E231 missing whitespace after ','
3:1: E302 expected 2 blank lines, found 1
3:28: E231 missing whitespace after ','
3:37: E231 missing whitespace after ','
3:48: E231 missing whitespace after ','
8:13: E122 continuation line missing indentation or outdented
8:80: E501 line too long (96 > 79 characters)
8:97: W291 trailing whitespace
10:22: E225 missing whitespace around operator
11:13: E265 block comment should start with '# '
13:80: E501 line too long (94 > 79 characters)
16:13: E265 block comment should start with '# '
16:80: E501 line too long (87 > 79 characters)
19:13: E265 block comment should start with '# '
20:80: E501 line too long (101 > 79 characters)
22:29: E222 multiple spaces after operator
25:13: E265 block comment should start with '# '
37:23: E231 missing whitespace after ','
37:39: E231 missing whitespace after ','
37:48: E251 unexpected spaces

In [None]:
def get_data_monthly(symbol,interval,start_date,end_date):
    output = []
    try:
        while end_date > start_date:
            data = client_binance.get_historical_klines(
            symbol=symbol, interval=interval, start_str=start_date, end_str=end_date, limit=1000 
            )
            output +=data
            #timeframe = interval_to_milliseconds(interval)
            last_date = data[-1][0]
            #get month and year from last date and compare with end_date month and year
            last_datetime = datetime.utcfromtimestamp(last_date / 1000)
            end_datetime = datetime.strptime(end_date, '%d-%m-%Y')
            #the below is to check last data point
            if last_datetime.year == end_datetime.year and last_datetime.month == end_datetime.month:
                break
            start_datetime =  last_datetime + relativedelta(months=1)
            start_date = str(start_datetime)
            print(client_binance.response.headers['x-mbx-used-weight'])
            #for daily add sleep function
            if client_binance.response.headers['x-mbx-used-weight'] == 5000:
                time.sleep(10)
        return output
    except Exception as e:
        print("An exception occurred:", type(e).__name__, "–", e)