In [0]:
%pip install websockets nest_asyncio
%pip install asyncio

In [0]:
# Restart the Python process after the %pip cell above -- presumably so the
# freshly installed packages are the ones imported by later cells (TODO confirm).
dbutils.library.restartPython()

In [0]:
import nest_asyncio
import asyncio
import json
import websockets
from datetime import datetime

nest_asyncio.apply()  # Allows nested event loops

async def get_btcusdt_prices(n=5):
    """Read ``n`` messages from the Binance BTCUSDT trade stream.

    A single websocket connection is opened and ``n`` messages are received
    one after another. Every message is decoded from JSON and reduced to a
    dict with three entries:

    * ``price``       -- field ``p`` converted with ``float``
    * ``trade_time``  -- field ``T`` (milliseconds) as a ``datetime``
    * ``system_time`` -- field ``E`` (milliseconds) as a ``datetime``

    Both datetimes come from ``datetime.fromtimestamp`` without a ``tz``
    argument, so they are naive values in the machine's local timezone.

    Returns:
        list[dict]: one dict per received message, in arrival order.
    """
    def _millis_to_datetime(raw_millis):
        # fromtimestamp() works in seconds; the stream reports milliseconds.
        return datetime.fromtimestamp(int(raw_millis) / 1000.0)

    stream_url = "wss://stream.binance.com:9443/ws/btcusdt@trade"
    collected = []

    async with websockets.connect(stream_url) as connection:
        for _ in range(n):
            payload = json.loads(await connection.recv())
            # Fields are read in the order p, E, T before the row is built.
            last_price = float(payload['p'])
            event_dt = _millis_to_datetime(payload['E'])
            trade_dt = _millis_to_datetime(payload['T'])
            collected.append({
                "price": last_price,
                "trade_time": trade_dt,
                "system_time": event_dt,
            })

    return collected

# Await the async function directly.
# A module-level `await` like this only works inside a notebook/IPython cell;
# in a plain Python script it is a syntax error.
btc_prices = await get_btcusdt_prices(5)
# `display` is not imported in this notebook -- presumably the notebook
# runtime provides it (TODO confirm). btc_prices is a list of dicts.
display(btc_prices)




In [0]:
import nest_asyncio
import asyncio
import json
import websockets
from datetime import datetime

nest_asyncio.apply()  # Allows nested event loops

async def get_raw_btcusdt_messages(n=5):
    """Capture ``n`` BTCUSDT trade messages and return them as a JSON string.

    Every received message is parsed into a dict and six of its keys are
    renamed (``e``->``e1``, ``E``->``e2``, ``t``->``t1``, ``T``->``t2``,
    ``m``->``m1``, ``M``->``m2``). A key that is absent from the message
    still produces the new key, with value ``None``. All other keys are
    kept unchanged.

    Returns:
        str: ``json.dumps`` of the list of renamed dicts, with ``indent=2``.
    """
    # (old, new) pairs, applied in this order so the renamed keys land at the
    # end of each dict in a fixed sequence. The source keys come in pairs that
    # differ only by letter case -- presumably the reason for renaming, so
    # case-insensitive consumers do not see clashing names (TODO confirm).
    renames = (
        ("e", "e1"),
        ("E", "e2"),
        ("t", "t1"),
        ("T", "t2"),
        ("m", "m1"),
        ("M", "m2"),
    )
    endpoint = "wss://stream.binance.com:9443/ws/btcusdt@trade"
    captured = []

    async with websockets.connect(endpoint) as connection:
        for _ in range(n):
            record = json.loads(await connection.recv())
            for old_key, new_key in renames:
                record[new_key] = record.pop(old_key, None)
            captured.append(record)

    return json.dumps(captured, indent=2)

# Await the async function to get the full JSON string
raw_json = await get_raw_btcusdt_messages(5)

data_datetime = datetime.now()
data_year = data_datetime.year
data_month = f"{data_datetime.month:02d}"  # pad with 0 if needed
data_day = f"{data_datetime.day:02d}"      # pad with 0 if needed
data_timestamp = data_datetime.timestamp()

print(f"Raw trade data saved to {data_timestamp}")

# Save to DBFS using /dbfs mount
output_path = f"dbfs:/Volumes/workspace/ops/volumes/stockprices/{data_year}/{data_month}/{data_day}/{data_timestamp}/btc_raw_trades.json"

dbutils.fs.put(output_path, raw_json, overwrite=True)

print(f"Raw trade data saved to {output_path}")

output_path = f"dbfs:/Volumes/workspace/ops/volumes/ingest_stockprices/btc_raw_trades_{data_timestamp}.json"

dbutils.fs.put(output_path, raw_json, overwrite=True)

print(f"Raw trade data saved to {output_path}")


In [0]:
import nest_asyncio
import asyncio
import json
import websockets
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import when,col,current_timestamp

nest_asyncio.apply()

async def get_raw_btcusdt_messages(n=5):
    """Receive ``n`` BTCUSDT trade messages and return them as a list of dicts.

    This definition replaces the same-named function from the earlier cell;
    the difference is the return type: the parsed dicts themselves rather
    than a JSON string.

    For each message the keys ``e``, ``E``, ``t``, ``T``, ``m``, ``M`` are
    moved to ``e1``, ``e2``, ``t1``, ``t2``, ``m1``, ``m2``. A missing source
    key results in ``None`` under the new name.

    Returns:
        list[dict]: the renamed messages in arrival order.
    """
    # Insertion order of this mapping fixes the order of the renamed keys.
    key_map = {"e": "e1", "E": "e2", "t": "t1", "T": "t2", "m": "m1", "M": "m2"}

    def _rename_keys(message):
        # Mutates and returns the same dict.
        for source, target in key_map.items():
            message[target] = message.pop(source, None)
        return message

    trades = []
    async with websockets.connect("wss://stream.binance.com:9443/ws/btcusdt@trade") as conn:
        for _ in range(n):
            frame = await conn.recv()
            trades.append(_rename_keys(json.loads(frame)))

    return trades

# Await the async function to get raw messages as list of dicts
raw_messages = await get_raw_btcusdt_messages(5)

# Prepare output path with integer timestamp
data_datetime = datetime.now()
data_year = data_datetime.year
data_month = f"{data_datetime.month:02d}"
data_day = f"{data_datetime.day:02d}"
data_timestamp = int(data_datetime.timestamp())

# Convert list of dicts to Spark DataFrame
df = spark.createDataFrame(raw_messages).withColumn("data_timestamp",current_timestamp())

# Optional: show schema and data
df.printSchema()
display(df)


json_str = df.toPandas().to_json(orient='records')

output_path = f"dbfs:/Volumes/workspace/ops/volumes/stockprices/{data_year}/{data_month}/{data_day}/{data_timestamp}/btc_raw_trades.json"


# Write JSON string to DBFS as a single file
dbutils.fs.put(output_path, json_str, overwrite=True)

print(f"Raw trade data saved to {output_path}")

output_path = f"dbfs:/Volumes/workspace/ops/volumes/ingest_stockprices/btc_raw_trades_{data_timestamp}.json"

dbutils.fs.put(output_path, json_str, overwrite=True)

print(f"Raw trade data saved to {output_path}")



In [0]:
from pyspark.sql.functions import when,col,current_timestamp

# Query text: every column of the JSON files under the "orders" folder, plus
# a processing timestamp and the per-file _metadata fields as plain columns.
orders_query = '''
select
*,
current_timestamp() as processing_time,
_metadata.file_name as source_file,
_metadata.file_modification_time as source_file_modification_time,
_metadata.file_size as source_file_size,
_metadata.file_block_start as source_file_block_start,
_metadata.file_block_length as source_file_block_length,
_metadata.file_path as source_file_path


from read_files(
  "dbfs:/Volumes/workspace/ops/volumes" || "/orders",
  format => "json"
);          
          
          '''

# Label each row by order_id band:
#   >= 75129          -> "OK"
#   75125 .. 75128    -> "OK1"
#   anything else     -> "NOT OK"
order_id_col = col("order_id")
test_one_label = (
    when(order_id_col >= 75129, "OK")
    .when((order_id_col < 75129) & (order_id_col >= 75125), "OK1")
    .otherwise("NOT OK")
)

df = spark.sql(orders_query).withColumn("TestOne", test_one_label)

display(df)

# Append the labelled rows to a Delta location; re-running this cell appends again.
df.write.format("delta").mode("append").save("dbfs:/Volumes/workspace/ops/volumes/resultOrders/2023-07-1/01")

In [0]:


from pyspark.sql.functions import when,col,current_timestamp, input_file_name, input_file_block_length, input_file_block_start 
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, BooleanType

# Imported here so this block is self-contained: LongType is not part of the
# cell's pyspark.sql.types import list.
from pyspark.sql.types import LongType

# Explicit schema for the renamed trade records (e1/e2, t1/t2, m1/m2 are the
# renamed e/E, t/T, m/M keys produced by get_raw_btcusdt_messages).
# NOTE: nothing in this cell passes `schema` to a reader; the read_files query
# below infers its own schema.
schema = StructType([
    StructField("e1", StringType(), True),
    # e2 is the original "E" field: epoch *milliseconds* (it is divided by 1000
    # before datetime.fromtimestamp elsewhere in this notebook). Such values do
    # not fit in a 32-bit IntegerType, so a 64-bit LongType is required.
    StructField("e2", LongType(), True),
    StructField("s", StringType(), True),
    # t1 is the original "t" field -- presumably a trade id that grows without
    # bound; widened to 64-bit so large values are not lost (TODO confirm).
    StructField("t1", LongType(), True),
    # NOTE(review): "p" is converted with float() elsewhere in this notebook,
    # which suggests p/q may arrive as JSON strings -- confirm DoubleType
    # parses them as intended before using this schema in a reader.
    StructField("p", DoubleType(), True),
    StructField("q", DoubleType(), True),
    # t2 is the original "T" field: epoch milliseconds, same overflow concern as e2.
    StructField("t2", LongType(), True),
    StructField("m1", BooleanType(), True),
    StructField("m2", BooleanType(), True)
])

# Query text: all columns of the JSON files in the ingest_stockprices folder,
# plus a processing timestamp and the per-file _metadata fields.
bronze_query = '''
select
*,
current_timestamp() as processing_time,
_metadata.file_name as source_file,
_metadata.file_modification_time as source_file_modification_time,
_metadata.file_size as source_file_size,
_metadata.file_block_start as source_file_block_start,
_metadata.file_block_length as source_file_block_length,
_metadata.file_path as source_file_path


from read_files(
  "dbfs:/Volumes/workspace/ops/volumes/ingest_stockprices/",
  format => "json"
);    
'''

df = spark.sql(bronze_query)

# Expose the result to SQL under a temp view name, then render it from that view.
df.createOrReplaceTempView("etl_bronze_stockprices")

spark.sql("select * from etl_bronze_stockprices").display()

# SQL session variable round-trip: drop any previous test1, declare it as int,
# set it to 10, then read it back.
for variable_statement in (
    "drop temporary variable if exists test1;",
    "declare variable test1 int;",
    "set variable test1=10;",
):
    spark.sql(variable_statement)

spark.sql("select test1").show()



#display(df)

#df.write.format("delta").mode("append").save("dbfs:/Volumes/workspace/ops/volumes/resultOrders/2023-07-1/01")


In [0]:
%sql

-- Reads the SQL variable test1. The preceding Python cell declares it as int
-- and sets it to 10 through spark.sql(), so that cell must run first.
-- NOTE(review): assumes this %sql cell shares the Spark session with the
-- Python cell -- confirm.
-- The commented statements below would (re)create the variable from SQL instead.
--drop temporary variable if exists test1;
--declare variable test1 int;

--set variable test1 = 10;
select test1;

--select * from etl_bronze_stockprices

In [0]:
# Source: a folder in an Azure Blob Storage container, addressed with the wasbs:// scheme.
source_path = "wasbs://courseware@dbacademy.blob.core.windows.net/data-engineer-learning-path/v03"
# Destination: the "v03" folder under the same dbfs:/Volumes/... tree the other cells use.
destination_path = "dbfs:/Volumes/workspace/ops/volumes/v03"

# Copy the entire directory recursively (recurse=True).
dbutils.fs.cp(source_path, destination_path, recurse=True)