In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

In [2]:
from pyspark.sql.functions import to_timestamp, col, concat, lit, from_json, schema_of_json, flatten, json_tuple
from pyspark.sql.types import *

In [44]:
from datetime import datetime, timedelta

data = spark.read.json('hdfs://namenode:9000/project20221-real/raw/stock_price_realtime/01-28-2023-17-57-11.json')

# data.show()


def fill_data(column, df, data_type):
      if not column in df.columns:
        ret = lit(None).cast(data_type)
      else:
        ret = col(column).cast(data_type)

      return ret

data = (data
        .withColumn('stockSymbol', fill_data('stockSymbol', data, StringType()))
        .withColumn('exchange', fill_data('exchange', data, StringType()))
        .withColumn('priceChange', fill_data('priceChange', data, DoubleType()))
        .withColumn('priceChangePercent', fill_data('priceChangePercent', data, DoubleType()))
        .withColumn('nmTotalTradedQty', fill_data('nmTotalTradedQty', data, LongType()))
        .withColumn('best1Bid', fill_data('best1Bid', data, DoubleType()))
        .withColumn('best5Bid', fill_data('best5Bid', data, DoubleType()))
        .withColumn('best2Bid', fill_data('best2Bid', data, DoubleType()))
        .withColumn('best3Bid', fill_data('best3Bid', data, DoubleType()))
        .withColumn('best1Offer', fill_data('best1Offer', data, DoubleType()))
        .withColumn('best2Offer', fill_data('best2Offer', data, DoubleType()))
        .withColumn('best3Offer', fill_data('best3Offer', data, DoubleType()))
        .withColumn('lowest', fill_data('lowest', data, DoubleType()))
        .withColumn('highest', fill_data('highest', data, DoubleType()))
        .withColumn('refPrice', fill_data('refPrice', data, DoubleType()))
        .withColumn('floor', fill_data('floor', data, DoubleType()))
        .withColumn('ceiling', fill_data('ceiling', data, DoubleType()))
        .withColumn('matchedPrice', fill_data('matchedPrice', data, DoubleType()))
        .withColumn('best1BidVol', fill_data('best1BidVol', data, LongType()))
        .withColumn('best2BidVol', fill_data('best2BidVol', data, LongType()))
        .withColumn('best3BidVol', fill_data('best3BidVol', data, LongType()))
        .withColumn('best1OfferVol', fill_data('best1OfferVol', data, LongType()))
        .withColumn('best2OfferVol', fill_data('best2OfferVol', data, LongType()))
        .withColumn('best3OfferVol', fill_data('best3OfferVol', data, LongType()))
        .withColumn('matchedVolume', fill_data('matchedVolume', data, LongType()))
        .withColumn('currentBidQty', fill_data('currentBidQty', data, DoubleType()))
        .withColumn('currentOfferQty', fill_data('currentOfferQty', data, DoubleType()))
        .withColumn('session', fill_data('session', data, StringType()))
        .withColumn('stockType', fill_data('stockType', data, StringType()))
        )


data = (data
        .na.fill(value = 0, subset = ([
            'nmTotalTradedQty',
            'best1BidVol',
            'best2BidVol',
            'best3BidVol',
            'best1OfferVol',
            'best2OfferVol',
            'best3OfferVol',
            'matchedVolume',
            'currentBidQty',
        ]))
        .na.fill(value = 0.0, subset = ([
            'priceChange',
            'priceChangePercent',
            'best1Bid',
            'best2Bid',
            'best3Bid',
            'best1Offer',
            'best2Offer',
            'best3Offer',
            'lowest',
            'highest',
            'refPrice',
            'floor',
            'ceiling',
            'matchedPrice',
            'currentBidQty',
            'currentOfferQty',
            'best5Bid',
        ]))
        .na.fill(value = 'undefined', subset = ['stockSymbol', 'exchange', 'session', 'stockType'])
       )
data.printSchema()
data.show()

root
 |-- __typename: string (nullable = true)
 |-- avgPrice: double (nullable = true)
 |-- best1Bid: double (nullable = false)
 |-- best1BidVol: long (nullable = true)
 |-- best1Offer: double (nullable = false)
 |-- best1OfferVol: long (nullable = true)
 |-- best2Bid: double (nullable = false)
 |-- best2BidVol: long (nullable = true)
 |-- best2Offer: double (nullable = false)
 |-- best2OfferVol: long (nullable = true)
 |-- best3Bid: double (nullable = false)
 |-- best3BidVol: long (nullable = true)
 |-- best3Offer: double (nullable = false)
 |-- best3OfferVol: long (nullable = true)
 |-- buyForeignQtty: long (nullable = true)
 |-- caStatus: string (nullable = true)
 |-- ceiling: double (nullable = false)
 |-- exchange: string (nullable = false)
 |-- floor: double (nullable = false)
 |-- highest: double (nullable = false)
 |-- lowest: double (nullable = false)
 |-- matchedPrice: double (nullable = false)
 |-- matchedVolume: long (nullable = true)
 |-- nmTotalTradedQty: long (nullable =

In [24]:
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]

df = spark.createDataFrame(data=data,schema=columns)
df.show()


def upperCase(str):
    return str + 1

upperCaseUDF = udf(lambda z:upperCase(z),LongType())    


df.withColumn("Cureated Name", upperCaseUDF(col("Seqno"))) \
.show(truncate=False)

+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  john jones|
|    2|tracey smith|
|    3| amy sanders|
+-----+------------+



PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_1910/4222292252.py", line 13, in <lambda>
  File "/tmp/ipykernel_1910/4222292252.py", line 11, in upperCase
TypeError: can only concatenate str (not "int") to str


In [53]:
hnx = spark.read.json("hdfs://namenode:9000/project20221/clean/stock_price_realtime/hnx/01-14-2023/01-14-2023-17-34-04.json")
hnx.select(col('best1Bid')).show()

+--------+
|best1Bid|
+--------+
|   59000|
|   73000|
|   66000|
|  495000|
|  500000|
|       0|
|       0|
|  169000|
|  133000|
|   98000|
|  230000|
|       0|
|  180000|
|  212000|
|       0|
|       0|
|   51000|
|  179000|
|       0|
|       0|
+--------+
only showing top 20 rows



In [62]:
a = spark.read.option("multiLine", "true").json('hdfs://namenode:9000/project20221-real/clean/stock_price_realtime/hnx/01-25-2023/01-25-2023-08-21-39.json')
a.printSchema()

AnalysisException: Unable to infer schema for JSON. It must be specified manually.