In [30]:
import findspark
findspark.init()


In [31]:
from pyspark.conf import SparkConf
config = SparkConf()
# config.set("property", "value")
config.setMaster("local").setAppName("Stocks")

from pyspark.sql import SparkSession
# spark Session, entry point for Spark SQL, DataFrame
spark = SparkSession.builder\
                    .config(conf=config)\
                    .getOrCreate()

sc = spark.sparkContext

In [38]:
from pyspark.sql.types import StructType, LongType, StringType, IntegerType, DoubleType

dailySchema = StructType()\
                .add("symbol", StringType(), True)\
                .add("series", StringType(), True)\
                .add("open", IntegerType(), True)\
                .add("high", IntegerType(), True)\
                .add("low", IntegerType(), True)\
                .add("close", IntegerType(), True)\
                .add("last", IntegerType(), True)\
                .add("prev_close", IntegerType(), True)\
                .add("time", IntegerType(), True)\
                .add("total_trades", IntegerType(), True)\
                .add("isin", IntegerType(), True)\
                .add("c_13", IntegerType(), True)

In [39]:
stockDailyDf = spark.read.format("csv")\
                .option("header", True)\
                .schema(dailySchema)\
                .load("hdfs://localhost:9000/stocks/daily")

stockDailyDf.printSchema()
stockDailyDf.show(50)
            

root
 |-- symbol: string (nullable = true)
 |-- series: string (nullable = true)
 |-- open: integer (nullable = true)
 |-- high: integer (nullable = true)
 |-- low: integer (nullable = true)
 |-- close: integer (nullable = true)
 |-- last: integer (nullable = true)
 |-- prev_close: integer (nullable = true)
 |-- time: integer (nullable = true)
 |-- total_trades: integer (nullable = true)
 |-- isin: integer (nullable = true)
 |-- c_13: integer (nullable = true)

+----------+------+-----+-----+-----+-----+-----+----------+--------+------------+----+------+
|    symbol|series| open| high|  low|close| last|prev_close|    time|total_trades|isin|  c_13|
+----------+------+-----+-----+-----+-----+-----+----------+--------+------------+----+------+
| 20MICRONS|    EQ| null| null| null| null| null|      null|  219912|        null|null|  2642|
|21STCENMGM|    EQ| null| null| null| null| null|      null|    1209|        null|null|    45|
| 3IINFOLTD|    EQ| null| null| null| null| null|      null

22/05/12 00:10:17 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 14, schema size: 12
CSV file: hdfs://localhost:9000/stocks/daily/cm02MAR2022bhav.csv


In [13]:
stockSectorsDf = spark.read.format("csv")\
                .option("header", True)\
                .option("inferSchema", True)\
                .load("hdfs://localhost:9000/stocks/sectors")

#stockSectorsDf.printSchema()

stockSectorsDf.show(4)

+--------------------+------------------+----------+------+------------+
|        Company Name|          Industry|    Symbol|Series|   ISIN Code|
+--------------------+------------------+----------+------+------------+
|      Axis Bank Ltd.|FINANCIAL SERVICES|  AXISBANK|    EQ|INE238A01034|
|  Bajaj Finance Ltd.|FINANCIAL SERVICES|BAJFINANCE|    EQ|INE296A01024|
|  Bajaj Finserv Ltd.|FINANCIAL SERVICES|BAJAJFINSV|    EQ|INE918I01018|
|Cholamandalam Inv...|FINANCIAL SERVICES|  CHOLAFIN|    EQ|INE121A01024|
+--------------------+------------------+----------+------+------------+
only showing top 4 rows



In [27]:
stocksDf = stockSectorsDf.join(stockDailyDf, stockSectorsDf.Symbol == stockDailyDf.SYMBOL)\
                         .select(stockDailyDf.SYMBOL, "OPEN",  "Industry")
                         #.sort(desc("OPEN"))
stocksDf.show(5)

+----------+-------+----------+
|    SYMBOL|   OPEN|  Industry|
+----------+-------+----------+
|ABBOTINDIA|17500.0|    PHARMA|
|  ADANIENT| 1638.0|    METALS|
|     ALKEM| 3239.0|    PHARMA|
|     ALKEM| 3239.0|    PHARMA|
|AMARAJABAT|  560.0|AUTOMOBILE|
+----------+-------+----------+
only showing top 5 rows

