In [1]:
import sys,os,logging
import argparse


# Notebook bootstrap: configure logging, locate the configuration file, and
# expose the SCALEINFO / SYSTEMINFO settings as module-level names.

# strftime pattern apparently intended for log timestamps.
# NOTE(review): it is not passed to basicConfig (no datefmt=), so records use
# logging's default timestamp format -- confirm whether that is intended.
date_strftime_format = "%d-%b-%y %H:%M:%S"
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='[%(asctime)s] %(name)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)

# The argument list is passed explicitly instead of being taken from sys.argv
# (presumably so the notebook kernel's own command line is never parsed).
parser = argparse.ArgumentParser()
parser.add_argument("--f", "--filepath", type=str, default="./conf/default.conf", help='provide configuration filepath')
args = parser.parse_args(args=['--filepath', './conf/default.conf'])
# argparse derives the attribute name from the first long option string, "--f".
configFilePath = args.f

from configparser import ConfigParser
config_object = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here with a clear message rather than with
# a KeyError on the first section lookup below.
if not config_object.read(configFilePath):
    raise FileNotFoundError("Configuration file not found or unreadable: " + configFilePath)

# --- SCALEINFO section ------------------------------------------------------
scaleinfo = config_object["SCALEINFO"]
scale = scaleinfo.getint("scale_gb")
batch = scaleinfo.getint("batch_k")
timespan_days = scaleinfo.getint("timespan_days")
droptable = scaleinfo.getboolean("droptable")

logging.info("Using following scale configuration: ")
for (each_key, each_val) in config_object.items(scaleinfo.name):
    logging.info("%s:%s", each_key, each_val)


def _unquote(value):
    """Return ``value`` as a string with leading/trailing double quotes removed."""
    return str(value).strip('"')


# --- SYSTEMINFO section -----------------------------------------------------
systeminfo = config_object["SYSTEMINFO"]
SPARK_APP_NAME = _unquote(systeminfo.get("spark.app.name"))
SPARK_MASTER = _unquote(systeminfo.get("spark.master.hostpath"))
HIVE_HMS_HOST = _unquote(systeminfo.get("hive.metastore.uris"))
SPARK_WAREHOUSE_DIR = _unquote(systeminfo.get("spark.sql.warehouse.dir"))
# NOTE(review): this key uses underscores while its siblings use dots; getint()
# returns None when the option is absent -- confirm the spelling in the file.
SPARK_DRIVER_CORES = systeminfo.getint("spark_driver_cores")
SPARK_DRIVER_MEMORY = _unquote(systeminfo.get("spark.driver.memory"))
SPARK_EXECUTOR_CORES = systeminfo.getint("spark.executor.cores")
# Executor memory gets its own name. It was previously assigned to
# SPARK_DRIVER_MEMORY, which discarded the driver value read just above.
SPARK_EXECUTOR_MEMORY = _unquote(systeminfo.get("spark.executor.memory"))

logging.info("Using following system configuration: ")
for (each_key, each_val) in config_object.items(systeminfo.name):
    logging.info("%s:%s", each_key, each_val)

[2023-05-19 14:09:21,680] root {<ipython-input-1-0a2089590fb1>:22} INFO - Using following scale configuration: 
[2023-05-19 14:09:21,682] root {<ipython-input-1-0a2089590fb1>:24} INFO - scale_gb:1
[2023-05-19 14:09:21,684] root {<ipython-input-1-0a2089590fb1>:24} INFO - batch_k:100
[2023-05-19 14:09:21,687] root {<ipython-input-1-0a2089590fb1>:24} INFO - timespan_days:7
[2023-05-19 14:09:21,689] root {<ipython-input-1-0a2089590fb1>:24} INFO - droptable:True
[2023-05-19 14:09:21,690] root {<ipython-input-1-0a2089590fb1>:24} INFO - debugmode:True
[2023-05-19 14:09:21,692] root {<ipython-input-1-0a2089590fb1>:37} INFO - Using following system configuration: 
[2023-05-19 14:09:21,693] root {<ipython-input-1-0a2089590fb1>:39} INFO - spark.app.name:"Remote Spark"
[2023-05-19 14:09:21,695] root {<ipython-input-1-0a2089590fb1>:39} INFO - spark.master.hostpath:"spark://spark-master:7077"
[2023-05-19 14:09:21,697] root {<ipython-input-1-0a2089590fb1>:39} INFO - hive.metastore.uris:"thrift://hms-

In [2]:
import math
import pyspark
import pandas as pd
import numpy as np

from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions  import from_unixtime
from time import sleep

# Build the Hive-enabled SparkSession from the settings loaded from the
# configuration file, then log the library versions in use.

# Memory settings are read straight from the parsed SYSTEMINFO section so the
# driver and the executors each receive their own configured value; a single
# SPARK_DRIVER_MEMORY name cannot carry both.
spark_driver_memory = str(systeminfo.get("spark.driver.memory")).strip('"')
spark_executor_memory = str(systeminfo.get("spark.executor.memory")).strip('"')

spark_builder = (
    SparkSession.builder
    .appName(SPARK_APP_NAME)
    .master(SPARK_MASTER)
    .config("hive.metastore.uris", HIVE_HMS_HOST)
    .config("spark.sql.warehouse.dir", SPARK_WAREHOUSE_DIR)
    .config("spark.driver.memory", spark_driver_memory)
    .config("spark.executor.memory", spark_executor_memory)
    .enableHiveSupport()
)

# The core counts come from getint(), which yields None when the option is
# missing. Skip those so Spark keeps its default instead of being handed the
# string "None". The driver property is spelled "spark.driver.cores";
# "spark_driver_cores" is not a Spark property name.
if SPARK_DRIVER_CORES is not None:
    spark_builder = spark_builder.config("spark.driver.cores", SPARK_DRIVER_CORES)
if SPARK_EXECUTOR_CORES is not None:
    spark_builder = spark_builder.config("spark.executor.cores", SPARK_EXECUTOR_CORES)

spark = spark_builder.getOrCreate()

spark.sparkContext.setLogLevel("INFO")
# SQLContext is deprecated since Spark 3.0 in favour of SparkSession; it is
# kept because the query cells below call sqlContext.sql(...).
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)
logging.info("Spark Version: %s", spark.version)
logging.info("PySpark Version: %s", pyspark.__version__)
logging.info("Pandas Version: %s", pd.__version__)

[2023-05-19 14:09:22,882] numexpr.utils {utils.py:141} INFO - NumExpr defaulting to 8 threads.
[2023-05-19 14:09:37,014] root {<ipython-input-2-86992297f6c8>:26} INFO - Spark Version: 3.3.1
[2023-05-19 14:09:37,018] root {<ipython-input-2-86992297f6c8>:27} INFO - PySpark Version: 3.3.1
[2023-05-19 14:09:37,023] root {<ipython-input-2-86992297f6c8>:28} INFO - Pandas Version: 1.3.5


In [3]:
# Switch the session's current database to `sample`, then list its tables.
sqlContext.sql("use sample;")
df = sqlContext.sql("show tables;")
# Print up to 100 rows without truncating long cell values.
df.show(n=100, truncate=False)

+---------+-------------------+-----------+
|namespace|tableName          |isTemporary|
+---------+-------------------+-----------+
|sample   |tb_sev_u           |false      |
|sample   |tb_test            |false      |
|sample   |tb_test_num        |false      |
|sample   |tb_test_num_tmp    |false      |
|sample   |tb_test_qf_lastest |false      |
|sample   |tb_test_qf_stat    |false      |
|sample   |tb_test_qf_stat_log|false      |
+---------+-------------------+-----------+



In [4]:
# Dump sample.tb_test_qf_stat, sorted by `ad` and then `bd`.
df = sqlContext.sql(
    "select * from sample.tb_test_qf_stat order by ad, bd"
)
# Print up to 100 rows without truncating long cell values.
df.show(n=100, truncate=False)

+---+-----------+--------------------+---+---+---+-------------+----------+
|f22|f02        |f16                 |cnt|f06|f07|bd           |ad        |
+---+-----------+--------------------+---+---+---+-------------+----------+
|3  |19218983881|Great you found me !|16 |0  |7  |2023-05-12-20|2023051712|
|3  |19218983887|Great you found me !|11 |0  |7  |2023-05-12-20|2023051712|
|3  |19218983880|Great you found me !|17 |0  |7  |2023-05-12-20|2023051712|
|3  |19218983885|Great you found me !|14 |0  |7  |2023-05-12-20|2023051712|
|3  |19218983882|Great you found me !|11 |0  |7  |2023-05-12-20|2023051712|
|3  |19218983883|Great you found me !|12 |0  |7  |2023-05-12-21|2023051712|
|3  |19218983887|Great you found me !|13 |0  |7  |2023-05-12-21|2023051712|
|3  |19218983885|Great you found me !|11 |0  |7  |2023-05-12-21|2023051712|
|3  |19218983881|Great you found me !|13 |0  |7  |2023-05-12-21|2023051712|
|3  |19218983889|Great you found me !|11 |0  |7  |2023-05-12-21|2023051712|
|3  |1921898

In [5]:
# Dump sample.tb_test_qf_lastest (no explicit ordering is requested).
df = sqlContext.sql(
    "select * from sample.tb_test_qf_lastest;"
)
# Print up to 100 rows without truncating long cell values.
df.show(n=100, truncate=False)

+---+-----------+--------------------+---+-------------+---+---+
|f22|f02        |f16                 |cnt|bd           |f06|f07|
+---+-----------+--------------------+---+-------------+---+---+
|3  |19218983880|Great you found me !|16 |2023-05-17-08|0  |7  |
|3  |19218983881|Great you found me !|13 |2023-05-17-09|0  |7  |
|3  |19218983882|Great you found me !|13 |2023-05-17-10|0  |7  |
|3  |19218983883|Great you found me !|15 |2023-05-17-11|0  |7  |
|3  |19218983884|Great you found me !|17 |2023-05-17-11|0  |7  |
|3  |19218983885|Great you found me !|15 |2023-05-17-11|0  |7  |
|3  |19218983886|Great you found me !|12 |2023-05-17-11|0  |7  |
|3  |19218983887|Great you found me !|11 |2023-05-17-10|0  |7  |
|3  |19218983888|Great you found me !|16 |2023-05-17-11|0  |7  |
|3  |19218983889|Great you found me !|13 |2023-05-17-11|0  |7  |
+---+-----------+--------------------+---+-------------+---+---+



In [6]:
# Dump sample.tb_test_num (no explicit ordering is requested).
df = sqlContext.sql(
    " select * from sample.tb_test_num; "
)
# Print up to 100 rows without truncating long cell values.
df.show(n=100, truncate=False)

+---+-----------+-------------+-------------+---+---+
|f22|f02        |f_date       |l_date       |f06|f07|
+---+-----------+-------------+-------------+---+---+
|3  |19218983880|2023-05-12-20|2023-05-17-08|0  |7  |
|3  |19218983881|2023-05-12-20|2023-05-17-09|0  |7  |
|3  |19218983882|2023-05-12-20|2023-05-17-10|0  |7  |
|3  |19218983883|2023-05-12-21|2023-05-17-11|0  |7  |
|3  |19218983884|2023-05-12-22|2023-05-17-11|0  |7  |
|3  |19218983885|2023-05-12-20|2023-05-17-11|0  |7  |
|3  |19218983886|2023-05-12-22|2023-05-17-11|0  |7  |
|3  |19218983887|2023-05-12-20|2023-05-17-10|0  |7  |
|3  |19218983888|2023-05-12-22|2023-05-17-11|0  |7  |
|3  |19218983889|2023-05-12-21|2023-05-17-11|0  |7  |
+---+-----------+-------------+-------------+---+---+



In [7]:
# Shut the session down through SparkSession.stop(), which stops the
# underlying SparkContext as well; stopping only the context would leave the
# `spark` session object registered against a dead context.
spark.stop()