In [1]:
import json
import datetime as dt

import pandas as pd
from pyspark.sql.functions import col, explode, monotonically_increasing_id, row_number
from pyspark.sql.window import Window
from pyspark.sql.types import *

In [2]:
# JDBC connection settings used when writing results back to the database.
# The URL passes a wallet directory through the TNS_ADMIN query parameter.
# NOTE(review): the account password is stored in plain text in the notebook;
# consider loading it from the environment or a secrets store before sharing.
JDBC = dict(
    url='jdbc:oracle:thin:@realestate_high?TNS_ADMIN=/home/big/study/db/Wallet_REALESTATE',
    props=dict(
        user='dw_realestate',
        password='123qwe!@#QWE',
    ),
)

In [6]:
from enum import Enum

# Data warehouse connection settings, exposed as an Enum.
class DataWarehouse(Enum):
    """Connection settings for the data warehouse, consumed by ``find_data``.

    ``URL.value`` is the JDBC URL and ``PROPS.value`` is the dict of
    connection properties (user / password). Both are derived from the
    ``JDBC`` dict defined in the configuration cell above, so the URL and
    credentials live in one place instead of being duplicated here.
    """
    URL = JDBC['url']
    # Copied so this member never aliases (or is mutated through) JDBC['props'].
    PROPS = dict(JDBC['props'])

In [3]:
def cal_std_day(befor_day):
    """Return the date ``befor_day`` days before now as a ``YYYYMMDD`` string.

    Args:
        befor_day: Number of days to subtract from the current time; it is
            passed to ``datetime.timedelta`` as the days argument.

    Returns:
        str: The year followed by the zero-padded month and day,
        e.g. ``'20220922'``.
    """
    target = dt.datetime.now() - dt.timedelta(days=befor_day)
    # ":02d" pads single-digit months and days with a leading zero.
    return f'{target.year}{target.month:02d}{target.day:02d}'

In [4]:
def find_data(config, table_name):
    """Read a table over JDBC; used to fetch data from the data warehouse / data mart.

    Args:
        config: Settings holder exposing ``URL.value`` (the JDBC URL) and
            ``PROPS.value`` (the connection properties), e.g. ``DataWarehouse``.
        table_name: Name of the table to read.

    Returns:
        The result of ``spark.read.jdbc`` for that table.

    Note:
        Relies on a global ``spark`` session, which is not created in the
        cells shown here.
    """
    jdbc_url = config.URL.value
    jdbc_props = config.PROPS.value
    return spark.read.jdbc(url=jdbc_url, table=table_name, properties=jdbc_props)

In [7]:
# Load the LOC table and keep the SIDO -> LOC_CODE pairs of the rows whose
# SIGUNGU is null (the output below shows one row per SIDO).
df_loc = find_data(DataWarehouse, 'LOC')
sido_rows = df_loc.filter(df_loc.SIGUNGU.isNull())
# The rows are collected to the driver and turned back into a DataFrame,
# giving a lookup table that is joined on SIDO in a later cell.
loc_code = sido_rows.select('SIDO', 'LOC_CODE').collect()
df_loc_code = spark.createDataFrame(loc_code)
df_loc_code.show()

                                                                                

+--------------+--------+
|          SIDO|LOC_CODE|
+--------------+--------+
|    부산광역시|   26000|
|    대구광역시|   27000|
|    인천광역시|   28000|
|    광주광역시|   29000|
|    대전광역시|   30000|
|    울산광역시|   31000|
|세종특별자치시|   36110|
|        경기도|   41000|
|        강원도|   42000|
|      충청북도|   43000|
|      충청남도|   44000|
|      전라북도|   45000|
|      전라남도|   46000|
|      경상북도|   47000|
|      경상남도|   48000|
|    서울특별시|   11000|
|제주특별자치도|   50000|
+--------------+--------+



In [9]:
# Read the raw JSON dump for one day. range(13, 14) yields only 13, so this
# loads the file dated 13 days before today.
# NOTE(review): tmp is overwritten on every iteration, so widening the range
# would keep only the last file read.
for i in range(13, 14):
    std_day = cal_std_day(i)
    path = f'/realestate_data/local_foreigner_corp/local_foreigner_corp_data_{std_day}.json'
    tmp = spark.read.json(path, encoding='UTF-8')

                                                                                

In [10]:
# Print the schema Spark inferred for the raw JSON file.
tmp.printSchema()

root
 |-- result: struct (nullable = true)
 |    |-- head: struct (nullable = true)
 |    |    |-- returnCode: string (nullable = true)
 |    |    |-- returnMessage: string (nullable = true)
 |    |    |-- totalCount: string (nullable = true)
 |    |-- items: struct (nullable = true)
 |    |    |-- item: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- adminRegn1Name: string (nullable = true)
 |    |    |    |    |-- adminRegn2Name: string (nullable = true)
 |    |    |    |    |-- enrNoCls: string (nullable = true)
 |    |    |    |    |-- resDate: string (nullable = true)
 |    |    |    |    |-- tot: string (nullable = true)



In [20]:
# Flatten the nested payload: every element of result.items.item becomes one
# row, and the fields of that struct become top-level columns.
# explode() does this inside Spark. It replaces the earlier chain of
# first()/createDataFrame() calls, which pulled each nesting level to the
# driver and raised when the file had no rows or the item array was empty.
df2 = (
    tmp
    .select(explode(col('result.items.item')).alias('item'))
    .select('item.*')
)
df2.show()

                                                                                

+--------------+--------------+--------+----------+---+
|adminRegn1Name|adminRegn2Name|enrNoCls|   resDate|tot|
+--------------+--------------+--------+----------+---+
|    서울특별시|              |  내국인|2022-09-22|261|
|    서울특별시|              |  외국인|2022-09-22|  3|
|    서울특별시|              |    법인|2022-09-22| 68|
|    서울특별시|              |  비법인|2022-09-22|  3|
|    부산광역시|              |  내국인|2022-09-22|183|
|    부산광역시|              |    법인|2022-09-22| 43|
|    부산광역시|              |  비법인|2022-09-22|  8|
|    대구광역시|              |  내국인|2022-09-22|104|
|    대구광역시|              |    법인|2022-09-22|  4|
|    인천광역시|              |  내국인|2022-09-22|211|
|    인천광역시|              |  외국인|2022-09-22|  5|
|    인천광역시|              |    법인|2022-09-22| 28|
|    인천광역시|              |  비법인|2022-09-22|  4|
|    광주광역시|              |  내국인|2022-09-22| 70|
|    광주광역시|              |  외국인|2022-09-22|  1|
|    광주광역시|              |    법인|2022-09-22| 20|
|    대전광역시|              |  내국인|2022-09-22| 57|
|    대전광역시|

OT_IDX 

RES_DATE

RES_REGN_CODE

OWNER_CLS

TOT

In [21]:
# Keep only the columns needed downstream and rename them:
# SIDO is the join key used in the next cell; the rest are output columns.
renamed_columns = [
    df2['adminRegn1Name'].alias('SIDO'),
    df2['resDate'].alias('RES_DATE'),
    df2['enrNoCls'].alias('OWNER_CLS'),
    df2['tot'].alias('TOT'),
]
df_type = df2.select(*renamed_columns)
df_type.show()

[Stage 13:>                                                         (0 + 1) / 1]                                                                                

+----------+----------+---------+---+
|      SIDO|  RES_DATE|OWNER_CLS|TOT|
+----------+----------+---------+---+
|서울특별시|2022-09-22|   내국인|261|
|서울특별시|2022-09-22|   외국인|  3|
|서울특별시|2022-09-22|     법인| 68|
|서울특별시|2022-09-22|   비법인|  3|
|부산광역시|2022-09-22|   내국인|183|
|부산광역시|2022-09-22|     법인| 43|
|부산광역시|2022-09-22|   비법인|  8|
|대구광역시|2022-09-22|   내국인|104|
|대구광역시|2022-09-22|     법인|  4|
|인천광역시|2022-09-22|   내국인|211|
|인천광역시|2022-09-22|   외국인|  5|
|인천광역시|2022-09-22|     법인| 28|
|인천광역시|2022-09-22|   비법인|  4|
|광주광역시|2022-09-22|   내국인| 70|
|광주광역시|2022-09-22|   외국인|  1|
|광주광역시|2022-09-22|     법인| 20|
|대전광역시|2022-09-22|   내국인| 57|
|대전광역시|2022-09-22|     법인|  3|
|울산광역시|2022-09-22|   내국인| 47|
|울산광역시|2022-09-22|     법인|  3|
+----------+----------+---------+---+
only showing top 20 rows



In [22]:
# Attach LOC_CODE to each row by SIDO name, cast the columns to their target
# types, then number the rows 1..N as OT_IDX.
# NOTE(review): join() with only on= is an inner join, so rows whose SIDO has
# no match in df_loc_code are dropped silently.
# NOTE(review): OT_IDX restarts at 1 on every run while the write cell below
# appends to the table - confirm OT_IDX does not need to be unique across loads.
index_window = Window.orderBy(monotonically_increasing_id())
own_type = (
    df_type
    .join(df_loc_code, on='SIDO')
    .select(
        col('LOC_CODE').alias('RES_REGN_CODE'),
        col('TOT').cast('int'),
        col('OWNER_CLS'),
        col('RES_DATE').cast('date'),
    )
    .withColumn('OT_IDX', row_number().over(index_window))
)
own_type.show()

                                                                                

+-------------+----+------------------------+----------+------+
|RES_REGN_CODE| TOT|               OWNER_CLS|  RES_DATE|OT_IDX|
+-------------+----+------------------------+----------+------+
|        42000| 577|                  내국인|2022-09-22|     1|
|        42000|   4|                  외국인|2022-09-22|     2|
|        42000|  32|                    법인|2022-09-22|     3|
|        42000|   1|                  비법인|2022-09-22|     4|
|        41000|   1|                    기타|2022-09-22|     5|
|        41000|1422|                  내국인|2022-09-22|     6|
|        41000|   9|                  외국인|2022-09-22|     7|
|        41000| 191|                    법인|2022-09-22|     8|
|        41000|   1|국가기관 및 지방자치단체|2022-09-22|     9|
|        48000| 351|                  내국인|2022-09-22|    10|
|        48000|  34|                    법인|2022-09-22|    11|
|        47000| 304|                  내국인|2022-09-22|    12|
|        47000|  75|                    법인|2022-09-22|    13|
|        47000|  1

In [24]:
# Append the prepared rows to the OWN_TYPE table over JDBC.
# NOTE(review): mode='append' means re-running this cell inserts the same rows again.
own_type.write.jdbc(
    url=JDBC['url'],
    table='OWN_TYPE',
    mode='append',
    properties=JDBC['props'],
)

                                                                                