In [1]:
import pyspark
from pyspark.sql import SparkSession


conf = pyspark.SparkConf().setAppName("gen").\
        setMaster("spark://master:7077").\
        set("spark.executor.instances", "2") 
spark = SparkSession.builder.config(conf=conf).getOrCreate()

spark


24/06/24 05:22:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
from pyspark.sql.functions import current_date, current_timestamp, \
    date_add, date_sub, datediff, months_between, to_date, lit, to_timestamp

In [9]:
dateDF = spark.range(10).\
    withColumn("today",current_date()).\
    withColumn("now",current_timestamp())

In [10]:
dateDF.explain()

== Physical Plan ==
*(1) Project [id#72L, 2024-06-24 AS today#74, 2024-06-24 05:36:24.455507 AS now#77]
+- *(1) Range (0, 10, step=1, splits=4)




In [12]:
dateDF.select(date_add("today", 100)).show()

+--------------------+
|date_add(today, 100)|
+--------------------+
|          2024-10-02|
|          2024-10-02|
|          2024-10-02|
|          2024-10-02|
|          2024-10-02|
|          2024-10-02|
|          2024-10-02|
|          2024-10-02|
|          2024-10-02|
|          2024-10-02|
+--------------------+



In [17]:
dateDF.select("today", date_add("today", 100).alias("100일"), date_sub("today", 100).alias("-100일")).show()

+----------+----------+----------+
|     today|     100일|    -100일|
+----------+----------+----------+
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
|2024-06-24|2024-10-02|2024-03-16|
+----------+----------+----------+



In [13]:
dateDF.select(date_sub("today", 100)).show()

+--------------------+
|date_sub(today, 100)|
+--------------------+
|          2024-03-16|
|          2024-03-16|
|          2024-03-16|
|          2024-03-16|
|          2024-03-16|
|          2024-03-16|
|          2024-03-16|
|          2024-03-16|
|          2024-03-16|
|          2024-03-16|
+--------------------+



In [14]:
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [18]:
dateDF.withColumn("week_ago", date_sub("today", 7)).\
            select(datediff("week_ago", "today")).show()


+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
+-------------------------+



In [24]:
dateDF.select(to_date(lit('2024-04-01')).alias('start'), 
             to_date(lit('2024-09-27')).alias('end')).\
        select(months_between("end", "start"), datediff("end", "start")).show()

+--------------------------------+--------------------+
|months_between(end, start, true)|datediff(end, start)|
+--------------------------------+--------------------+
|                      5.83870968|                 179|
|                      5.83870968|                 179|
|                      5.83870968|                 179|
|                      5.83870968|                 179|
|                      5.83870968|                 179|
|                      5.83870968|                 179|
|                      5.83870968|                 179|
|                      5.83870968|                 179|
|                      5.83870968|                 179|
|                      5.83870968|                 179|
+--------------------------------+--------------------+



In [25]:
dateDF.select(to_date(lit('2024-04-01')).alias('start'), 
             to_date(lit('2024-04-03')).alias('end')).\
        select(months_between("end", "start"), datediff("end", "start")).show()

+--------------------------------+--------------------+
|months_between(end, start, true)|datediff(end, start)|
+--------------------------------+--------------------+
|                      0.06451613|                   2|
|                      0.06451613|                   2|
|                      0.06451613|                   2|
|                      0.06451613|                   2|
|                      0.06451613|                   2|
|                      0.06451613|                   2|
|                      0.06451613|                   2|
|                      0.06451613|                   2|
|                      0.06451613|                   2|
|                      0.06451613|                   2|
+--------------------------------+--------------------+



In [31]:
dateFormat = "yyyy-dd-MM"
cleanDataDF = spark.range(1).select(to_date(lit('2024-12-11'),dateFormat)).show()

+-------------------------------+
|to_date(2024-12-11, yyyy-dd-MM)|
+-------------------------------+
|                     2024-11-12|
+-------------------------------+



In [32]:
# 날짜 데이터 파싱 
dateFormat = "yy/MM/dd"
cleanDataDF = spark.range(1).select(to_date(lit('24/06/24'), dateFormat)).show()


+---------------------------+
|to_date(24/06/24, yy/MM/dd)|
+---------------------------+
|                 2024-06-24|
+---------------------------+



In [65]:
import requests
import xml.etree.ElementTree as ET

url = "http://openapi.molit.go.kr/OpenAPI_ToolInstallPackage/service/rest/RTMSOBJSvc/getRTMSDataSvcAptTradeDev?serviceKey=34lIyWAciAsJlGtIZ4ltpLy2sLZDR%2BBRWvAv8RgoADNEd%2BKCgHe84XiSRUwL8JMMIubzsFW3ddjcNlhZHhvJIQ%3D%3D&pageNo=1&numOfRows=1000&LAWD_CD=11680&DEAL_YMD=202405"

column_list = []
data_list = []
check = 0
r = requests.get(url)
root = ET.fromstring(r.content)
for item in root.iter('item'):
    tmp = []
    if check==0:
        for child in item:
            # print(child.tag, child.text)
            column_list.append(child.tag)
            tmp.append(child.text)
            check=1
    else:
        for child in item:
            # print(child.tag, child.text)
            # column_list.append(child.tag)
            tmp.append(child.text)
            
        
    data_list.append(tmp)
import pandas as pd
df = pd.DataFrame(data_list,columns=column_list)        
    

In [69]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tzdata>=2022.7
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 KB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.22.4
  Downloading numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tzdata, numpy, pandas
Successfully installed numpy-2.0.0 pandas-2.2.2 tzdata-2024.1
Note: you may need to restart the kernel to use updated packages.


In [77]:
import xml.etree.ElementTree as ET
root = ET.fromstring(r.text)
items = root.iter(tag='item')
# for x in items:
#     for y in x:
#         print(y.tag, y.text)

apt =pd.DataFrame([{y.tag : y.text for y in x} for x in items])

In [79]:
apt.shape

(266, 32)

In [80]:
apt_spark_df = spark.createDataFrame(apt)

In [81]:
apt_spark_df.limit(3).show(truncate=False)

24/06/24 06:44:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 18:>                                                         (0 + 1) / 1]

+----------+--------+--------+----+------+--------------------+--------------------+----------------+------------------+------------------+----------+---+--------+------+------+-------+--------------+--------------+----------------+----------------+--------------+------------+---+---+----------+--------+------------+------+--------+---+--------------+--------+
|거래금액  |거래유형|건축년도|년  |도로명|도로명건물본번호코드|도로명건물부번호코드|도로명시군구코드|도로명일련번호코드|도로명지상지하코드|도로명코드|동 |등기일자|매도자|매수자|법정동 |법정동본번코드|법정동부번코드|법정동시군구코드|법정동읍면동코드|법정동지번코드|아파트      |월 |일 |일련번호  |전용면적|중개사소재지|지번  |지역코드|층 |해제사유발생일|해제여부|
+----------+--------+--------+----+------+--------------------+--------------------+----------------+------------------+------------------+----------+---+--------+------+------+-------+--------------+--------------+----------------+----------------+--------------+------------+---+---+----------+--------+------------+------+--------+---+--------------+--------+
|   143,000|중개거래|1997    |2024|언주로|00316               |00000     

                                                                                

In [82]:
apt['월'] + "-" + apt["일"]

0       5-1
1       5-3
2       5-3
3       5-3
4       5-3
       ... 
261    5-28
262    5-29
263    5-29
264    5-30
265    5-30
Length: 266, dtype: object

In [84]:
apt["날짜"] = apt['월'].apply(lambda x : "{:02d}".format(int(x))) + "-" + apt['일']

In [85]:
apt['날짜'] = apt[['월', '일']].apply(lambda x : "2024-" + "{:02d}".format(int(x[0])) + "-" + "{:02d}".format(int(x[1])) ,axis=1)

  apt['날짜'] = apt[['월', '일']].apply(lambda x : "2024-" + "{:02d}".format(int(x[0])) + "-" + "{:02d}".format(int(x[1])) ,axis=1)


In [94]:
from pyspark.sql.functions import concat, format_string,col
apt_spark_df2 = apt_spark_df.withColumn("날짜", concat(lit('2024-'), 
                        format_string("%02d", col("월").cast('int')), lit("-"), 
                        format_string("%02d", col("일").cast('int')))).limit(3).show()


+----------+--------+--------+----+------+--------------------+--------------------+----------------+------------------+------------------+----------+---+--------+------+------+-------+--------------+--------------+----------------+----------------+--------------+------------+---+---+----------+--------+------------+------+--------+---+--------------+--------+----------+
|  거래금액|거래유형|건축년도|  년|도로명|도로명건물본번호코드|도로명건물부번호코드|도로명시군구코드|도로명일련번호코드|도로명지상지하코드|도로명코드| 동|등기일자|매도자|매수자| 법정동|법정동본번코드|법정동부번코드|법정동시군구코드|법정동읍면동코드|법정동지번코드|      아파트| 월| 일|  일련번호|전용면적|중개사소재지|  지번|지역코드| 층|해제사유발생일|해제여부|      날짜|
+----------+--------+--------+----+------+--------------------+--------------------+----------------+------------------+------------------+----------+---+--------+------+------+-------+--------------+--------------+----------------+----------------+--------------+------------+---+---+----------+--------+------------+------+--------+---+--------------+--------+----------+
|   143,000|중개거래|    1997|2024|언주로|

In [97]:
from pyspark.sql.functions import concat, format_string,col
apt_spark_df2 = apt_spark_df.withColumn("날짜", concat(lit('2024-'), 
                        format_string("%02d", col("월").cast('int')), lit("-"), 
                        format_string("%02d", col("일").cast('int'))))


In [98]:
apt_spark_df2.explain()

== Physical Plan ==
*(1) Project [거래금액#243, 거래유형#244, 건축년도#245, 년#246, 도로명#247, 도로명건물본번호코드#248, 도로명건물부번호코드#249, 도로명시군구코드#250, 도로명일련번호코드#251, 도로명지상지하코드#252, 도로명코드#253, 동#254, 등기일자#255, 매도자#256, 매수자#257, 법정동#258, 법정동본번코드#259, 법정동부번코드#260, 법정동시군구코드#261, 법정동읍면동코드#262, 법정동지번코드#263, 아파트#264, 월#265, 일#266, ... 9 more fields]
+- *(1) Scan ExistingRDD[거래금액#243,거래유형#244,건축년도#245,년#246,도로명#247,도로명건물본번호코드#248,도로명건물부번호코드#249,도로명시군구코드#250,도로명일련번호코드#251,도로명지상지하코드#252,도로명코드#253,동#254,등기일자#255,매도자#256,매수자#257,법정동#258,법정동본번코드#259,법정동부번코드#260,법정동시군구코드#261,법정동읍면동코드#262,법정동지번코드#263,아파트#264,월#265,일#266,... 8 more fields]




In [99]:
apt_spark_df2.select("날짜").limit(3).show()

+----------+
|      날짜|
+----------+
|2024-05-01|
|2024-05-03|
|2024-05-03|
+----------+



In [100]:
apt_spark_df2.withColumn("날짜",to_date("날짜","yyyy-MM-dd")).limit(3).show()

+----------+--------+--------+----+------+--------------------+--------------------+----------------+------------------+------------------+----------+---+--------+------+------+-------+--------------+--------------+----------------+----------------+--------------+------------+---+---+----------+--------+------------+------+--------+---+--------------+--------+----------+
|  거래금액|거래유형|건축년도|  년|도로명|도로명건물본번호코드|도로명건물부번호코드|도로명시군구코드|도로명일련번호코드|도로명지상지하코드|도로명코드| 동|등기일자|매도자|매수자| 법정동|법정동본번코드|법정동부번코드|법정동시군구코드|법정동읍면동코드|법정동지번코드|      아파트| 월| 일|  일련번호|전용면적|중개사소재지|  지번|지역코드| 층|해제사유발생일|해제여부|      날짜|
+----------+--------+--------+----+------+--------------------+--------------------+----------------+------------------+------------------+----------+---+--------+------+------+-------+--------------+--------------+----------------+----------------+--------------+------------+---+---+----------+--------+------------+------+--------+---+--------------+--------+----------+
|   143,000|중개거래|    1997|2024|언주로|

In [101]:
apt_spark_df3 = apt_spark_df2.withColumn("날짜",to_date("날짜","yyyy-MM-dd"))

In [102]:
apt_spark_df3.printSchema()

root
 |-- 거래금액: string (nullable = true)
 |-- 거래유형: string (nullable = true)
 |-- 건축년도: string (nullable = true)
 |-- 년: string (nullable = true)
 |-- 도로명: string (nullable = true)
 |-- 도로명건물본번호코드: string (nullable = true)
 |-- 도로명건물부번호코드: string (nullable = true)
 |-- 도로명시군구코드: string (nullable = true)
 |-- 도로명일련번호코드: string (nullable = true)
 |-- 도로명지상지하코드: string (nullable = true)
 |-- 도로명코드: string (nullable = true)
 |-- 동: string (nullable = true)
 |-- 등기일자: string (nullable = true)
 |-- 매도자: string (nullable = true)
 |-- 매수자: string (nullable = true)
 |-- 법정동: string (nullable = true)
 |-- 법정동본번코드: string (nullable = true)
 |-- 법정동부번코드: string (nullable = true)
 |-- 법정동시군구코드: string (nullable = true)
 |-- 법정동읍면동코드: string (nullable = true)
 |-- 법정동지번코드: string (nullable = true)
 |-- 아파트: string (nullable = true)
 |-- 월: string (nullable = true)
 |-- 일: string (nullable = true)
 |-- 일련번호: string (nullable = true)
 |-- 전용면적: string (nullable = true)
 |-- 중개사소재지: string (nullable = 

In [103]:
apt_spark_df3.first()

Row(거래금액='   143,000', 거래유형='중개거래', 건축년도='1997', 년='2024', 도로명='언주로', 도로명건물본번호코드='00316', 도로명건물부번호코드='00000', 도로명시군구코드='11680', 도로명일련번호코드='01', 도로명지상지하코드='0', 도로명코드='3005086', 동=' ', 등기일자=' ', 매도자='개인', 매수자='개인', 법정동=' 역삼동', 법정동본번코드='0761', 법정동부번코드='0010', 법정동시군구코드='11680', 법정동읍면동코드='10100', 법정동지번코드='1', 아파트='대림역삼', 월='5', 일='1', 일련번호='11680-236', 전용면적='59.66', 중개사소재지='서울 강남구', 지번='761-10', 지역코드='11680', 층='12', 해제사유발생일=' ', 해제여부=' ', 날짜=datetime.date(2024, 5, 1))

In [104]:
apt["건축년도"] = apt["건축년도"].astype(int)

In [106]:
(2024 - apt["건축년도"]).mean()

np.float64(22.481203007518797)

In [108]:
apt_spark_df3 = apt_spark_df3.withColumn("건축년도",col("건축년도").cast("int"))

In [110]:
from pyspark.sql.functions import expr
apt_spark_df3.select("건축년도", expr("2024 - `건축년도`").alias("연식")).limit(3).show()

+--------+----+
|건축년도|연식|
+--------+----+
|    1997|  27|
|    1998|  26|
|    2016|   8|
+--------+----+



In [112]:
from pyspark.sql.functions import expr, mean, avg
apt_spark_df3.select("건축년도", expr("2024 - `건축년도`").alias("연식")).\
    select(mean("연식")).limit(3).show()


+------------------+
|         avg(연식)|
+------------------+
|22.481203007518797|
+------------------+



In [113]:
apt_spark_df3.createOrReplaceTempView("apt")

In [114]:
spark.sql("select * from apt limit 3").show()

+----------+--------+--------+----+------+--------------------+--------------------+----------------+------------------+------------------+----------+---+--------+------+------+-------+--------------+--------------+----------------+----------------+--------------+------------+---+---+----------+--------+------------+------+--------+---+--------------+--------+----------+
|  거래금액|거래유형|건축년도|  년|도로명|도로명건물본번호코드|도로명건물부번호코드|도로명시군구코드|도로명일련번호코드|도로명지상지하코드|도로명코드| 동|등기일자|매도자|매수자| 법정동|법정동본번코드|법정동부번코드|법정동시군구코드|법정동읍면동코드|법정동지번코드|      아파트| 월| 일|  일련번호|전용면적|중개사소재지|  지번|지역코드| 층|해제사유발생일|해제여부|      날짜|
+----------+--------+--------+----+------+--------------------+--------------------+----------------+------------------+------------------+----------+---+--------+------+------+-------+--------------+--------------+----------------+----------------+--------------+------------+---+---+----------+--------+------------+------+--------+---+--------------+--------+----------+
|   143,000|중개거래|    1997|2024|언주로|