In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()


dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
  ]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.show(truncate=False)


                                                                                

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [2]:
sc

In [3]:
deptDF.collect()

[Row(dept_name='Finance', dept_id=10),
 Row(dept_name='Marketing', dept_id=20),
 Row(dept_name='Sales', dept_id=30),
 Row(dept_name='IT', dept_id=40)]

In [4]:
deptDF.select('dept_name').collect() #collect는 액션에 관한 거다

[Row(dept_name='Finance'),
 Row(dept_name='Marketing'),
 Row(dept_name='Sales'),
 Row(dept_name='IT')]

# withColum

In [5]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"]

df = spark.createDataFrame(data=data, schema = columns)
df.show(truncate=False)


+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+



In [6]:
from pyspark.sql.functions import col, lit

In [7]:
df.withColumn("salary",col("salary").cast("Integer")).printSchema()
# 원래는 LONG이었는데 INTEGER로 바뀜

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [8]:
df.withColumn("salary",col("salary").cast("Integer")).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [9]:
df.withColumn("salary_t",col("salary").cast("Integer")).printSchema()
# 새로운 커럼 생성한다

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- salary_t: integer (nullable = true)



In [10]:
df.withColumn("salary_t",col("salary").cast("Integer")*1400).show()
# collect와 show는 액션

+---------+----------+--------+----------+------+------+--------+
|firstname|middlename|lastname|       dob|gender|salary|salary_t|
+---------+----------+--------+----------+------+------+--------+
|    James|          |   Smith|1991-04-01|     M|  3000| 4200000|
|  Michael|      Rose|        |2000-05-19|     M|  4000| 5600000|
|   Robert|          |Williams|1978-09-05|     M|  4000| 5600000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000| 5600000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|   -1400|
+---------+----------+--------+----------+------+------+--------+



In [11]:
df.withColumn("country", lit("korea"))
# 파생변수 생성

DataFrame[firstname: string, middlename: string, lastname: string, dob: string, gender: string, salary: bigint, country: string]

In [12]:
df.withColumn("country", lit("korea")).show()
# 앞서 취소하덜라도 qeue에 들어가있으므로 실행 x

+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|country|
+---------+----------+--------+----------+------+------+-------+
|    James|          |   Smith|1991-04-01|     M|  3000|  korea|
|  Michael|      Rose|        |2000-05-19|     M|  4000|  korea|
|   Robert|          |Williams|1978-09-05|     M|  4000|  korea|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|  korea|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|  korea|
+---------+----------+--------+----------+------+------+-------+



In [13]:
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [14]:
df2 = df.withColumn("country", lit("korea"))

In [15]:
df2.show()

+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|country|
+---------+----------+--------+----------+------+------+-------+
|    James|          |   Smith|1991-04-01|     M|  3000|  korea|
|  Michael|      Rose|        |2000-05-19|     M|  4000|  korea|
|   Robert|          |Williams|1978-09-05|     M|  4000|  korea|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|  korea|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|  korea|
+---------+----------+--------+----------+------+------+-------+



In [16]:
df2 = df.withColumnRenamed("salary","월급")

In [17]:
df2.show()

+---------+----------+--------+----------+------+----+
|firstname|middlename|lastname|       dob|gender|월급|
+---------+----------+--------+----------+------+----+
|    James|          |   Smith|1991-04-01|     M|3000|
|  Michael|      Rose|        |2000-05-19|     M|4000|
|   Robert|          |Williams|1978-09-05|     M|4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -1|
+---------+----------+--------+----------+------+----+



In [18]:
df2.drop("country").show()

+---------+----------+--------+----------+------+----+
|firstname|middlename|lastname|       dob|gender|월급|
+---------+----------+--------+----------+------+----+
|    James|          |   Smith|1991-04-01|     M|3000|
|  Michael|      Rose|        |2000-05-19|     M|4000|
|   Robert|          |Williams|1978-09-05|     M|4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -1|
+---------+----------+--------+----------+------+----+



In [19]:
df2 = df2.drop("country")

In [20]:
df2.show()

+---------+----------+--------+----------+------+----+
|firstname|middlename|lastname|       dob|gender|월급|
+---------+----------+--------+----------+------+----+
|    James|          |   Smith|1991-04-01|     M|3000|
|  Michael|      Rose|        |2000-05-19|     M|4000|
|   Robert|          |Williams|1978-09-05|     M|4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -1|
+---------+----------+--------+----------+------+----+



In [21]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [22]:
import pandas as pd

In [23]:
pandas_df = pd.DataFrame({'a' : [1,2,3], 'b' :[2,3,4]})


In [24]:
spark_df = spark.createDataFrame(pandas_df)

In [25]:
spark_df.show()

+---+---+
|  a|  b|
+---+---+
|  1|  2|
|  2|  3|
|  3|  4|
+---+---+



In [26]:
import requests

url = "https://www.starbucks.co.kr/store/getStore.do?r=3PS46TOGMA"


payload = {"in_biz_cds": "0",
"in_scodes": "0",
"ins_lat": "37.56682",
"ins_lng": "126.97865",
"search_text": "",
"p_sido_cd": "01",
"p_gugun_cd": "",
"isError": "true",
"in_distance": "0",
"in_biz_cd": "",
"iend": "1000",
"searchType": "C",
"set_date": "",
"rndCod": "NTM40TCD0S",
"all_store": "0",
"T03": "0",
"T01": "0",
"T27": "0",
"T12": "0",
"T09": "0",
"T30": "0",
"T05": "0",
"T22": "0",
"T21": "0",
"T10": "0",
"T36": "0",
"T43": "0",
"T48": "0",
"Z9999": "0",
"P02": "0",
"P10": "0",
"P50": "0",
"P20": "0",
"P60": "0",
"P30": "0",
"P70": "0",
"P40": "0",
"P80": "0",
"whcroad_yn": "0",
"P90": "0",
"P01": "0",
"new_bool": "0",}



In [27]:
r = requests.post(url,data=payload)
# r.json()["list"]

In [28]:
star = pd.DataFrame(r.json()["list"])

In [29]:
star['s_name'].isnull().sum()

0

In [30]:
star2 = star.copy()

In [31]:
star2.loc[1,'s_name'] = None

In [32]:
star2['s_name'].isnull().sum()

1

In [33]:
# star.dropna()
#결측치가 하나라도 있으면 닷

In [34]:
star.dropna(axis=1).isnull().sum().sum()

0

In [35]:
star.dropna(axis=1, thresh=100)

Unnamed: 0,seq,p_pro_seq,p_sido_cd,p_gugun_cd,s_code,s_name,tel,fax,sido_code,sido_name,...,p01,t05,t30,t36,t27,t29,t43,t48,z9999,p02
0,0,0,,,1509,역삼아레나빌딩,1522-3232,02-568-3763,01,서울,...,0,0,0,0,0,0,0,0,0,0
1,0,0,,,1434,논현역사거리,1522-3232,02-3442-3673,01,서울,...,0,0,0,0,0,0,0,0,0,0
2,0,0,,,1595,신사역성일빌딩,1522-3232,02-547-3859,01,서울,...,0,0,0,0,0,0,0,0,0,0
3,0,0,,,1527,국기원사거리,1522-3232,02-568-3669,01,서울,...,0,0,0,0,0,0,0,0,0,0
4,0,0,,,1468,대치재경빌딩,1522-3232,02-568-3705,01,서울,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,0,0,,,838,사가정역,1522-3232,02-435-8823,01,서울,...,0,0,0,0,0,0,0,0,0,0
608,0,0,,,493,상봉역,1522-3232,02-433-8486,01,서울,...,0,0,0,0,0,0,0,0,0,0
609,0,0,,,1668,묵동,1522-3232,02-971-3937,01,서울,...,0,0,0,0,0,0,0,0,0,0
610,0,0,,,2002,양원역,1522-3232,02-433-4308,01,서울,...,0,0,0,0,0,0,0,0,0,0


In [36]:
starbucks_df = spark.createDataFrame(star[['s_name']])
# 결측가 1개있어도된다

In [37]:
starbucks_df.show()

+-----------------+
|           s_name|
+-----------------+
|   역삼아레나빌딩|
|     논현역사거리|
|   신사역성일빌딩|
|     국기원사거리|
|     대치재경빌딩|
|         봉은사역|
|   압구정윤성빌딩|
|     코엑스별마당|
|  삼성역섬유센터R|
|          압구정R|
|          수서역R|
|    양재강남빌딩R|
|    선릉동신빌딩R|
|   봉은사로선정릉|
|       강남오거리|
|스타필드코엑스몰R|
|     강남구청정문|
|         도곡공원|
|            강남R|
|   대치은마사거리|
+-----------------+
only showing top 20 rows



In [38]:
star = pd.DataFrame(r.json()['list'])
starbucks_df = spark.createDataFrame(star.dropna(axis=1))


In [39]:
starbucks_df.limit(1).show()

24/06/18 03:13:03 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---+---------+---------+----------+------+--------------+---------+-----------+---------+---------+----------+----------+-------------------------------+--------------------+--------+-----------+-------+-------+-----------+------------+--------------+---------------+----------------+--------+--------+--------+----------+--------+--------+---------+--------------------+----------+---------+----+------+----+--------+---------+------+-------+--------+---------+-------------+-------------------------------+---------+-----------------------+----------+-----+-------+---------+-----------+---------------+---------+----------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-----+---+
|seq|p_pro_seq|p_sido_cd|p_gugun_cd|s_code|        s_name|      tel|        fax|sido_code|sido_name|gugun_code|gugun_name|                           addr|         theme_state|new_bool|search_text|ins_lat|ins_lng|in_distance|out_distance|all_search_cnt|ad

In [40]:
# star

In [41]:
# columns = ["s_code","s_name","open_dt","espresso","addr"]
# temp = spark.createDataFrame(data=starbucks_df, schema = columns)
starbucks_df.select("s_code","s_name","open_dt","espresso","addr").limit(3).show()

+------+--------------+--------+--------+-------------------------------+
|s_code|        s_name| open_dt|espresso|                           addr|
+------+--------------+--------+--------+-------------------------------+
|  1509|역삼아레나빌딩|20190613|        |서울특별시 강남구 역삼동 721...|
|  1434|  논현역사거리|20181123|        |서울특별시 강남구 논현동 142...|
|  1595|신사역성일빌딩|20191219|        |서울특별시 강남구 논현동 18-...|
+------+--------------+--------+--------+-------------------------------+



In [42]:
star_df = starbucks_df.select("s_code","s_name","open_dt","espresso","addr")

In [43]:
star_df

DataFrame[s_code: string, s_name: string, open_dt: string, espresso: string, addr: string]

In [44]:
star_df.limit(2).show()

+------+--------------+--------+--------+-------------------------------+
|s_code|        s_name| open_dt|espresso|                           addr|
+------+--------------+--------+--------+-------------------------------+
|  1509|역삼아레나빌딩|20190613|        |서울특별시 강남구 역삼동 721...|
|  1434|  논현역사거리|20181123|        |서울특별시 강남구 논현동 142...|
+------+--------------+--------+--------+-------------------------------+



s_name = 매장이름
s_code = 매장코드
open_dt = 개장일
addr = 주소


In [45]:
star_df = star_df.withColumnsRenamed({"s_name" : "매장이름","s_code":"매장코드" , "open_dt" : "개장일" , "addr" :"주소"})
# df2 = df.withColumnRenamed("salary","월급")
# star_df.withColumnsRenamed

In [46]:
star_df.toDF(*['매장이름','매장코드','개장일','espresson','주소']).limit(2).show()

+--------+--------------+--------+---------+-------------------------------+
|매장이름|      매장코드|  개장일|espresson|                           주소|
+--------+--------------+--------+---------+-------------------------------+
|    1509|역삼아레나빌딩|20190613|         |서울특별시 강남구 역삼동 721...|
|    1434|  논현역사거리|20181123|         |서울특별시 강남구 논현동 142...|
+--------+--------------+--------+---------+-------------------------------+



In [79]:
star_df2 = star_df.toDF(*['매장이름','매장코드','개장일','espresso','주소'])

In [48]:
from pyspark.sql.functions import split

In [66]:
star_df2.show()

+--------+-----------------+--------+--------+-------------------------------+
|매장이름|         매장코드|  개장일|espresso|                           주소|
+--------+-----------------+--------+--------+-------------------------------+
|    1509|   역삼아레나빌딩|20190613|        |서울특별시 강남구 역삼동 721...|
|    1434|     논현역사거리|20181123|        |서울특별시 강남구 논현동 142...|
|    1595|   신사역성일빌딩|20191219|        |서울특별시 강남구 논현동 18-...|
|    1527|     국기원사거리|20190731|        |서울특별시 강남구 역삼동 648...|
|    1468|     대치재경빌딩|20190214|        |서울특별시 강남구 대치동 599...|
|    1640|         봉은사역|20200528|        |서울특별시 강남구 삼성동 108...|
|    1650|   압구정윤성빌딩|20200529|        |서울특별시 강남구 신사동 592...|
|    1626|     코엑스별마당|20200416|        |서울특별시 강남구 삼성동 159...|
|    1438|  삼성역섬유센터R|20181123|        |서울특별시 강남구 대치동 944...|
|    1411|          압구정R|20180831|        | 서울특별시 강남구 신사동 621-1|
|    1389|          수서역R|20180627|        |서울특별시 강남구 수서동 715...|
|    1404|    양재강남빌딩R|20180824|        |서울특별시 강남구 도곡동 956...|
|    1355|    선릉동신빌딩R|201803

In [67]:
star_df2.withColumn('split_col', split(star_df2['주소']," ")).limit(2).show()

+--------+--------------+--------+--------+-------------------------------+-------------------------------+
|매장이름|      매장코드|  개장일|espresso|                           주소|                      split_col|
+--------+--------------+--------+--------+-------------------------------+-------------------------------+
|    1509|역삼아레나빌딩|20190613|        |서울특별시 강남구 역삼동 721...|[서울특별시, 강남구, 역삼동,...|
|    1434|  논현역사거리|20181123|        |서울특별시 강남구 논현동 142...|[서울특별시, 강남구, 논현동,...|
+--------+--------------+--------+--------+-------------------------------+-------------------------------+



In [68]:
star_df2 = star_df2.withColumn('split_col', split(star_df2['주소']," "))

In [69]:
star_df2.show()

+--------+-----------------+--------+--------+-------------------------------+-------------------------------+
|매장이름|         매장코드|  개장일|espresso|                           주소|                      split_col|
+--------+-----------------+--------+--------+-------------------------------+-------------------------------+
|    1509|   역삼아레나빌딩|20190613|        |서울특별시 강남구 역삼동 721...|[서울특별시, 강남구, 역삼동,...|
|    1434|     논현역사거리|20181123|        |서울특별시 강남구 논현동 142...|[서울특별시, 강남구, 논현동,...|
|    1595|   신사역성일빌딩|20191219|        |서울특별시 강남구 논현동 18-...|[서울특별시, 강남구, 논현동,...|
|    1527|     국기원사거리|20190731|        |서울특별시 강남구 역삼동 648...|[서울특별시, 강남구, 역삼동,...|
|    1468|     대치재경빌딩|20190214|        |서울특별시 강남구 대치동 599...|[서울특별시, 강남구, 대치동,...|
|    1640|         봉은사역|20200528|        |서울특별시 강남구 삼성동 108...|[서울특별시, 강남구, 삼성동,...|
|    1650|   압구정윤성빌딩|20200529|        |서울특별시 강남구 신사동 592...|[서울특별시, 강남구, 신사동,...|
|    1626|     코엑스별마당|20200416|        |서울특별시 강남구 삼성동 159...|[서울특별시, 강남구, 삼성동,...|
|    1438|  삼성역섬유

In [70]:
star_df2 = star_df2.withColumn('구', star_df2["split_col"][1])

In [71]:
star_df2[['split_col']].show()

+-------------------------------+
|                      split_col|
+-------------------------------+
|[서울특별시, 강남구, 역삼동,...|
|[서울특별시, 강남구, 논현동,...|
|[서울특별시, 강남구, 논현동,...|
|[서울특별시, 강남구, 역삼동,...|
|[서울특별시, 강남구, 대치동,...|
|[서울특별시, 강남구, 삼성동,...|
|[서울특별시, 강남구, 신사동,...|
|[서울특별시, 강남구, 삼성동,...|
|[서울특별시, 강남구, 대치동,...|
|[서울특별시, 강남구, 신사동,...|
|[서울특별시, 강남구, 수서동,...|
|[서울특별시, 강남구, 도곡동,...|
|[서울특별시, 강남구, 삼성동,...|
|[서울특별시, 강남구, 삼성동,...|
|[서울특별시, 강남구, 역삼동,...|
|[서울특별시, 강남구, 삼성동,...|
|[서울특별시, 강남구, 청담동,...|
|[서울특별시, 강남구, 역삼동,...|
|[서울특별시, 강남구, 역삼동,...|
|[서울특별시, 강남구, 대치동,...|
+-------------------------------+
only showing top 20 rows



In [72]:
star_df2.printSchema()

root
 |-- 매장이름: string (nullable = true)
 |-- 매장코드: string (nullable = true)
 |-- 개장일: string (nullable = true)
 |-- espresso: string (nullable = true)
 |-- 주소: string (nullable = true)
 |-- split_col: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 구: string (nullable = true)



In [73]:
star_df2.groupby("구").count().show()

+--------+-----+
|      구|count|
+--------+-----+
|  도봉구|    7|
|  마포구|   36|
|  강서구|   27|
|  강남구|   91|
|  광진구|   19|
|  관악구|   12|
|  동작구|   11|
|  노원구|   14|
|서대문구|   21|
|  금천구|   13|
|  성북구|   15|
|  강북구|    6|
|  서초구|   48|
|  송파구|   37|
|  은평구|   14|
|  양천구|   17|
|  중랑구|    8|
|  강동구|   17|
|  용산구|   24|
|  구로구|   14|
+--------+-----+
only showing top 20 rows



In [74]:
star_df2.groupby("구")\
    .count()\
    .sort("count",ascending=False)\
    .show()

+--------+-----+
|      구|count|
+--------+-----+
|  강남구|   91|
|    중구|   52|
|  서초구|   48|
|영등포구|   43|
|  종로구|   39|
|  송파구|   37|
|  마포구|   36|
|  강서구|   27|
|  용산구|   24|
|서대문구|   21|
|  광진구|   19|
|  양천구|   17|
|  강동구|   17|
|  성북구|   15|
|  노원구|   14|
|  은평구|   14|
|  구로구|   14|
|  성동구|   14|
|  금천구|   13|
|동대문구|   13|
+--------+-----+
only showing top 20 rows



In [58]:
# espresso 컬럼삭제

In [75]:
star_df2 = star_df2.drop("espresso","split_col").show()

+--------+-----------------+--------+-------------------------------+------+
|매장이름|         매장코드|  개장일|                           주소|    구|
+--------+-----------------+--------+-------------------------------+------+
|    1509|   역삼아레나빌딩|20190613|서울특별시 강남구 역삼동 721...|강남구|
|    1434|     논현역사거리|20181123|서울특별시 강남구 논현동 142...|강남구|
|    1595|   신사역성일빌딩|20191219|서울특별시 강남구 논현동 18-...|강남구|
|    1527|     국기원사거리|20190731|서울특별시 강남구 역삼동 648...|강남구|
|    1468|     대치재경빌딩|20190214|서울특별시 강남구 대치동 599...|강남구|
|    1640|         봉은사역|20200528|서울특별시 강남구 삼성동 108...|강남구|
|    1650|   압구정윤성빌딩|20200529|서울특별시 강남구 신사동 592...|강남구|
|    1626|     코엑스별마당|20200416|서울특별시 강남구 삼성동 159...|강남구|
|    1438|  삼성역섬유센터R|20181123|서울특별시 강남구 대치동 944...|강남구|
|    1411|          압구정R|20180831| 서울특별시 강남구 신사동 621-1|강남구|
|    1389|          수서역R|20180627|서울특별시 강남구 수서동 715...|강남구|
|    1404|    양재강남빌딩R|20180824|서울특별시 강남구 도곡동 956...|강남구|
|    1355|    선릉동신빌딩R|20180330|서울특별시 강남구 삼성동 141...|강남구|
|    1348|   봉은사로선정릉|20180306|서울특별시 강남

In [80]:
url = "https://www.starbucks.co.kr/store/getStore.do?r=3PS46TOGMA"


payload = {"in_biz_cds": "0",
"in_scodes": "0",
"ins_lat": "37.56682",
"ins_lng": "126.97865",
"search_text": "",
"p_sido_cd": "01",
"p_gugun_cd": "",
"isError": "true",
"in_distance": "0",
"in_biz_cd": "",
"iend": "1000",
"searchType": "C",
"set_date": "",
"rndCod": "NTM40TCD0S",
"all_store": "0",
"T03": "0",
"T01": "0",
"T27": "0",
"T12": "0",
"T09": "0",
"T30": "0",
"T05": "0",
"T22": "0",
"T21": "0",
"T10": "0",
"T36": "0",
"T43": "0",
"T48": "0",
"Z9999": "0",
"P02": "0",
"P10": "0",
"P50": "0",
"P20": "0",
"P60": "0",
"P30": "0",
"P70": "0",
"P40": "0",
"P80": "0",
"whcroad_yn": "0",
"P90": "0",
"P01": "0",
"new_bool": "0",}




star = pd.DataFrame(r.json()['list'])
starbucks_df = spark.createDataFrame(star.dropna(axis=1))


star_df = starbucks_df.select('s_name', "s_code", "open_dt", "espresso", "addr")


# 컬럼명 변경 - 첫번째 방법 
star_df.withColumnRenamed('s_name', '매장이름').\
    withColumnRenamed('s_code', '매장코드').\
    withColumnRenamed('open_dt', '개장일').\
    withColumnRenamed('addr', '주소')

# 두번째 방법 
star_df2= star_df.toDF(*['매장이름', '매장코드', '개장일','espresso', '주소'])


from pyspark.sql.functions import split

star_df2 = star_df2.withColumn('split_col', split(star_df2['주소'], " "))

star_df2 = star_df2.withColumn('구', star_df2["split_col"][1])

star_df2.groupby('구')\
    .count()\
    .sort('count', ascending=False)\
    .show()


star_df2 = star_df2.drop('espresso', 'split_col')


+--------+-----+
|      구|count|
+--------+-----+
|  강남구|   91|
|    중구|   52|
|  서초구|   48|
|영등포구|   43|
|  종로구|   39|
|  송파구|   37|
|  마포구|   36|
|  강서구|   27|
|  용산구|   24|
|서대문구|   21|
|  광진구|   19|
|  양천구|   17|
|  강동구|   17|
|  성북구|   15|
|  노원구|   14|
|  은평구|   14|
|  구로구|   14|
|  성동구|   14|
|  금천구|   13|
|동대문구|   13|
+--------+-----+
only showing top 20 rows



In [81]:
star_df2.createOrReplaceTempView("starbucks")

In [82]:
spark.sql("select * from starbucks").show()

+-----------------+--------+--------+-------------------------------+------+
|         매장이름|매장코드|  개장일|                           주소|    구|
+-----------------+--------+--------+-------------------------------+------+
|   역삼아레나빌딩|    1509|20190613|서울특별시 강남구 역삼동 721...|강남구|
|     논현역사거리|    1434|20181123|서울특별시 강남구 논현동 142...|강남구|
|   신사역성일빌딩|    1595|20191219|서울특별시 강남구 논현동 18-...|강남구|
|     국기원사거리|    1527|20190731|서울특별시 강남구 역삼동 648...|강남구|
|     대치재경빌딩|    1468|20190214|서울특별시 강남구 대치동 599...|강남구|
|         봉은사역|    1640|20200528|서울특별시 강남구 삼성동 108...|강남구|
|   압구정윤성빌딩|    1650|20200529|서울특별시 강남구 신사동 592...|강남구|
|     코엑스별마당|    1626|20200416|서울특별시 강남구 삼성동 159...|강남구|
|  삼성역섬유센터R|    1438|20181123|서울특별시 강남구 대치동 944...|강남구|
|          압구정R|    1411|20180831| 서울특별시 강남구 신사동 621-1|강남구|
|          수서역R|    1389|20180627|서울특별시 강남구 수서동 715...|강남구|
|    양재강남빌딩R|    1404|20180824|서울특별시 강남구 도곡동 956...|강남구|
|    선릉동신빌딩R|    1355|20180330|서울특별시 강남구 삼성동 141...|강남구|
|   봉은사로선정릉|    1348|20180306|서울특별시 강남

In [87]:
star_df2.printSchema

<bound method DataFrame.printSchema of DataFrame[매장이름: string, 매장코드: string, 개장일: string, 주소: string, 구: string]>

In [91]:
spark.sql("SELECT `구`, COUNT(*) AS count FROM starbucks GROUP BY `구`").show()

+--------+-----+
|      구|count|
+--------+-----+
|  도봉구|    7|
|  마포구|   36|
|  강서구|   27|
|  강남구|   91|
|  광진구|   19|
|  관악구|   12|
|  동작구|   11|
|  노원구|   14|
|서대문구|   21|
|  금천구|   13|
|  성북구|   15|
|  강북구|    6|
|  서초구|   48|
|  송파구|   37|
|  은평구|   14|
|  양천구|   17|
|  중랑구|    8|
|  강동구|   17|
|  용산구|   24|
|  구로구|   14|
+--------+-----+
only showing top 20 rows



In [None]:
starbucks_df = spark.createDataFrame(star[['addr','s_name','sido_cd']])

In [None]:
# spark_json_df = spark.read.json(r.json()["list"])
# Py4JJavaError 
# 파이썬과 자바를 연결시켜주는데 원래는 자바로 도는데 우리는 파이썬으로하고있다.