In [1]:
from pyspark.sql import SparkSession, Row

In [2]:

spark = SparkSession.builder.master("local").appName("learn-sql").getOrCreate()


In [3]:
stocks = [
    ('Google', 'GOOGL', 'USA', 2984, 'USD'), 
    ('Netflix', 'NFLX', 'USA', 645, 'USD'),
    ('Amazon', 'AMZN', 'USA', 3518, 'USD'),
    ('Tesla', 'TSLA', 'USA', 1222, 'USD'),
    ('Tencent', '0700', 'Hong Kong', 483, 'HKD'),
    ('Toyota', '7203', 'Japan', 2006, 'JPY'),
    ('Samsung', '005930', 'Korea', 70600, 'KRW'),
    ('Kakao', '035720', 'Korea', 125000, 'KRW'),
]

schema = ["name", "ticker", "country", "price", "currency"]
df = spark.createDataFrame(data=stocks, schema=schema)

In [4]:
df.dtypes

[('name', 'string'),
 ('ticker', 'string'),
 ('country', 'string'),
 ('price', 'bigint'),
 ('currency', 'string')]

In [5]:
df.show()

+-------+------+---------+------+--------+
|   name|ticker|  country| price|currency|
+-------+------+---------+------+--------+
| Google| GOOGL|      USA|  2984|     USD|
|Netflix|  NFLX|      USA|   645|     USD|
| Amazon|  AMZN|      USA|  3518|     USD|
|  Tesla|  TSLA|      USA|  1222|     USD|
|Tencent|  0700|Hong Kong|   483|     HKD|
| Toyota|  7203|    Japan|  2006|     JPY|
|Samsung|005930|    Korea| 70600|     KRW|
|  Kakao|035720|    Korea|125000|     KRW|
+-------+------+---------+------+--------+



In [6]:
df.createOrReplaceTempView("stocks")

In [7]:
# selecting one column
spark.sql("select country from stocks").show()

+---------+
|  country|
+---------+
|      USA|
|      USA|
|      USA|
|      USA|
|Hong Kong|
|    Japan|
|    Korea|
|    Korea|
+---------+



In [8]:
# selecting multiple column
spark.sql("select name, price from stocks").show()

+-------+------+
|   name| price|
+-------+------+
| Google|  2984|
|Netflix|   645|
| Amazon|  3518|
|  Tesla|  1222|
|Tencent|   483|
| Toyota|  2006|
|Samsung| 70600|
|  Kakao|125000|
+-------+------+



In [9]:
# conditional
spark.sql("select name, price from stocks where country = 'Korea'").show()

+-------+------+
|   name| price|
+-------+------+
|Samsung| 70600|
|  Kakao|125000|
+-------+------+



In [10]:
# range
spark.sql("select name, price from stocks where price > 2000").show()

+-------+------+
|   name| price|
+-------+------+
| Google|  2984|
| Amazon|  3518|
| Toyota|  2006|
|Samsung| 70600|
|  Kakao|125000|
+-------+------+



In [11]:
# multiple conditional
spark.sql("select name, price from stocks where price > 2000 and country='USA'").show()

+------+-----+
|  name|price|
+------+-----+
|Google| 2984|
|Amazon| 3518|
+------+-----+



In [12]:
# like condition
spark.sql("select name, price from stocks where country like 'U%' and name not like '%e%'").show()

+------+-----+
|  name|price|
+------+-----+
|Amazon| 3518|
+------+-----+



In [13]:
# between condition
spark.sql("select name, price from stocks where price between 1000 and 10000").show()

+------+-----+
|  name|price|
+------+-----+
|Google| 2984|
|Amazon| 3518|
| Tesla| 1222|
|Toyota| 2006|
+------+-----+



In [14]:
# subset 1
spark.sql("select name, price, country from stocks where country = 'USA'").show()

+-------+-----+-------+
|   name|price|country|
+-------+-----+-------+
| Google| 2984|    USA|
|Netflix|  645|    USA|
| Amazon| 3518|    USA|
|  Tesla| 1222|    USA|
+-------+-----+-------+



In [15]:
# subset 2
spark.sql("select name, price, currency from stocks \
where currency='USD' and \
price > (select price from stocks where name = 'Tesla') ").show()

+------+-----+--------+
|  name|price|currency|
+------+-----+--------+
|Google| 2984|     USD|
|Amazon| 3518|     USD|
+------+-----+--------+



In [16]:
# order by asc
spark.sql("select name, price, country from stocks order by price asc").show()

+-------+------+---------+
|   name| price|  country|
+-------+------+---------+
|Tencent|   483|Hong Kong|
|Netflix|   645|      USA|
|  Tesla|  1222|      USA|
| Toyota|  2006|    Japan|
| Google|  2984|      USA|
| Amazon|  3518|      USA|
|Samsung| 70600|    Korea|
|  Kakao|125000|    Korea|
+-------+------+---------+



In [17]:
# order by desc
spark.sql("select name, price, country from stocks order by price desc").show()

+-------+------+---------+
|   name| price|  country|
+-------+------+---------+
|  Kakao|125000|    Korea|
|Samsung| 70600|    Korea|
| Amazon|  3518|      USA|
| Google|  2984|      USA|
| Toyota|  2006|    Japan|
|  Tesla|  1222|      USA|
|Netflix|   645|      USA|
|Tencent|   483|Hong Kong|
+-------+------+---------+



In [18]:
# order by length()
spark.sql('select name from stocks order by length(name)').show()

+-------+
|   name|
+-------+
|  Tesla|
|  Kakao|
| Amazon|
| Toyota|
| Google|
|Netflix|
|Samsung|
|Tencent|
+-------+



In [19]:
# sum()
spark.sql("select sum(price) from stocks where country='Korea'").show()

+----------+
|sum(price)|
+----------+
|    195600|
+----------+



In [20]:
# mean()
spark.sql("select mean(price) from stocks where country='Korea'").show()

+-----------+
|mean(price)|
+-----------+
|    97800.0|
+-----------+



In [21]:
# count()
spark.sql("select count(price) from stocks where country='Korea'").show()

+------------+
|count(price)|
+------------+
|           2|
+------------+



In [22]:
# in ()
spark.sql("select count(price) from stocks where country in ('Korea', 'USA') and price > 1000").show()

+------------+
|count(price)|
+------------+
|           5|
+------------+



In [25]:
# table 2개 합치기
earnings = [
    ('Google', 27.99, 'USD'), 
    ('Netflix', 2.56, 'USD'),
    ('Amazon', 6.12, 'USD'),
    ('Tesla', 1.86, 'USD'),
    ('Tencent', 11.01, 'HKD'),
    ('Toyota', 224.82, 'JPY'),
    ('Samsung', 1780., 'KRW'),
    ('Kakao', 705., 'KRW')
]
# earningsSchema = ["name", "eps", "currency"]
from pyspark.sql.types import MapType,StringType,IntegerType,FloatType,StructType, StructField

earningsSchema = StructType([ 
    StructField("name",StringType(),True), 
    StructField("eps",FloatType(),True), 
    StructField("currency",StringType(),True), 
])
earningsDF = spark.createDataFrame(data=earnings, schema=earningsSchema)
earningsDF.createOrReplaceTempView("earnings")

In [26]:
earningsDF.select("*").show()

+-------+------+--------+
|   name|   eps|currency|
+-------+------+--------+
| Google| 27.99|     USD|
|Netflix|  2.56|     USD|
| Amazon|  6.12|     USD|
|  Tesla|  1.86|     USD|
|Tencent| 11.01|     HKD|
| Toyota|224.82|     JPY|
|Samsung|1780.0|     KRW|
|  Kakao| 705.0|     KRW|
+-------+------+--------+



In [27]:
spark.sql("select * from stocks join earnings on stocks.name = earnings.name").show()

+-------+------+---------+------+--------+-------+------+--------+
|   name|ticker|  country| price|currency|   name|   eps|currency|
+-------+------+---------+------+--------+-------+------+--------+
| Amazon|  AMZN|      USA|  3518|     USD| Amazon|  6.12|     USD|
| Google| GOOGL|      USA|  2984|     USD| Google| 27.99|     USD|
|  Kakao|035720|    Korea|125000|     KRW|  Kakao| 705.0|     KRW|
|Netflix|  NFLX|      USA|   645|     USD|Netflix|  2.56|     USD|
|Samsung|005930|    Korea| 70600|     KRW|Samsung|1780.0|     KRW|
|Tencent|  0700|Hong Kong|   483|     HKD|Tencent| 11.01|     HKD|
|  Tesla|  TSLA|      USA|  1222|     USD|  Tesla|  1.86|     USD|
| Toyota|  7203|    Japan|  2006|     JPY| Toyota|224.82|     JPY|
+-------+------+---------+------+--------+-------+------+--------+



In [28]:
# 주가 수익 비율 (PER) - 회사가 버는 돈에 비해 얼마나 주식이 비싼가
spark.sql("select (stocks.price/earnings.eps) from stocks join earnings on stocks.name = earnings.name").show()

+------------------+
|     (price / eps)|
+------------------+
| 574.8366120563447|
| 106.6095042658442|
| 177.3049645390071|
| 251.9531306315913|
|39.662921348314605|
| 43.86920889728746|
|  656.989242258975|
| 8.922693419839167|
+------------------+

