## PySpark - col, select, alias

- 스파크의 컬럼 단위 변환(컬럼 참조, 연산, 형 변환 등)에 쓰는 함수들은 대부분 **`pyspark.sql.functions`** 모듈에 있으며, 보통 `F`라는 별칭으로 import해서 사용한다

1. select

- 원하는 컬럼을 선택해서 불러올 수 있다
- 컬럼을 지정하는 방법은 세 가지: 문자열 방식(`"name"`), DataFrame 속성 방식(`df.name` — 컬럼명에 공백이나 특수문자가 있으면 위험), `col()` 방식(`F.col("name")`)



> 스파크에서 모든 변환 함수(select, withColumn, filter 등)는 원본을 수정하는 것이 아니라,
> 수정된 복사본(새로운 DataFrame)을 돌려준다는 것을 잊지 말자!

In [1]:
from pyspark.sql import (
    Row,
    SparkSession)
import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the session for this study notebook, connected to the
# standalone cluster master.
spark = SparkSession.builder.appName("col_select_study").master(
    "spark://spark-master:7077"
).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/29 07:58:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the practice CSV from the local filesystem. The first row supplies the
# column names and the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///workspace/data/col_select_practice.csv")
)
df.show()
df.printSchema()

                                                                                

+------+----+-----------+----------+---+------+
|emp_id|name|   nickname|department|age|salary|
+------+----+-----------+----------+---+------+
|     1| Kim|   DataKing|      Data| 29|  5200|
|     2| Lee|QueryMaster|      Data| 35|  6800|
|     3|Park| PeopleGuru|        HR| 41|  4500|
|     4|Choi|    FreshHR|        HR| 28|  4000|
|     5|Jung|     Closer|     Sales| 33|  6100|
|     6| Han|  TopSeller|     Sales| 39|  7300|
|     7| Seo|  YoungData|      Data| 26|  4800|
|     8|Yoon| RisingStar|     Sales| 30|  5900|
+------+----+-----------+----------+---+------+

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- nickname: string (nullable = true)
 |-- department: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [4]:
#select

In [5]:
# Approach 1: refer to the columns by name using plain strings.
df.select(
    "name",
    "age",
).show()

+----+---+
|name|age|
+----+---+
| Kim| 29|
| Lee| 35|
|Park| 41|
|Choi| 28|
|Jung| 33|
| Han| 39|
| Seo| 26|
|Yoon| 30|
+----+---+



In [6]:
# Approach 2: refer to the columns as attributes of the DataFrame.
# This form is risky when a column name contains spaces or special characters.
df.select(
    df.name,
    df.age,
).show()

+----+---+
|name|age|
+----+---+
| Kim| 29|
| Lee| 35|
|Park| 41|
|Choi| 28|
|Jung| 33|
| Han| 39|
| Seo| 26|
|Yoon| 30|
+----+---+



In [7]:
# Approach 3: F.col(). The `F` prefix is the alias given to
# pyspark.sql.functions at import time.
# Why use Column objects? Because operations can be applied to them.
df.select(
    F.col("name"),
    F.col("age"),
).show()

+----+---+
|name|age|
+----+---+
| Kim| 29|
| Lee| 35|
|Park| 41|
|Choi| 28|
|Jung| 33|
| Han| 39|
| Seo| 26|
|Yoon| 30|
+----+---+



In [8]:
# col 

In [9]:
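# A plain string cannot take part in column arithmetic: Python evaluates
# "age" + 1 itself and raises TypeError before Spark is ever involved.
# df.select("age"+1).show()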

In [10]:
# Arithmetic on a Column object produces a new column expression.
df.select(F.col("name"), F.col("age") + 1).show()

+----+---------+
|name|(age + 1)|
+----+---------+
| Kim|       30|
| Lee|       36|
|Park|       42|
|Choi|       29|
|Jung|       34|
| Han|       40|
| Seo|       27|
|Yoon|       31|
+----+---------+



In [11]:
# As soon as a column is computed, Spark names the result after the expression
# itself (e.g. "(age + 1)"), so always attach an explicit alias.

In [12]:
# Give the computed column a readable name instead of "(age + 1)".
df.select(F.col("name"), (F.col("age") + 1).alias("feature_age")).show()

+----+-----------+
|name|feature_age|
+----+-----------+
| Kim|         30|
| Lee|         36|
|Park|         42|
|Choi|         29|
|Jung|         34|
| Han|         40|
| Seo|         27|
|Yoon|         31|
+----+-----------+



In [13]:
# cast: convert a column to another data type (here `age` to string).
df.select(F.col("age"), F.col("age").cast("string").alias("age_str")).show()
# The schema printed here is unchanged because the original DataFrame is
# immutable; select() returned a new DataFrame and left `df` as it was.
df.printSchema()

+---+-------+
|age|age_str|
+---+-------+
| 29|     29|
| 35|     35|
| 41|     41|
| 28|     28|
| 33|     33|
| 39|     39|
| 26|     26|
| 30|     30|
+---+-------+

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- nickname: string (nullable = true)
 |-- department: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [17]:
# Keep only the rows whose department is one of the listed values.
df.select(F.col("name"), F.col("department")).filter(
    F.col("department").isin("Data", "HR")
).show()

+----+----------+
|name|department|
+----+----------+
| Kim|      Data|
| Lee|      Data|
|Park|        HR|
|Choi|        HR|
| Seo|      Data|
+----+----------+



In [18]:
# Keep only the rows with age from 30 to 40; between() includes both bounds.
df.select(F.col("name"), F.col("age")).filter(
    F.col("age").between(30, 40)
).show()

+----+---+
|name|age|
+----+---+
| Lee| 35|
|Jung| 33|
| Han| 39|
|Yoon| 30|
+----+---+



In [19]:
# Keep only the rows whose name contains the substring "Kim".
df.filter(
    F.col("name").contains("Kim")
).show()

+------+----+--------+----------+---+------+
|emp_id|name|nickname|department|age|salary|
+------+----+--------+----------+---+------+
|     1| Kim|DataKing|      Data| 29|  5200|
+------+----+--------+----------+---+------+



In [20]:
# Keep only the rows whose nickname begins with "Data".
df.filter(
    F.col("nickname").startswith("Data")
).show()

+------+----+--------+----------+---+------+
|emp_id|name|nickname|department|age|salary|
+------+----+--------+----------+---+------+
|     1| Kim|DataKing|      Data| 29|  5200|
+------+----+--------+----------+---+------+



In [21]:
# Shut down the Spark session now that the exercises are finished.
spark.stop()