## PySpark - Null(결측치) 처리

`df.na` 제공

1. 제거 drop()
2. 채우기 fill()

> Spark DataFrame은 한번 만들어지면 절대 수정할 수 없다! 불변성(immutability)!
> 
> 변경된 결과를 반드시 변수에 다시 담아야 한다!

In [2]:
from pyspark.sql import (
    Row,
    SparkSession)
import pyspark.sql.functions as F
from pyspark.sql.functions import col, isnan, when, count
from pyspark.sql.types import *

In [3]:
# Get (or create) the session named "drop-fill" on the Spark master at
# spark-master:7077.
spark = SparkSession.builder.appName("drop-fill").master(
    "spark://spark-master:7077"
).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/30 13:27:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Sample rows as (id, name, occupation, salary, date); None marks a missing
# value. Row 5 has nothing but its id.
data = [
    (1, "Alice", "Engineer", 5000, "2023-01-10"),
    (2, "Bob", None, None, "2023-01-15"),
    (3, None, "Designer", 4000, None),
    (4, "David", "Engineer", None, "2023-02-01"),
    (5, None, None, None, None),
]

# Every column is nullable; "date" starts out as a plain string.
schema = StructType([
    StructField(field_name, field_type, nullable=True)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("occupation", StringType()),
        ("salary", IntegerType()),
        ("date", StringType()),
    )
])

In [5]:
# Build the DataFrame with the explicit schema, then print its rows and
# column types.
df = spark.createDataFrame(data=data, schema=schema)
df.show()
df.printSchema()

                                                                                

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
|  2|  Bob|      NULL|  NULL|2023-01-15|
|  3| NULL|  Designer|  4000|      NULL|
|  4|David|  Engineer|  NULL|2023-02-01|
|  5| NULL|      NULL|  NULL|      NULL|
+---+-----+----------+------+----------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- date: string (nullable = true)



In [6]:
# date 컬럼: 문자열 -> 날짜(date) 타입으로 변환

In [7]:
# Replace the string "date" column with its parsed date value, and rebind df
# because withColumn returns a new DataFrame.
df = df.withColumn("date", F.to_date(F.col("date")))
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- date: date (nullable = true)



**pandas의 `isna().sum()`에 해당하는 메서드가 Spark DataFrame에는 없다! 대신 `isNull()`과 집계 함수를 조합해 직접 센다.**

In [8]:
# Count the rows whose salary is NULL: keep only those rows, then count them.
df.filter(F.col("salary").isNull()).count()

3

In [9]:
# Null count for every column in one pass: when() produces a value only where
# the column is NULL, and count() skips the NULLs it yields for all other rows.
df.select(
    *(
        F.count(F.when(F.col(name).isNull(), name)).alias(name)
        for name in df.columns
    )
).show()

+---+----+----------+------+----+
| id|name|occupation|salary|date|
+---+----+----------+------+----+
|  0|   2|         2|     3|   2|
+---+----+----------+------+----+



In [10]:
# The same filter can be written as a SQL-style predicate string.
df.filter("salary IS NULL").count()

3

In [11]:
# drop

In [12]:
# how="any": drop a row if at least one of its columns is NULL.
# Only the fully populated row (id 1) survives.
df.na.drop(how="any").show()

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
+---+-----+----------+------+----------+



In [13]:
# how="all": drop a row only if every column is NULL.
# Nothing is dropped here because id is set on every row, including row 5.
df.na.drop(how='all').show()

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
|  2|  Bob|      NULL|  NULL|2023-01-15|
|  3| NULL|  Designer|  4000|      NULL|
|  4|David|  Engineer|  NULL|2023-02-01|
|  5| NULL|      NULL|  NULL|      NULL|
+---+-----+----------+------+----------+



In [14]:
# subset: only the listed columns are checked, so rows with a NULL salary are
# dropped while NULLs in other columns (e.g. name/date on row 3) are tolerated.
df.na.drop(subset=["salary"]).show()

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
|  3| NULL|  Designer|  4000|      NULL|
+---+-----+----------+------+----------+



In [15]:
# thresh=2: keep rows that have at least 2 non-NULL values.
# Row 5 has only its id, so it is the one row removed.
df.na.drop(thresh=2).show()

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
|  2|  Bob|      NULL|  NULL|2023-01-15|
|  3| NULL|  Designer|  4000|      NULL|
|  4|David|  Engineer|  NULL|2023-02-01|
+---+-----+----------+------+----------+



In [16]:
# Nothing has changed: every drop call above returned a new DataFrame and
# left df itself untouched.
df.show()

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
|  2|  Bob|      NULL|  NULL|2023-01-15|
|  3| NULL|  Designer|  4000|      NULL|
|  4|David|  Engineer|  NULL|2023-02-01|
|  5| NULL|      NULL|  NULL|      NULL|
+---+-----+----------+------+----------+



In [17]:
# To keep a transformation's result, bind it to a variable.
thresh_drop=df.na.drop(thresh=2)
thresh_drop.show()

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
|  2|  Bob|      NULL|  NULL|2023-01-15|
|  3| NULL|  Designer|  4000|      NULL|
|  4|David|  Engineer|  NULL|2023-02-01|
+---+-----+----------+------+----------+



In [18]:
# fill

In [19]:
# A string replacement is applied to string columns only (name, occupation);
# the integer salary and the date column keep their NULLs.
df.na.fill("Unknown").show()

+---+-------+----------+------+----------+
| id|   name|occupation|salary|      date|
+---+-------+----------+------+----------+
|  1|  Alice|  Engineer|  5000|2023-01-10|
|  2|    Bob|   Unknown|  NULL|2023-01-15|
|  3|Unknown|  Designer|  4000|      NULL|
|  4|  David|  Engineer|  NULL|2023-02-01|
|  5|Unknown|   Unknown|  NULL|      NULL|
+---+-------+----------+------+----------+



In [20]:
# A numeric replacement is applied to numeric columns only (salary here);
# string and date columns keep their NULLs.
df.na.fill(0).show()

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
|  2|  Bob|      NULL|     0|2023-01-15|
|  3| NULL|  Designer|  4000|      NULL|
|  4|David|  Engineer|     0|2023-02-01|
|  5| NULL|      NULL|     0|      NULL|
+---+-----+----------+------+----------+



In [21]:
# One fill() call with a per-column mapping: text columns get "Unknown",
# salary gets 0. The date column is not listed, so its NULLs remain.
df_filled = df.na.fill({"name": "Unknown", "occupation": "Unknown", "salary": 0})

df_filled.show()

+---+-------+----------+------+----------+
| id|   name|occupation|salary|      date|
+---+-------+----------+------+----------+
|  1|  Alice|  Engineer|  5000|2023-01-10|
|  2|    Bob|   Unknown|     0|2023-01-15|
|  3|Unknown|  Designer|  4000|      NULL|
|  4|  David|  Engineer|     0|2023-02-01|
|  5|Unknown|   Unknown|     0|      NULL|
+---+-------+----------+------+----------+



In [22]:
# Fill with the mean value.
# collect() returns a list of Row objects, not a bare number,
# so the scalar has to be extracted by index with [0][0].
df.select(F.mean("salary")).collect()

[Row(avg(salary)=4500.0)]

In [27]:
# First [0] takes the single Row from the collected list, second [0] takes its
# only field, leaving the mean as a plain Python float.
mean_val=df.select(F.mean("salary")).collect()[0][0]

In [28]:
# 왜 굳이 collect()를 할까?
# fill()은 컬럼 표현식이 아니라 이미 계산이 끝난 파이썬 값(int, float, str, bool)만 받기 때문이다

In [32]:
# fill() returns a new DataFrame; with no assignment and no action such as
# show(), the cell only displays the DataFrame's repr and df is unchanged.
df.na.fill(mean_val,subset=["salary"])

DataFrame[id: int, name: string, occupation: string, salary: int, date: date]

In [33]:
# Rebind df to the filled result: from this point on df holds the
# mean-imputed salaries, not the original NULLs.
# NOTE(review): the mean is a float (4500.0) written into an integer column;
# a non-integral mean would presumably lose its fraction - verify.
df=df.na.fill(mean_val,subset=["salary"])

In [34]:
# Salary no longer contains NULLs; name, occupation and date still do.
df.show()

+---+-----+----------+------+----------+
| id| name|occupation|salary|      date|
+---+-----+----------+------+----------+
|  1|Alice|  Engineer|  5000|2023-01-10|
|  2|  Bob|      NULL|  4500|2023-01-15|
|  3| NULL|  Designer|  4000|      NULL|
|  4|David|  Engineer|  4500|2023-02-01|
|  5| NULL|      NULL|  4500|      NULL|
+---+-----+----------+------+----------+



In [42]:
# Spark 3.4 미만에는 중앙값과 최빈값을 바로 구하는 함수가 없다 (3.4 이상에는 F.median, F.mode가 추가되었다)
# 중앙값 -> 근사치 함수(percentile_approx)를 사용한다
# 최빈값 : groupBy + count로 직접 구해서 찾아야 한다

In [37]:
# Approximate median (50th percentile) of salary.
# NOTE(review): df was rebound to the mean-filled DataFrame in an earlier
# cell, so this median is computed over the imputed salaries as well; compute
# it before filling if the median of the observed values is what is wanted.
median_val = df.select(
    F.percentile_approx("salary", 0.5)
).collect()[0][0]

median_val

4500

In [38]:
# Occupation per user, in user_id order (ids start at 1); None marks a
# missing occupation.
_occupations = [
    "Engineer",
    "Engineer",
    "Data Scientist",
    None,
    "Engineer",
    None,
    "Designer",
    "Engineer",
    None,
    "Designer",
]

# Pair each occupation with its 1-based user id -> list of (user_id, occupation).
data = list(enumerate(_occupations, start=1))

# Only column names are given, so the column types are inferred from the data.
# Note that this rebinds df to the new two-column DataFrame.
df = spark.createDataFrame(data, schema=["user_id", "occupation"])
df.show()

+-------+--------------+
|user_id|    occupation|
+-------+--------------+
|      1|      Engineer|
|      2|      Engineer|
|      3|Data Scientist|
|      4|          NULL|
|      5|      Engineer|
|      6|          NULL|
|      7|      Designer|
|      8|      Engineer|
|      9|          NULL|
|     10|      Designer|
+-------+--------------+



In [39]:
# Mode (most frequent value) of "occupation".
# NULLs are excluded first: groupBy() treats NULL as a group of its own, so
# if missing values were the largest group the "mode" would come back as None
# and could not be used as a fill value.
# If the column has no non-NULL values at all, first() returns None.
mode_row = (
    df
    .filter(F.col("occupation").isNotNull())
    .groupBy("occupation")
    .count()
    # The secondary sort key keeps the result deterministic when counts tie.
    .orderBy(F.desc("count"), F.asc("occupation"))
    .first()
)

mode_row

Row(occupation='Engineer', count=4)

In [40]:
# Pull the occupation string out of the Row; fields are also readable as attributes.
mode_val = mode_row.occupation
mode_val

'Engineer'

In [41]:
# Replace missing occupations with the mode, using the column -> value mapping
# form of fill().
df_filled = df.na.fill({"occupation": mode_val})
df_filled.show()

+-------+--------------+
|user_id|    occupation|
+-------+--------------+
|      1|      Engineer|
|      2|      Engineer|
|      3|Data Scientist|
|      4|      Engineer|
|      5|      Engineer|
|      6|      Engineer|
|      7|      Designer|
|      8|      Engineer|
|      9|      Engineer|
|     10|      Designer|
+-------+--------------+



In [43]:
# Shut the session down once the notebook is finished.
spark.stop()