In [13]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql import functions as F
import pandas as pd
import numpy as np

spark session 만들기

In [2]:
# Obtain the SparkSession used by every cell below: an already-running session
# is reused if there is one, otherwise a new one is built with default settings.
sp = SparkSession.builder.getOrCreate()

In [3]:
# Small sample table of players; column types are inferred from the Python values.
data = sp.createDataFrame(
    [
        ('MESSI', 'ARG', 10),
        ('RONALDO', 'POR', 7),
        ('NEYMAR', 'BRA', 10),
        ('DE BRUYNE', 'BEL', 17),
    ],
    schema=['NAME', 'NATION', 'NUM'],
)

In [4]:
# Print the DataFrame as a text table to check what was just created.
data.show()

+---------+------+---+
|     NAME|NATION|NUM|
+---------+------+---+
|    MESSI|   ARG| 10|
|  RONALDO|   POR|  7|
|   NEYMAR|   BRA| 10|
|DE BRUYNE|   BEL| 17|
+---------+------+---+



필터링 방법 중에는 where method를 사용하는 방법도 있다.

where method를 붙이고 괄호 안에 필요한 조건절을 작성하면 된다.

이때 여러 조건을 결합하려면 Python의 `and` / `or` 대신 아래 연산자를 사용해야 하며, 연산자 우선순위 때문에 각 조건은 반드시 괄호로 감싸야 한다.

- AND --> `&`
- OR --> `|`

In [8]:
# Build each predicate on its own, then combine them with `&` (AND on Columns).
# A column can be referenced either as an attribute of the frame or via `col`.
is_number_ten = data.NUM == 10
is_brazilian = col('NATION') == 'BRA'
data.where(is_number_ten & is_brazilian).show()

+------+------+---+
|  NAME|NATION|NUM|
+------+------+---+
|NEYMAR|   BRA| 10|
+------+------+---+



In [35]:
# Replace every uppercase ASCII letter in NATION with '@'.
# With no alias, the output column is named after the generated expression.
masked_nation = regexp_replace(col('NATION'), '[A-Z]', '@')
data.select(masked_nation).show()

+-----------------------------------+
|regexp_replace(NATION, [A-Z], @, 1)|
+-----------------------------------+
|                                @@@|
|                                @@@|
|                                @@@|
|                                @@@|
+-----------------------------------+



자동으로 붙은 column 이름이 너무 길기 때문에, alias를 써서 결과 column에 읽기 쉬운 이름을 붙여 보자.

In [36]:
# Same masking expression, but `alias` gives the result column a readable name.
masking = regexp_replace(col('NATION'), '[A-Z]', '@').alias('MASKING')
data.select(masking).show()

+-------+
|MASKING|
+-------+
|    @@@|
|    @@@|
|    @@@|
|    @@@|
+-------+



withColumn을 사용해서 기존 DataFrame에 아예 column을 새로 만들어 보자.

In [39]:
# Append the masked value as a new MASKING column and rebind `data` to the result.
masked_nation = regexp_replace(col('NATION'), '[A-Z]', '@')
data = data.withColumn('MASKING', masked_nation)
data.show()

+---------+------+---+-------+
|     NAME|NATION|NUM|MASKING|
+---------+------+---+-------+
|    MESSI|   ARG| 10|    @@@|
|  RONALDO|   POR|  7|    @@@|
|   NEYMAR|   BRA| 10|    @@@|
|DE BRUYNE|   BEL| 17|    @@@|
+---------+------+---+-------+



### 사용자 정의 함수(User Defined Functions : UDF)

In [42]:
from pyspark.sql.functions import udf

In [43]:
def my_func(x):
    """Return ``x`` cubed.

    This function is wrapped as a Spark UDF, where a null cell arrives as
    ``None``. ``None`` is passed through so that a null input yields a null
    output instead of raising ``TypeError``.

    Args:
        x: A number, or ``None``.

    Returns:
        ``x ** 3``, or ``None`` when ``x`` is ``None``.
    """
    if x is None:
        return None
    return x ** 3

In [44]:
# Declare the return type explicitly: `udf` defaults to StringType, which would
# store the cubed numbers as strings instead of a numeric column.
udf_myfunc = udf(my_func, returnType='bigint')

In [45]:
# Apply the UDF to NUM and store the result under the column name 'Num**3'.
num_cubed = udf_myfunc(col('NUM'))
data = data.withColumn('Num**3', num_cubed)

In [46]:
# Show the frame again to confirm the UDF result column was added.
data.show()

+---------+------+---+-------+------+
|     NAME|NATION|NUM|MASKING|Num**3|
+---------+------+---+-------+------+
|    MESSI|   ARG| 10|    @@@|  1000|
|  RONALDO|   POR|  7|    @@@|   343|
|   NEYMAR|   BRA| 10|    @@@|  1000|
|DE BRUYNE|   BEL| 17|    @@@|  4913|
+---------+------+---+-------+------+



In [94]:
# Fetch the session again. When the notebook is run top to bottom this returns
# the session created earlier, so the cell only matters when resuming from here.
sp = SparkSession.builder.getOrCreate()

In [108]:
# New sample table (replaces the player table bound to `data` above):
# one list of relevance scores per customer number and date.
data = sp.createDataFrame(
    [
        ('A300', [8, 5, 3, 9], '0213'),
        ('A300', [3, 1, 6, 4], '0214'),
    ],
    schema=['CSNO', 'RELEVANCE', 'DATE'],
)

In [109]:
# Print the new table; RELEVANCE holds a list of integers per row.
data.show()

+----+------------+----+
|CSNO|   RELEVANCE|DATE|
+----+------------+----+
|A300|[8, 5, 3, 9]|0213|
|A300|[3, 1, 6, 4]|0214|
+----+------------+----+



In [110]:
def get_ideal(x):
    """Return a new list with the elements of ``x`` in ascending order.

    The input itself is left untouched.

    NOTE(review): if this is meant to be the ideal ranking for a DCG-style
    metric, descending order would normally be expected; confirm the intent.
    """
    ordered = list(x)
    ordered.sort()
    return ordered

In [111]:
# Declare the return type explicitly: `udf` defaults to StringType, which would
# store the sorted list as a string rather than as an array of integers.
udf_sort = udf(get_ideal, returnType='array<bigint>')

In [112]:
# Sort each RELEVANCE list through the UDF and keep the result as IDEAL.
ideal_order = udf_sort(col('RELEVANCE'))
data = data.withColumn('IDEAL', ideal_order)

In [113]:
# Show the frame to compare each RELEVANCE list with its sorted IDEAL version.
data.show()

+----+------------+----+------------+
|CSNO|   RELEVANCE|DATE|       IDEAL|
+----+------------+----+------------+
|A300|[8, 5, 3, 9]|0213|[3, 5, 8, 9]|
|A300|[3, 1, 6, 4]|0214|[1, 3, 4, 6]|
+----+------------+----+------------+



In [114]:
def get_CDG(x):
    """Return the sum of all values in ``x`` (0 for an empty input).

    NOTE(review): the name suggests a gain metric, but no position discount
    is applied here; this is a plain sum. Confirm that is the intent.
    """
    return sum(x)

In [115]:
# Declare the return type explicitly: `udf` defaults to StringType, which would
# store the sum as a string. RELEVANCE holds integers here, so the sum is a bigint.
udf_cdg = udf(get_CDG, returnType='bigint')

In [116]:
# Sum each RELEVANCE list through the UDF and keep the result as CDG.
relevance_total = udf_cdg(col('RELEVANCE'))
data = data.withColumn('CDG', relevance_total)

In [117]:
# Show the frame to check the per-row sum in the new CDG column.
data.show()

+----+------------+----+------------+---+
|CSNO|   RELEVANCE|DATE|       IDEAL|CDG|
+----+------------+----+------------+---+
|A300|[8, 5, 3, 9]|0213|[3, 5, 8, 9]| 25|
|A300|[3, 1, 6, 4]|0214|[1, 3, 4, 6]| 14|
+----+------------+----+------------+---+



### lit (literal 값으로 column 생성)

In [118]:
from pyspark.sql.functions import lit

In [121]:
import datetime

In [127]:
# Format today's date once on the driver as YYYYMMDD, then attach that same
# string to every row as a literal column (shown only, `data` is not rebound).
today_yyyymmdd = datetime.datetime.now().strftime('%Y%m%d')
data.withColumn('DATETIME', lit(today_yyyymmdd)).show()

+----+------------+----+------------+---+--------+
|CSNO|   RELEVANCE|DATE|       IDEAL|CDG|DATETIME|
+----+------------+----+------------+---+--------+
|A300|[8, 5, 3, 9]|0213|[3, 5, 8, 9]| 25|20220213|
|A300|[3, 1, 6, 4]|0214|[1, 3, 4, 6]| 14|20220213|
+----+------------+----+------------+---+--------+



In [129]:
# `lit` wraps a Python constant as a Column, so every row gets the same value.
constant_index = lit(2)
data.withColumn('INDEX', constant_index).show()

+----+------------+----+------------+---+-----+
|CSNO|   RELEVANCE|DATE|       IDEAL|CDG|INDEX|
+----+------------+----+------------+---+-----+
|A300|[8, 5, 3, 9]|0213|[3, 5, 8, 9]| 25|    2|
|A300|[3, 1, 6, 4]|0214|[1, 3, 4, 6]| 14|    2|
+----+------------+----+------------+---+-----+

