# DataFrame Filter Operation

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. The eagerEval option
# lets a bare DataFrame expression at the end of a cell render as a table.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)
spark

In [2]:
# Source file and its format
file_type = "csv"
file_location = "datasets/titanic.csv"

# CSV reader settings
delimiter = ","
first_row_is_header = "true"
infer_schema = "true"

# These options are meant for CSV input; for other file types they are ignored.
df = (
    spark.read.format(file_type)
    .option("sep", delimiter)
    .option("header", first_row_is_header)
    .option("inferSchema", infer_schema)
    .load(file_location)
)

# Preview the first five rows
df.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S


In [3]:
## Keep only passengers whose Age is below 30 (condition given as a SQL string)
under_30_sql = df.filter("Age < 30")
under_30_sql.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| null|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|          237736|30.0708| null|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 5 rows

In [4]:
## Another way to write it: a Column expression instead of a SQL string
is_under_30 = df["Age"] < 30
df.filter(is_under_30).show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| null|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|          237736|30.0708| null|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 5 rows

In [5]:
## Filter by age, then keep only specific columns
(
    df.filter("Age < 30")
    .select("Pclass", "Name", "Age")
    .show(5)
)

+------+--------------------+----+
|Pclass|                Name| Age|
+------+--------------------+----+
|     3|Braund, Mr. Owen ...|22.0|
|     3|Heikkinen, Miss. ...|26.0|
|     3|Palsson, Master. ...| 2.0|
|     3|Johnson, Mrs. Osc...|27.0|
|     2|Nasser, Mrs. Nich...|14.0|
+------+--------------------+----+
only showing top 5 rows



In [6]:
## Combining conditions with | (OR) and & (AND):
## male passengers who are younger than 20 or older than 60
age_below_20_or_above_60 = (df['Age'] < 20) | (df['Age'] > 60)
is_male = df['Sex'] == 'male'
df.filter(age_below_20_or_above_60 & is_male).show(5)

+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----------+--------+
|PassengerId|Survived|Pclass|                Name| Sex| Age|SibSp|Parch|    Ticket|   Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----------+--------+
|          8|       0|     3|Palsson, Master. ...|male| 2.0|    3|    1|    349909| 21.075|       null|       S|
|         17|       0|     3|Rice, Master. Eugene|male| 2.0|    4|    1|    382652| 29.125|       null|       Q|
|         28|       0|     1|Fortune, Mr. Char...|male|19.0|    3|    2|     19950|  263.0|C23 C25 C27|       S|
|         34|       0|     2|Wheadon, Mr. Edwa...|male|66.0|    0|    0|C.A. 24579|   10.5|       null|       S|
|         51|       0|     3|Panula, Master. J...|male| 7.0|    4|    1|   3101295|39.6875|       null|       S|
+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----------+--------+
only showing top 5 rows

In [7]:
## NOT condition: negate a Column predicate with ~
younger_than_60 = df['Age'] < 60
df.filter(~younger_than_60).show(5)

+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name| Sex| Age|SibSp|Parch|    Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----+--------+
|         34|       0|     2|Wheadon, Mr. Edwa...|male|66.0|    0|    0|C.A. 24579|   10.5| null|       S|
|         55|       0|     1|Ostby, Mr. Engelh...|male|65.0|    0|    1|    113509|61.9792|  B30|       C|
|         97|       0|     1|Goldschmidt, Mr. ...|male|71.0|    0|    0|  PC 17754|34.6542|   A5|       C|
|        117|       0|     3|Connors, Mr. Patrick|male|70.5|    0|    0|    370369|   7.75| null|       Q|
|        171|       0|     1|Van der hoef, Mr....|male|61.0|    0|    0|    111240|   33.5|  B19|       S|
+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----+--------+
only showing top 5 rows



## GroupBy 와 Aggregation Function

In [8]:
# Average age per sex. Passing the column name to mean() aggregates only Age,
# so the result already has exactly the columns Sex and avg(Age); there is no
# need to average every numeric column and then pick 'avg(Age)' out of the
# result by its auto-generated name.
df.groupBy('Sex').mean('Age').show()

+------+------------------+
|   Sex|          avg(Age)|
+------+------------------+
|female|27.915708812260537|
|  male| 30.72664459161148|
+------+------------------+



In [9]:
# Average age by sex: project to the two columns first, then aggregate
df.select('Sex', 'Age').groupBy('Sex').mean().show()

+------+------------------+
|   Sex|          avg(Age)|
+------+------------------+
|female|27.915708812260537|
|  male| 30.72664459161148|
+------+------------------+



In [10]:
# Maximum age by sex (the age of the oldest passenger in each group)
df.select('Sex', 'Age').groupBy('Sex').max().show()

+------+--------+
|   Sex|max(Age)|
+------+--------+
|female|    63.0|
|  male|    80.0|
+------+--------+



In [11]:
## Number of passengers per port of embarkation
passengers_per_port = df.groupBy('Embarked').count()
passengers_per_port.show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|    null|    2|
|       C|  168|
|       S|  644|
+--------+-----+



In [12]:
## Using agg directly on the DataFrame (the group by is implicit)
# Maps each column name to the aggregate function applied to it.
aggregations = {"Age": "mean", "Fare": "max", "SibSp": "max"}
df.agg(aggregations).show()

+----------+-----------------+---------+
|max(SibSp)|         avg(Age)|max(Fare)|
+----------+-----------------+---------+
|         8|29.69911764705882| 512.3292|
+----------+-----------------+---------+

