# PySpark DataFrame

- Colab 에서 실행

In [1]:
from pyspark.sql import SparkSession

# SparkSession is the entry point of a PySpark application: it is used to
# create DataFrames, read/write data, and run SQL.
# Eager evaluation is switched on so a DataFrame that ends a cell is rendered
# as a table.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

spark

### 데이터셋 읽기

In [4]:
from google.colab import drive

# Mount Google Drive at /content/drive so the notebook can read files from it.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
 # Titanic 데이터셋 CSV 파일 경로
file_location = "/content/drive/MyDrive/Colab Notebooks/BigData/datasets/titanic.csv"
file_type = "csv"  # read the file with the CSV data source

# CSV reader options.
infer_schema = "true"  # infer column types from the data
first_row_is_header = "true"  # treat the first row as the header
delimiter = ","  # fields are separated by commas

# Build the DataFrame, supplying all reader options in a single options() call.
df = (
    spark.read.format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
    )
    .load(file_location)
)

# Preview only the first 5 rows.
df.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S


In [7]:
# Print the schema: each column's name, data type, and nullability.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [8]:
# Show the Python class of the df object.
type(df)

In [9]:
# Print the DataFrame's column names as a Python list of strings.
print(df.columns)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [10]:
# Return the first 3 rows as a list of Row objects.
df.head(3)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S')]

In [11]:
# Print the first 3 rows as a text table.
df.show(3)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 3 rows



In [12]:
# Keep only the 'Name' column (referenced as a Column object) and take 5 rows.
df.select(df['Name']).limit(5)

Name
"Braund, Mr. Owen ..."
"Cumings, Mrs. Joh..."
"Heikkinen, Miss. ..."
"Futrelle, Mrs. Ja..."
"Allen, Mr. Willia..."


In [13]:
# Keep the 'Name' and 'Age' columns (passed as separate arguments) and take 5 rows.
df.select('Name', 'Age').limit(5)

Name,Age
"Braund, Mr. Owen ...",22.0
"Cumings, Mrs. Joh...",38.0
"Heikkinen, Miss. ...",26.0
"Futrelle, Mrs. Ja...",35.0
"Allen, Mr. Willia...",35.0


In [14]:
# Return every column name with its data type as a list of (name, type) tuples.
df.dtypes

[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'double'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

In [15]:
# Compute summary statistics (count, mean, stddev, min, max) for the columns.
# String columns are summarized too, not only the numeric ones.
df.describe()

summary,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
mean,446.0,0.3838383838383838,2.308641975308642,,,29.69911764705882,0.5230078563411896,0.3815937149270482,260318.54916792738,32.2042079685746,,
stddev,257.3538420152301,0.4865924542648575,0.8360712409770491,,,14.526497332334037,1.1027434322934315,0.8060572211299488,471609.26868834975,49.69342859718089,,
min,1.0,0.0,1.0,"""Andersson, Mr. A...",female,0.42,0.0,0.0,110152,0.0,A10,C
max,891.0,1.0,3.0,"van Melkebeke, Mr...",male,80.0,8.0,6.0,WE/P 5735,512.3292,T,S


In [16]:
# Append a boolean column 'Age Over 30' holding the result of Age > 30.
# withColumn returns a new DataFrame, so the result is assigned back to df.
df = df.withColumn('Age Over 30', df.Age > 30)

df.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Over 30
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S,False
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C,True
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S,False
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S,True
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S,True


In [17]:
# Drop the 'Age Over 30' column; drop() returns a new DataFrame, so reassign df.
df = df.drop("Age Over 30")
df.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S


In [18]:
# Rename the 'Embarked' column to 'Port' and reassign the resulting DataFrame.
df = df.withColumnRenamed('Embarked', 'Port')
df.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S


### Missing Value 처리

In [19]:
# Spark's sum is imported under an alias so that Python's builtin sum() is not
# shadowed for the rest of the notebook.
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as spark_sum

# Count the null values in every column: isNull() yields a boolean, which is
# cast to int (1 for null, 0 otherwise) and summed; the result keeps the
# original column name via alias().
df.select([spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port
0,0,0,0,0,177,0,0,0,0,687,2


In [20]:
from pyspark.ml.feature import Imputer

# Imputer that replaces missing Age values with the column mean and writes the
# result to a separate 'Age_imputed' column.
imputer = Imputer(
    strategy='mean',
    inputCols=['Age'],
    outputCols=['Age_imputed'],
)

In [21]:
# Fit the imputer on df, then apply the fitted model to df, which adds the
# 'Age_imputed' output column.
df = imputer.fit(df).transform(df)

In [27]:
# Preview the first 5 rows, now including the 'Age_imputed' column.
df.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port,Age_imputed
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S,22.0
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S,35.0
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S,35.0


## Filtering

- SQL 스타일의 문자열 표현식으로 조건을 지정합니다.
  `"Age < 30"`은 SQL 쿼리의 WHERE 절처럼 동작합니다.

In [30]:
# Keep only passengers whose Age is below 30, using a SQL-style string condition.
df.filter("Age < 30").limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port,Age_imputed
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S,22.0
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0
8,0,3,"Palsson, Master. ...",male,2.0,3,1,349909,21.075,,S,2.0
9,1,3,"Johnson, Mrs. Osc...",female,27.0,0,2,347742,11.1333,,S,27.0
10,1,2,"Nasser, Mrs. Nich...",female,14.0,1,0,237736,30.0708,,C,14.0


- PySpark의 DataFrame API를 사용하여 조건을 지정합니다.
  `df["Age"] < 30`은 PySpark의 컬럼(Column) 객체를 사용해 조건을 작성합니다.

In [31]:
# Same Age < 30 filter, written as a Column expression instead of a SQL string.
df.filter(df["Age"] < 30).limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port,Age_imputed
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S,22.0
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0
8,0,3,"Palsson, Master. ...",male,2.0,3,1,349909,21.075,,S,2.0
9,1,3,"Johnson, Mrs. Osc...",female,27.0,0,2,347742,11.1333,,S,27.0
10,1,2,"Nasser, Mrs. Nich...",female,14.0,1,0,237736,30.0708,,C,14.0


In [33]:
# Filter to Age < 30, then keep only the Pclass, Name, and Age columns.
(
    df.filter("Age < 30")
    .select("Pclass", "Name", "Age")
    .limit(5)
)

Pclass,Name,Age
3,"Braund, Mr. Owen ...",22.0
3,"Heikkinen, Miss. ...",26.0
3,"Palsson, Master. ...",2.0
3,"Johnson, Mrs. Osc...",27.0
2,"Nasser, Mrs. Nich...",14.0


In [34]:
# Combine conditions with | (OR) and & (AND):
# (Age below 20 OR Age above 60) AND Sex is male.
# Each comparison is parenthesized because | and & bind tighter than < and ==.
df.filter(
    (
        (df['Age'] < 20)
        | (df['Age'] > 60)
    )
    & (df['Sex'] == 'male')
).limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port,Age_imputed
8,0,3,"Palsson, Master. ...",male,2.0,3,1,349909,21.075,,S,2.0
17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q,2.0
28,0,1,"Fortune, Mr. Char...",male,19.0,3,2,19950,263.0,C23 C25 C27,S,19.0
34,0,2,"Wheadon, Mr. Edwa...",male,66.0,0,0,C.A. 24579,10.5,,S,66.0
51,0,3,"Panula, Master. J...",male,7.0,4,1,3101295,39.6875,,S,7.0


In [35]:
# NOT condition: ~ negates the Column expression, keeping rows where
# "Age < 60" does not hold.
df.filter(~(df['Age'] < 60)).limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port,Age_imputed
34,0,2,"Wheadon, Mr. Edwa...",male,66.0,0,0,C.A. 24579,10.5,,S,66.0
55,0,1,"Ostby, Mr. Engelh...",male,65.0,0,1,113509,61.9792,B30,C,65.0
97,0,1,"Goldschmidt, Mr. ...",male,71.0,0,0,PC 17754,34.6542,A5,C,71.0
117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q,70.5
171,0,1,"Van der hoef, Mr....",male,61.0,0,0,111240,33.5,B19,S,61.0


## GroupBy 와 Aggregation Function (집계 함수)

In [39]:
# Group by 'Sex' and average every numeric column within each group,
# then keep only the group key and the average of Age.
(
    df.groupBy('Sex')
    .mean()
    .select('Sex', 'avg(Age)')
)

Sex,avg(Age)
female,27.915708812260537
male,30.72664459161148


In [40]:
# Narrow the DataFrame to 'Sex' and 'Age' first, then group by 'Sex' and
# take the mean of the remaining numeric column.
df.select('Sex', 'Age').groupBy('Sex').mean()

Sex,avg(Age)
female,27.915708812260537
male,30.72664459161148


In [42]:
# Maximum age per sex: narrow to 'Sex' and 'Age', group by 'Sex', take the max.
df.select('Sex', 'Age').groupBy('Sex').max()

Sex,max(Age)
female,63.0
male,80.0


In [46]:
# Number of passengers per port of embarkation; rows with a missing Port
# are counted as their own group.
df.groupBy('Port').count()

Port,count
Q,77
,2
C,168
S,644


- PySpark DataFrame의 여러 컬럼에 대해 집계(aggregation) 연산을 동시에 수행

In [47]:
# Aggregate over the whole DataFrame (no grouping key) with agg().
# The dict maps column name -> aggregate function name; result columns are
# named like "avg(Age)" and may not appear in the dict's order.
df.agg({"Age": "mean", "Fare": "max", "SibSp": "max"})

max(SibSp),avg(Age),max(Fare)
8,29.69911764705882,512.3292
