# 실습:  PySpark Basic

### 1) 기본 Spark 작업

In [28]:
# PySpark 모듈 및 필요한 함수들 임포트
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# SparkSession 생성
# - builder: SparkSession을 구성하는 빌더 객체를 반환
# - appName("SparkAPP"): 애플리케이션 이름 설정
# - getOrCreate(): 기존 SparkSession이 없으면 생성, 있으면 반환
spark = (SparkSession
         .builder
         .appName("SparkAPP")
         .config("spark.sql.repl.eagerEval.enabled", True)
         .getOrCreate())

In [30]:
# CSV 파일 경로 지정
csv_file = "/content/onlinefoods.csv"

# Spark 데이터프레임 생성
# - format("csv"): CSV 형식의 파일을 읽도록 설정
# - option("header", "true"): 첫 번째 줄을 헤더(컬럼 이름)로 사용
# - option("inferSchema", "true"): 데이터 타입을 자동으로 추론
# - load(csv_file): 지정된 경로의 CSV 파일을 로드
df = spark.read \
      .format("csv") \
      .option("header", "true") \
      .option("inferSchema", "true") \
      .load(csv_file)

In [31]:
# 데이터프레임에서 상위 5개 행 조회
df.limit(5)

Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,_c12
20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes
24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes
22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes
22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes
22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive,Yes


In [32]:
# 데이터프레임의 요약 통계 계산
df.describe()

summary,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,_c12
count,388.0,388,388,388,388,388,388.0,388.0,388.0,388.0,388,388,388
mean,24.628865979381445,,,,,,3.2809278350515463,12.972057989690706,77.60015953608251,560040.1134020619,,,
stddev,2.975592660672904,,,,,,1.351024939645313,0.0444892486281059,0.0513539170127217,31.39960871026137,,,
min,18.0,Female,Married,Employee,10001 to 25000,Graduate,1.0,12.8652,77.4842,560001.0,No,Negative,No
max,33.0,Male,Single,Student,No Income,Uneducated,6.0,13.102,77.7582,560109.0,Yes,Positive,Yes


In [33]:
# 데이터프레임의 스키마 출력
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Monthly Income: string (nullable = true)
 |-- Educational Qualifications: string (nullable = true)
 |-- Family size: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Pin code: integer (nullable = true)
 |-- Output: string (nullable = true)
 |-- Feedback: string (nullable = true)
 |-- _c12: string (nullable = true)



In [34]:
# 데이터프레임이 비어있는지 확인
df.isEmpty()

False

In [35]:
# 데이터프레임의 컬럼 이름 리스트 출력
df.columns

['Age',
 'Gender',
 'Marital Status',
 'Occupation',
 'Monthly Income',
 'Educational Qualifications',
 'Family size',
 'latitude',
 'longitude',
 'Pin code',
 'Output',
 'Feedback',
 '_c12']

In [36]:
# 데이터프레임의 전체 행 수 계산
df.count()

388

### 2) 선택, 필터링, 집계

In [37]:
# 특정 컬럼 선택 및 데이터 확인
# - select('컬럼1', '컬럼2'): 지정된 컬럼만 선택하여 새로운 데이터프레임 생성
# - 'Age', 'Monthly Income': 선택할 컬럼 이름
# - show(): 선택된 데이터프레임의 내용을 출력 (기본적으로 상위 20개 행을 표시)
new_df = df.select('Age', 'Monthly Income')
new_df.show(5)

+---+--------------+
|Age|Monthly Income|
+---+--------------+
| 20|     No Income|
| 24|Below Rs.10000|
| 22|Below Rs.10000|
| 22|     No Income|
| 22|Below Rs.10000|
+---+--------------+
only showing top 5 rows



In [38]:
#  데이터프레임의 상위 2개 행을 리스트로 반환
first_two_rows = df.take(2)

# 반환된 행들을 하나씩 순회하며 출력
for row in first_two_rows:
    print(row)

Row(Age=20, Gender='Female', Marital Status='Single', Occupation='Student', Monthly Income='No Income', Educational Qualifications='Post Graduate', Family size=4, latitude=12.9766, longitude=77.5993, Pin code=560001, Output='Yes', Feedback='Positive', _c12='Yes')
Row(Age=24, Gender='Female', Marital Status='Single', Occupation='Student', Monthly Income='Below Rs.10000', Educational Qualifications='Graduate', Family size=3, latitude=12.977, longitude=77.5773, Pin code=560009, Output='Yes', Feedback='Positive', _c12='Yes')


In [39]:
# 데이터프레임에서 첫 번째 행 가져오기
first_row = df.first()

# Row 객체를 딕셔너리로 변환
# - asDict(): Row 객체를 Python 딕셔너리 형태로 변환
row_dict = first_row.asDict()
row_dict

{'Age': 20,
 'Gender': 'Female',
 'Marital Status': 'Single',
 'Occupation': 'Student',
 'Monthly Income': 'No Income',
 'Educational Qualifications': 'Post Graduate',
 'Family size': 4,
 'latitude': 12.9766,
 'longitude': 77.5993,
 'Pin code': 560001,
 'Output': 'Yes',
 'Feedback': 'Positive',
 '_c12': 'Yes'}

In [40]:
# 다음 조건에 따라 새로운 "Post_Graduate_Customer" 컬럼을 데이터프레임에 추가
# "Educational Qualifications" 값이 "Post_Graduate"이면 "Yes" 그 외의 값이면 "No"
df = df.withColumn("Post_Graduate_Customer", when(df["Educational Qualifications"] == "Post_Graduate", "Yes").otherwise("No"))

df.limit(5)

Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,_c12,Post_Graduate_Customer
20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes,No
24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes,No
22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes,No
22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes,No
22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive,Yes,No


In [41]:
# 컬럼 이름 변경: "Occupation" -> "Job Status"
# - withColumnRenamed("기존 컬럼명", "새 컬럼명"): 지정한 기존 컬럼의 이름을 새로운 이름으로 변경
df = df.withColumnRenamed("Occupation", "Job Status")

df.limit(5)

Age,Gender,Marital Status,Job Status,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,_c12,Post_Graduate_Customer
20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes,No
24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes,No
22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes,No
22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes,No
22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive,Yes,No


In [42]:
# 특정 컬럼 삭제: "Post_Graduate_Customer"
df = df.drop("Post_Graduate_Customer")
df.limit(3)

Age,Gender,Marital Status,Job Status,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,_c12
20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes
24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes
22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes


In [43]:
# 데이터프레임에서 중복된 행을 제거하고 고유한 행들만 포함한 새로운 데이터프레임 반환
distinct_df = df.distinct()
distinct_df.limit(5)

Age,Gender,Marital Status,Job Status,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,_c12
21,Male,Single,Student,No Income,Post Graduate,4,12.977,77.5773,560009,Yes,Positive,Yes
31,Male,Married,Employee,More than 50000,Ph.D,5,12.9119,77.6446,560102,Yes,Positive,Yes
32,Male,Married,Employee,More than 50000,Ph.D,5,12.9635,77.5821,560002,Yes,Negative,Yes
25,Male,Single,Student,No Income,Post Graduate,6,13.0012,77.5995,560046,Yes,Positive,Yes
21,Female,Single,Employee,Below Rs.10000,Graduate,2,12.9925,77.5633,560021,No,Negative,No


In [47]:
# 데이터프레임의 각 열의 데이터 유형 출력
print(distinct_df.dtypes)

[('Age', 'int'), ('Gender', 'string'), ('Marital Status', 'string'), ('Job Status', 'string'), ('Monthly Income', 'string'), ('Educational Qualifications', 'string'), ('Family size', 'int'), ('latitude', 'double'), ('longitude', 'double'), ('Pin code', 'int'), ('Output', 'string'), ('Feedback', 'string'), ('_c12', 'string')]


In [48]:
# 나이가 28보다 크고 결혼 상태가 "Single"인 데이터를 필터링합니다.
filtered_df = distinct_df.filter((col("Age") > 28) & (col("Marital Status") == "Single"))

filtered_df.limit(5)

Age,Gender,Marital Status,Job Status,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,_c12
29,Male,Single,Self Employeed,More than 50000,Graduate,6,12.8845,77.6036,560076,Yes,Positive,Yes
29,Male,Single,Employee,25001 to 50000,Graduate,3,13.0641,77.5931,560092,No,Negative,No


In [49]:
# PySpark 데이터프레임의 모든 데이터를 수집하여 Python 리스트로 반환
collected_data = df.collect()

collected_data

[Row(Name='Alice', Age=34, Marital Status='Single'),
 Row(Name='Bob', Age=28, Marital Status='Married'),
 Row(Name='Charlie', Age=31, Marital Status='Divorced')]