In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!apt-get install openjdk-8-jdk-headless
!wget -q https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar -zxf spark-3.5.1-bin-hadoop3.tgz

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

!pip install findspark -q

import findspark
findspark.init()

import pyspark
spark_version = pyspark.__version__
print("Apache Spark 버전 확인: " + spark_version)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 39.7 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxtst6 amd64 2:1.2.3-1build4 [13.4 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-8-jre-headless amd64 8u402-ga-2ubuntu1~22.04 [30.8 MB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-8-jdk-headless amd64 8u402-ga-2ubuntu1~22.04 [8,873 kB]

## Spark 세션 설정

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').appName('ML').getOrCreate()
spark

# 데이터 불러오기

In [4]:
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/data/"

flights = spark.read.option('header', 'true').csv(DATA_PATH + 'flight_small.csv')
flights.show(1)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
only showing top 1 row



In [5]:
planes = spark.read.option('header', 'true').csv(DATA_PATH + 'planes.csv')
planes.show(1)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
only showing top 1 row



## 데이터 가공

### 1단계
- planes의 year ==> plane_year
- 두 개의 테이블을 join하기
- flights 기준으로 left join


In [8]:
planes = planes.withColumnRenamed('year', 'plane_year')

model_data = flights.join(planes, on = 'tailnum', how = 'leftouter')
model_data.show(1)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+
| N846VA|2014|   12|  8|     658|       -7|     935|       -5|     VX|  1780|   SEA| LAX|     132|     954|   6|    58|      2011|Fixed wing multi ...|      AIRBUS|A320-214|      2|  182|   NA|Turbo-fan|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+---

- arr_delay, air_time, month, plane_year, integer로 변경

In [10]:
cols = ['arr_delay', 'air_time', 'month', 'plane_year']

for col in cols:
  model_data = model_data.withColumn(col, model_data.distance.cast('int'))

In [11]:
model_data.printSchema()

root
 |-- tailnum: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: string (nullable = true)
 |-- dep_time: string (nullable = true)
 |-- dep_delay: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- plane_year: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- engines: string (nullable = true)
 |-- seats: string (nullable = true)
 |-- speed: string (nullable = true)
 |-- engine: string (nullable = true)



- 새로운 컬럼 작성
  + plane_age라는 컬럼을 생성함

In [12]:
model_data = model_data.withColumn('plane_age', model_data.year-model_data.plane_year)
model_data.show(10)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|plane_age|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+
| N846VA|2014|  954|  8|     658|       -7|     935|      954|     VX|  1780|   SEA| LAX|     954|     954|   6|    58|       954|Fixed wing multi ...|      AIRBUS|A320-214|      2|  182|   NA|Turbo-fan|   1060.0|
| N559AS|2014| 2677| 22|    1040|        5|    1505|     2677|     AS|   851|   SEA| HNL|    2677|    2677|  10|    40|      2677|Fixed wing mul

- 컬럼명 is_late : 도착시간(arr_delay)을 기준으로 해서 양수가 나오면 늦음. True
- withColumn() 내부에서 연산

In [16]:
import pyspark.sql.functions as F
model_data = model_data.withColumn('is_late', F.when(model_data.arr_delay > 0, True).otherwise(False))
model_data.show()

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+--------------+-----------+-------+-----+-----+---------+---------+-------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|  manufacturer|      model|engines|seats|speed|   engine|plane_age|is_late|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+--------------+-----------+-------+-----+-----+---------+---------+-------+
| N846VA|2014|  954|  8|     658|       -7|     935|      954|     VX|  1780|   SEA| LAX|     954|     954|   6|    58|       954|Fixed wing multi ...|        AIRBUS|   A320-214|      2|  182|   NA|Turbo-fan|   1060.0|   true|
| N559AS|2014| 2677| 22|    1040|        5|    1505|     2677|     AS|   851|   SEA| HNL|   

In [17]:
#label column 생성 from is_late ==> Integer
model_data = model_data.withColumn("label", model_data.is_late.cast("integer"))
model_data.show(2)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+-------+-----+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|plane_age|is_late|label|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+-------+-----+
| N846VA|2014|  954|  8|     658|       -7|     935|      954|     VX|  1780|   SEA| LAX|     954|     954|   6|    58|       954|Fixed wing multi ...|      AIRBUS|A320-214|      2|  182|   NA|Turbo-fan|   1060.0|   true|    1|
| N559AS|2014| 2677| 22|    1040|        5|    1505|     2677|     AS|   851|   SEA| HNL

In [19]:
# 결측치가 있는 컬럼 제거
model_data = model_data.filter("arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL")

# FE
- VectorAssembler
 + Pipeline 구축
 + StringIndexer
 + OneHotEncoder
- 모든 input data는 숫자여야 해서, 모두 변경을 해야 함
- Spark ML 클래스가 요구하는 데이터 양식에 맞춰야 한다.


In [24]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

carr_indexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")
carr_encoder = OneHotEncoder(inputCol="carrier_index", outputCol="carrier_fact")
dest_indexer = StringIndexer(inputCol="dest", outputCol="dest_index")
dest_encoder = OneHotEncoder(inputCol="dest_index", outputCol="dest_fact")
vec_assembler = VectorAssembler(inputCols=["month", "air_time", "carrier_fact", "dest_fact", "plane_age"], outputCol="features")

flights_pipe = Pipeline(stages=[dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler])
piped_data = flights_pipe.fit(model_data).transform(model_data)
piped_data.show(1)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+-------+-----+----------+--------------+-------------+--------------+--------------------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|plane_age|is_late|label|dest_index|     dest_fact|carrier_index|  carrier_fact|            features|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+-------+-----+----------+--------------+-------------+--------------+--------------------+
| N846VA|2014|  954|  8|     658|       -7|     935|      954|     VX|  1780|   SEA| LAX

In [25]:
training, test = piped_data.randomSplit([0.6, 0.4])

모델 생성

In [26]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()

교차검증(Grid Search)

In [27]:
import pyspark.ml.tuning as tune
import numpy as np

grid = tune.ParamGridBuilder()

# 파라미터 튜닝 인수 추가
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

grid = grid.build()

- 교차검증 : 평가지표가 필요

In [29]:
import pyspark.ml.evaluation as evals
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")

In [30]:
cv = tune.CrossValidator(estimator = lr,
                         estimatorParamMaps = grid,
                         evaluator = evaluator)

cv

CrossValidator_5916adefe12b

In [31]:
final_lr = cv.fit(training)

In [32]:
print(final_lr)

CrossValidatorModel_1e1f581b860b


In [33]:
test_results = final_lr.transform(test)
test_results.show(1)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+----+------------+-----+-------+-----+-----+------+---------+-------+-----+----------+---------------+-------------+--------------+--------------------+--------------------+-----------+----------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|type|manufacturer|model|engines|seats|speed|engine|plane_age|is_late|label|dest_index|      dest_fact|carrier_index|  carrier_fact|            features|       rawPrediction|probability|prediction|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+----+------------+-----+-------+-----+-----+------+---------+-------+-----+----------+---------------+-------------+--------------+--------------------+--------------------+-----------+----------+
| D942DN|2014| 1426

In [34]:
evaluator.evaluate(test_results)

1.0