## Spark 세션 생성

In [1]:
from pyspark.sql import SparkSession 
# Start a Spark session on the local machine, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('ml')
    .getOrCreate()
)
spark

## 라이브러리 불러오기

In [2]:
import numpy as np
import pandas as pd 
import matplotlib as mpl
import seaborn as sns 
import matplotlib.pyplot as plt

# Print the installed NumPy version so the environment of this run is recorded.
print(np.__version__)

1.23.4


## 데이터 불러오기

In [3]:
# Explicit schema for flights.csv, written as a DDL string. The column names
# and types are the ones schema inference reports for this file, so the
# resulting DataFrame is unchanged while Spark no longer needs an extra pass
# over the data to guess the types, and the types cannot drift with the data.
FLIGHTS_SCHEMA = (
    'mon INT, dom INT, dow INT, carrier STRING, flight INT, '
    'org STRING, mile INT, depart DOUBLE, duration INT, delay INT'
)

# header=True skips the header row; the literal string 'NA' is read as null.
flights = spark.read.csv(
    'data/flights.csv',
    sep=',',
    header=True,
    schema=FLIGHTS_SCHEMA,
    nullValue='NA',
)

flights.show(5)

                                                                                

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



## 머신러닝 주제
- 지연시간 예측
    + 종속변수 : delay
- 데이터의 행/열 개수

In [5]:
# Number of rows in the loaded flights DataFrame.
flights.count()

50000

In [6]:
# Number of columns in the loaded flights DataFrame.
len(flights.columns)

10

In [7]:
# List each column name together with its Spark data type.
flights.dtypes

[('mon', 'int'),
 ('dom', 'int'),
 ('dow', 'int'),
 ('carrier', 'string'),
 ('flight', 'int'),
 ('org', 'string'),
 ('mile', 'int'),
 ('depart', 'double'),
 ('duration', 'int'),
 ('delay', 'int')]

## 데이터 전처리 & 피처엔지니어링
- flight 컬럼 삭제

In [8]:
# Keep every column except 'flight', preserving the original column order.
flights2 = flights.select(
    [name for name in flights.columns if name != 'flight']
)
len(flights2.columns)

9

In [9]:
# Count the rows whose 'delay' value is missing.
flights2.where(flights2['delay'].isNull()).count()

2978

- NA 결측치 삭제

In [10]:
# Keep only the rows that have a 'delay' value, then count what remains.
flights3 = flights2.where(flights2['delay'].isNotNull())
flights3.count()

47022

In [13]:
# Drop any row that still contains a null in any column, then count the rows
# left so the result can be compared with the count before the drop.
flights4 = flights3.na.drop()
flights4.count()

47022

- NA 채우기

In [14]:
# Alternative to dropping rows: replace missing 'delay' values with 0.
# NOTE(review): 'delay' is the notebook's stated target variable, so filling
# it with 0 invents labels -- confirm this frame is for demonstration only
# and is not used for model training.
flights5 = flights2.na.fill(0, subset=['delay'])
flights5.show(5)

+---+---+---+-------+---+----+------+--------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|
+---+---+---+-------+---+----+------+--------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|    0|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|ORD| 258|  8.92|      65|    0|
+---+---+---+-------+---+----+------+--------+-----+
only showing top 5 rows



In [15]:
# Row count after filling: no rows are removed when nulls are filled.
flights5.count()

50000

## 파생변수 만들기
- withColumn()을 활용해서 새로운 변수를 추가할 것
    + 참조 : https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.DataFrame.withColumn.html