# 원본 데이터(csv) jsonl 및 parquet로 만들기


실제 항공정보가 하나씩 json객체로 스트리밍으로 전송된다고 가정.

전송된 json 객체는 kinesis stream, firehose를 거쳐서 s3에 parquet로 저장되게 설계함.

스트리밍으로 전송되는 것을 구현하기 위해 원본데이터(csv)를 json으로 변환해 kinesis로 전송되게 한다.

모든 원본 데이터를 전송할 것은 아니므로 원본 데이터의 일부분을 떼어내서 json으로 만들고, 만든 데이터를 읽어 들여 전송한다. (원본 데이터 용량이 2.6G나 된다.)

결과적으로 s3에는 parquet로 변환된 데이터가 쌓이게 되는데, 모든 데이터를 전송했다고 가정하고 원본데이터(csv)를 줄이지 않고 parquet로 변환하여 저장한다!


# 원본 데이터 jsonl로 만들기

In [1]:
# Download the original CSV from S3 into ../data.
# NOTE: the file already exists locally, so this cell does not need to be re-run.
!aws s3 cp s3://jhw620/Airplane_bigdata/On_Time_On_Time_Performance_2015.csv ../data

download: s3://jhw620/Airplane_bigdata/On_Time_On_Time_Performance_2015.csv to ../data/On_Time_On_Time_Performance_2015.csv


In [2]:
# Create the Spark entry points (`sc`, `spark`) unless this session already has them.
try:
    sc and spark  # Raises NameError when either name is not defined yet.
except NameError:
    import findspark
    findspark.init()  # Kept ahead of the pyspark imports, as in the original cell.
    import pyspark
    import pyspark.sql

    # getOrCreate() reuses a SparkContext that is already running, so this cell
    # no longer fails when only one of the two names was missing.
    sc = pyspark.SparkContext.getOrCreate()
    # Use the class-level builder directly instead of going through a
    # throwaway SparkSession instance.
    spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Load the raw on-time performance CSV, taking the column names from its header line.
on_time_dataframe = (
    spark.read
    .option("header", True)
    .csv('../data/On_Time_On_Time_Performance_2015.csv')
)
# Display the first row (out of a 10-row slice) as a quick check of the load.
on_time_dataframe.limit(10).head()

Row(Year='2015', Quarter='1', Month='1', DayofMonth='1', DayOfWeek='4', FlightDate='2015-01-01', UniqueCarrier='AA', AirlineID='19805', Carrier='AA', TailNum='N001AA', FlightNum='1519', OriginAirportID='11298', OriginAirportSeqID='1129803', OriginCityMarketID='30194', Origin='DFW', OriginCityName='Dallas/Fort Worth, TX', OriginState='TX', OriginStateFips='48', OriginStateName='Texas', OriginWac='74', DestAirportID='13244', DestAirportSeqID='1324402', DestCityMarketID='33244', Dest='MEM', DestCityName='Memphis, TN', DestState='TN', DestStateFips='47', DestStateName='Tennessee', DestWac='54', CRSDepTime='1345', DepTime='1342', DepDelay='-3.00', DepDelayMinutes='0.00', DepDel15='0.00', DepartureDelayGroups='-1', DepTimeBlk='1300-1359', TaxiOut='16.00', WheelsOff='1358', WheelsOn='1457', TaxiIn='7.00', CRSArrTime='1510', ArrTime='1504', ArrDelay='-6.00', ArrDelayMinutes='0.00', ArrDel15='0.00', ArrivalDelayGroups='-1', ArrTimeBlk='1500-1559', Cancelled='0.00', CancellationCode=None, Divert

In [3]:
# Expose the loaded DataFrame to Spark SQL under the name "on_time_performance".
# createOrReplaceTempView replaces the deprecated registerTempTable and, like it,
# overwrites any existing view with the same name.
on_time_dataframe.createOrReplaceTempView("on_time_performance")

# Keep only the columns of interest and cast the numeric ones from string
# to float/int; the remaining columns stay as strings.
trimmed_cast_performance = spark.sql("""
SELECT
  Year, Quarter, Month, DayofMonth, DayOfWeek, FlightDate,
  Carrier, TailNum, FlightNum,
  Origin, OriginCityName, OriginState,
  Dest, DestCityName, DestState,
  DepTime, cast(DepDelay as float), cast(DepDelayMinutes as int),
  cast(TaxiOut as float), cast(TaxiIn as float),
  WheelsOff, WheelsOn,
  ArrTime, cast(ArrDelay as float), cast(ArrDelayMinutes as float),
  cast(Cancelled as int), cast(Diverted as int),
  cast(ActualElapsedTime as float), cast(AirTime as float),
  cast(Flights as int), cast(Distance as float),
  cast(CarrierDelay as float), cast(WeatherDelay as float), 
  cast(NASDelay as float),
  cast(SecurityDelay as float), 
  cast(LateAircraftDelay as float),
  CRSDepTime, CRSArrTime
FROM
  on_time_performance
""")

# Re-register the trimmed result under the same view name, so later queries
# against "on_time_performance" see the trimmed, typed columns.
trimmed_cast_performance.createOrReplaceTempView("on_time_performance")
trimmed_cast_performance.limit(10).head()

Row(Year='2015', Quarter='1', Month='1', DayofMonth='1', DayOfWeek='4', FlightDate='2015-01-01', Carrier='AA', TailNum='N001AA', FlightNum='1519', Origin='DFW', OriginCityName='Dallas/Fort Worth, TX', OriginState='TX', Dest='MEM', DestCityName='Memphis, TN', DestState='TN', DepTime='1342', DepDelay=-3.0, DepDelayMinutes=0, TaxiOut=16.0, TaxiIn=7.0, WheelsOff='1358', WheelsOn='1457', ArrTime='1504', ArrDelay=-6.0, ArrDelayMinutes=0.0, Cancelled=0, Diverted=0, ActualElapsedTime=82.0, AirTime=59.0, Flights=1, Distance=432.0, CarrierDelay=None, WeatherDelay=None, NASDelay=None, SecurityDelay=None, LateAircraftDelay=None, CRSDepTime='1345', CRSArrTime='1510')

In [4]:
# Shrink the data set by keeping one month of flights (2015-01-01 through 2015-01-31).
january_query = """
   SELECT * FROM on_time_performance WHERE FlightDate >= "2015-01-01" AND FlightDate <= "2015-01-31"
"""
january_data = spark.sql(january_query)
# Print the first 3 rows.
january_data.show(n=3)

+----+-------+-----+----------+---------+----------+-------+-------+---------+------+--------------------+-----------+----+--------------------+---------+-------+--------+---------------+-------+------+---------+--------+-------+--------+---------------+---------+--------+-----------------+-------+-------+--------+------------+------------+--------+-------------+-----------------+----------+----------+
|Year|Quarter|Month|DayofMonth|DayOfWeek|FlightDate|Carrier|TailNum|FlightNum|Origin|      OriginCityName|OriginState|Dest|        DestCityName|DestState|DepTime|DepDelay|DepDelayMinutes|TaxiOut|TaxiIn|WheelsOff|WheelsOn|ArrTime|ArrDelay|ArrDelayMinutes|Cancelled|Diverted|ActualElapsedTime|AirTime|Flights|Distance|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|CRSDepTime|CRSArrTime|
+----+-------+-----+----------+---------+----------+-------+-------+---------+------+--------------------+-----------+----+--------------------+---------+-------+--------+---------------+-

In [None]:
# Save the reduced (January) data set as JSON Lines.
import glob
import shutil

json_output_dir = "../data/stream_kinesis_godata.json"
january_data.repartition(1).write.mode('overwrite').json(json_output_dir)

# The JSON writer produces a directory of part files. With repartition(1) a
# single part file is expected; copy it to a stable, single-file name.
# (The original used os.system("cp ...") without importing os and without
# checking whether the copy succeeded.)
part_files = glob.glob(json_output_dir + "/part*")
if len(part_files) != 1:
    raise RuntimeError(
        "expected exactly one part file in {}, found {}".format(
            json_output_dir, len(part_files)
        )
    )
shutil.copy(part_files[0], "../data/Raw_Data.jsonl")

# 원본 데이터 parquet로 만들기

In [5]:
# Select every row of the "on_time_performance" view (no date filter).
all_data_query = """
   SELECT * FROM on_time_performance
"""
all_data = spark.sql(all_data_query)
# Print the first 3 rows.
all_data.show(n=3)

+----+-------+-----+----------+---------+----------+-------+-------+---------+------+--------------------+-----------+----+--------------------+---------+-------+--------+---------------+-------+------+---------+--------+-------+--------+---------------+---------+--------+-----------------+-------+-------+--------+------------+------------+--------+-------------+-----------------+----------+----------+
|Year|Quarter|Month|DayofMonth|DayOfWeek|FlightDate|Carrier|TailNum|FlightNum|Origin|      OriginCityName|OriginState|Dest|        DestCityName|DestState|DepTime|DepDelay|DepDelayMinutes|TaxiOut|TaxiIn|WheelsOff|WheelsOn|ArrTime|ArrDelay|ArrDelayMinutes|Cancelled|Diverted|ActualElapsedTime|AirTime|Flights|Distance|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|CRSDepTime|CRSArrTime|
+----+-------+-----+----------+---------+----------+-------+-------+---------+------+--------------------+-----------+----+--------------------+---------+-------+--------+---------------+-

In [6]:
# Write the full data set as Parquet from a single partition, replacing any
# existing output at the target path.
(
    all_data
    .repartition(1)
    .write
    .mode('overwrite')
    .parquet("../../02_Data_Batch_Processing/data/Raw_Data.parquet")
)