In [14]:
import glob
import json
import os
import subprocess
import urllib
import urllib.request

import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, TimestampType, DoubleType, ShortType

In [15]:
# Run-mode switches.
#   overwrite  -- when True, the existence checks report "missing" so every step is redone.
#   databricks -- selects the DBFS mount instead of the local data folder.
overwrite  = False
databricks = False

if databricks:
    data_dir = "/dbfs/mnt/group01"
else:
    data_dir = '../data'
    # A session is only created here for local runs; in Databricks mode `spark`
    # is presumably supplied by the platform -- TODO confirm.
    spark = SparkSession.builder.getOrCreate()

# Driver-side filesystem paths, one per pipeline stage.
foil_dir = os.path.join(data_dir, "foil")
down_dir = os.path.join(foil_dir, "down")
csvs_dir = os.path.join(foil_dir, "csv")
zips_dir = os.path.join(foil_dir, "zip")
raws_dir = os.path.join(foil_dir, "raw")

# Paths handed to Spark: identical to the local ones, except that on
# Databricks the "/dbfs" part is removed.
if databricks:
    foil_dbfs, down_dbfs, csvs_dbfs, zips_dbfs, raws_dbfs = (
        p.replace("/dbfs", "")
        for p in (foil_dir, down_dir, csvs_dir, zips_dir, raws_dir)
    )
else:
    foil_dbfs, down_dbfs, csvs_dbfs, zips_dbfs, raws_dbfs = (
        foil_dir, down_dir, csvs_dir, zips_dir, raws_dir
    )

# Directories that must exist before the pipeline runs (raws_dir is not listed).
dirs = [data_dir, foil_dir, down_dir, csvs_dir, zips_dir]

In [16]:
# Create whichever working directories are still missing.  The generator is
# lazy, so each existence check happens right before its own makedirs call.
missing_dirs = (candidate for candidate in dirs if not os.path.exists(candidate))
for d in missing_dirs:
    os.makedirs(d)

In [17]:
# Inclusive year and month ranges iterated by the processing steps.
year_start, year_end = 2010, 2013
# "moth" is a typo for "month"; the names are kept because later cells reference them.
moth_start, moth_end = 1, 12

In [18]:
def check_file_exist(_path):
    """Tell whether the step producing ``_path`` can be skipped.

    Returns True (and prints a notice) when ``_path`` exists and the global
    ``overwrite`` flag is off; returns False otherwise.
    """
    can_skip = os.path.exists(_path) and not overwrite
    if can_skip:
        print("[SYSTEM]: File exists: {}".format(_path))
    return can_skip

In [19]:
def download_foil_data():
    """Download every datafile of the FOIL dataset into ``down_dir``.

    The dataset page is fetched and parsed as JSON.  For each entry of its
    ``datafiles`` list, the file is retrieved from the per-file download URL
    (built from ``web_id``) and stored under its ``binary_name``.  Files for
    which ``check_file_exist`` returns True are skipped.

    Each transfer is written to ``<name>.part`` and only renamed onto the
    final path once it has completed.  Without this, an interrupted download
    would leave a truncated file that later runs would treat as already
    downloaded.

    Raises:
        Whatever ``requests``, ``json`` or ``urllib`` raise on network or
        parse failure; the partial file is removed before re-raising.
    """
    title_fmt = "____________________________FOIL_DOWNLOAD_{}____________________________"

    # Start to download the FOIL files
    data_page = "https://databank.illinois.edu/datasets/IDB-9610843"
    down_page = "https://databank.illinois.edu/datafiles/{}/download"

    requ = requests.get(data_page)
    resp = json.loads(requ.text)

    for datafile in resp['datafiles']:
        file_name  = datafile["binary_name"]
        local_path = os.path.join(down_dir, file_name)      # final storage location
        part_path  = local_path + ".part"                   # in-progress download
        remot_path = down_page.format(datafile["web_id"])   # remote download location

        print(title_fmt.format(file_name))
        if check_file_exist(local_path):
            continue

        print("[SYSTEM]: Start  {}".format(local_path))
        try:
            urllib.request.urlretrieve(remot_path, part_path)
            # Publish the file only after the transfer finished.
            os.replace(part_path, local_path)
        except BaseException:
            # Also covers a kernel interrupt: never leave a half-written file behind.
            if os.path.exists(part_path):
                os.remove(part_path)
            raise
        print("[SYSTEM]: Finish {}".format(local_path))

In [20]:
# Fetch the dataset files into down_dir; files already present are skipped unless `overwrite` is set.
download_foil_data()

____________________________FOIL_DOWNLOAD_decompress.py____________________________
[SYSTEM]: File exists: ../data/foil/down/decompress.py
____________________________FOIL_DOWNLOAD_FOIL2012.zip____________________________
[SYSTEM]: File exists: ../data/foil/down/FOIL2012.zip
____________________________FOIL_DOWNLOAD_FOIL2011.zip____________________________
[SYSTEM]: File exists: ../data/foil/down/FOIL2011.zip
____________________________FOIL_DOWNLOAD_New_York_City_Taxi_Data_2010-2013.pdf____________________________
[SYSTEM]: File exists: ../data/foil/down/New_York_City_Taxi_Data_2010-2013.pdf
____________________________FOIL_DOWNLOAD_FOIL2010.zip____________________________
[SYSTEM]: File exists: ../data/foil/down/FOIL2010.zip
____________________________FOIL_DOWNLOAD_FOIL2013.zip____________________________
[SYSTEM]: File exists: ../data/foil/down/FOIL2013.zip


In [21]:
def extract_foil_down():
    """Unpack every ``*.zip`` found in ``down_dir`` into ``zips_dir``.

    An archive is skipped when ``check_file_exist`` reports that
    ``zips_dir/<archive name without .zip>`` is already present.

    Extraction runs ``jar -xvf`` with ``zips_dir`` as working directory.  The
    archive is passed as an absolute path: ``glob`` returns paths relative to
    the notebook's working directory, and those no longer resolve once the
    command runs inside ``zips_dir``.

    Raises:
        subprocess.CalledProcessError: if ``jar`` exits with a non-zero status.
        FileNotFoundError: if the ``jar`` executable cannot be found.
    """
    title_fmt = "____________________________FOIL_EXTRACT_{}____________________________"

    for zip_file in glob.glob(os.path.join(down_dir, "*.zip")):
        zip_name = os.path.basename(zip_file)
        target_folder = os.path.join(zips_dir, os.path.splitext(zip_name)[0])
        if check_file_exist(target_folder):
            continue

        print(title_fmt.format(zip_name))
        print("[SYSTEM]: Start  {}".format(zip_name))
        # Argument list + cwd instead of a formatted shell string: no quoting
        # problems, and check=True surfaces a failed extraction instead of
        # printing "Finish" regardless of the outcome.
        subprocess.run(
            ["jar", "-xvf", os.path.abspath(zip_file)],
            cwd=zips_dir,
            check=True,
        )
        print("[SYSTEM]: Finish {}".format(zip_name))

In [22]:
# Unpack the downloaded archives from down_dir into zips_dir.
extract_foil_down()

____________________________FOIL_EXTRACT_FOIL2012.zip____________________________
[SYSTEM]: Start  FOIL2012.zip
[SYSTEM]: Finish FOIL2012.zip
____________________________FOIL_EXTRACT_FOIL2013.zip____________________________
[SYSTEM]: Start  FOIL2013.zip
[SYSTEM]: Finish FOIL2013.zip
____________________________FOIL_EXTRACT_FOIL2011.zip____________________________
[SYSTEM]: Start  FOIL2011.zip
[SYSTEM]: Finish FOIL2011.zip
____________________________FOIL_EXTRACT_FOIL2010.zip____________________________
[SYSTEM]: Start  FOIL2010.zip
[SYSTEM]: Finish FOIL2010.zip


In [23]:
def extract_foil_zip():
    """Unpack the per-year inner zip files into ``csvs_dir/<year>``.

    For every year in ``year_start..year_end`` (inclusive), each ``*.zip``
    under ``zips_dir/FOIL<year>`` is extracted with ``jar -xvf`` into
    ``csvs_dir/<year>``.  An archive is skipped when ``check_file_exist``
    reports that a ``.csv`` with the same base name already exists there.

    The archive is passed to ``jar`` as an absolute path because the command
    runs with the target folder as working directory, where a relative path
    from ``glob`` would no longer resolve.

    Raises:
        subprocess.CalledProcessError: if ``jar`` exits with a non-zero status.
        FileNotFoundError: if the ``jar`` executable cannot be found.
    """
    title_fmt = "____________________________FOIL_ZIP_EXTRACT_{}____________________________"

    for _year in range(year_start, year_end + 1):
        zip_foil = os.path.join(zips_dir, "FOIL{}".format(_year))
        zip_files = glob.glob(os.path.join(zip_foil, "*.zip"))
        tgt_foil = os.path.join(csvs_dir, "{}".format(_year))

        print(title_fmt.format(_year))
        # Only create the target folder when there is something to extract.
        if zip_files and not os.path.exists(tgt_foil):
            os.makedirs(tgt_foil)

        for zip_file in zip_files:
            zip_name = os.path.basename(zip_file)
            # Swap only the extension; a blanket "zip" -> "csv" replace would
            # also rewrite any "zip" occurring elsewhere in the path.
            csv_name = os.path.splitext(zip_name)[0] + ".csv"
            if check_file_exist(os.path.join(tgt_foil, csv_name)):
                continue

            print("[SYSTEM]: Start {}".format(zip_name))
            subprocess.run(
                ["jar", "-xvf", os.path.abspath(zip_file)],
                cwd=tgt_foil,
                check=True,
            )
            print("[SYSTEM]: Finish {}".format(zip_name))

In [24]:
# Unpack the inner zip files of each year from zips_dir/FOIL<year> into csvs_dir/<year>.
extract_foil_zip()

____________________________FOIL_ZIP_EXTRACT_2010____________________________
____________________________FOIL_ZIP_EXTRACT_2011____________________________
____________________________FOIL_ZIP_EXTRACT_2012____________________________
____________________________FOIL_ZIP_EXTRACT_2013____________________________


In [25]:
def process_trip(_in_df):
    """Cast and rename the trip columns that are kept, and drop the unused ones.

    ``medallion`` is cast to integer in place.  Every other retained column is
    read under its space-prefixed name (e.g. ``" hack_license"``), cast, stored
    under the name without the leading space, and the space-prefixed original
    is dropped.  ``" passenger_count"``, ``" vendor_id"`` and
    ``" store_and_fwd_flag"`` are dropped without replacement.

    Returns the transformed DataFrame; ``_in_df`` itself is not modified.
    """
    # (clean column name, target type), applied in this order.
    typed_columns = (
        ("hack_license", IntegerType()),
        ("pickup_datetime", TimestampType()),
        ("dropoff_latitude", DoubleType()),
        ("dropoff_longitude", DoubleType()),
        ("pickup_latitude", DoubleType()),
        ("pickup_longitude", DoubleType()),
        ("trip_distance", DoubleType()),
        ("trip_time_in_secs", IntegerType()),
        ("dropoff_datetime", TimestampType()),
        ("rate_code", ShortType()),
    )
    unused_columns = (" passenger_count", " vendor_id", " store_and_fwd_flag")

    trip_df = _in_df.withColumn("medallion", col("medallion").cast(IntegerType()))
    for clean_name, target_type in typed_columns:
        raw_name = " " + clean_name
        trip_df = trip_df.withColumn(clean_name, col(raw_name).cast(target_type))
        trip_df = trip_df.drop(raw_name)
    for raw_name in unused_columns:
        trip_df = trip_df.drop(raw_name)
    return trip_df

def process_fare(_in_df):
    """Cast and rename the fare columns that are kept, and drop the unused ones.

    ``medallion`` is cast to integer in place.  ``hack_license``,
    ``pickup_datetime``, ``tip_amount`` and ``total_amount`` are read under
    their space-prefixed names, cast, stored under the name without the
    leading space, and the space-prefixed originals are dropped.  The
    remaining fare columns listed below are dropped without replacement.

    Returns the transformed DataFrame; ``_in_df`` itself is not modified.
    """
    # (clean column name, target type), applied in this order.
    typed_columns = (
        ("hack_license", IntegerType()),
        ("pickup_datetime", TimestampType()),
        ("tip_amount", DoubleType()),
        ("total_amount", DoubleType()),
    )
    unused_columns = (
        " vendor_id",
        " payment_type",
        " surcharge",
        " mta_tax",
        " tolls_amount",
        " fare_amount",
    )

    fare_df = _in_df.withColumn("medallion", col("medallion").cast(IntegerType()))
    for clean_name, target_type in typed_columns:
        raw_name = " " + clean_name
        fare_df = fare_df.withColumn(clean_name, col(raw_name).cast(target_type))
        fare_df = fare_df.drop(raw_name)
    for raw_name in unused_columns:
        fare_df = fare_df.drop(raw_name)
    return fare_df

def combine_raw_data():
    """Join each month's trip and fare CSVs and write the result as gzip parquet.

    For every year in ``year_start..year_end`` and month in
    ``moth_start..moth_end`` (both inclusive) the function reads
    ``trip_data_<month>.csv`` and ``trip_fare_<month>.csv`` from
    ``csvs_dbfs/<year>``, normalises them with ``process_trip`` and
    ``process_fare``, inner-joins them on ``medallion``, ``hack_license`` and
    ``pickup_datetime``, and writes the result to
    ``raws_dbfs/<year>/<month>.gz.parquet``.

    A month is skipped only when its output directory contains a ``_SUCCESS``
    marker and ``check_file_exist`` agrees.  Checking the directory alone is
    not enough: a write that aborts part-way (for example on a full disk)
    leaves the directory behind, and a re-run would then silently skip an
    incomplete month.  This relies on the job-success marker being written,
    which is the default; outputs produced with that marker disabled will be
    regenerated.
    """
    title_fmt = "____________________________FOIL_COMBINE_{}_{}____________________________"

    for _year in range(year_start, year_end + 1):
        for _month in range(moth_start, moth_end + 1):
            print(title_fmt.format(_year, _month))

            rel_name = "{}/{}.gz.parquet".format(_year, _month)
            tar_file = os.path.join(raws_dir, rel_name)    # driver-side path, used for the skip check
            tar_dbfs = os.path.join(raws_dbfs, rel_name)   # path handed to Spark

            write_completed = os.path.exists(os.path.join(tar_file, "_SUCCESS"))
            if write_completed and check_file_exist(tar_file):
                continue

            fare_dbfs = os.path.join(csvs_dbfs, "{}/trip_fare_{}.csv".format(_year, _month))
            trip_dbfs = os.path.join(csvs_dbfs, "{}/trip_data_{}.csv".format(_year, _month))

            print("[SYSTEM]: Start  {}-{}".format(_year, _month))

            _fare_df = process_fare(spark.read.option("header", True).csv(fare_dbfs))
            _trip_df = process_trip(spark.read.option("header", True).csv(trip_dbfs))

            rs_df = _trip_df.join(_fare_df, ["medallion", "hack_license", "pickup_datetime"])
            # mode("overwrite") also clears any leftovers of an earlier failed write.
            rs_df.repartition(200)\
                .write.mode("overwrite")\
                .option("compression", "gzip")\
                .parquet(tar_dbfs)

            print("[SYSTEM]: Finish {}-{}".format(_year, _month))

In [26]:
# Join trip and fare CSVs month by month and write gzip parquet under the raw directory.
# In the recorded run below, the write for 2010-7 aborted with "No space left on device".
combine_raw_data()

____________________________FOIL_COMBINE_2010_1____________________________
[SYSTEM]: File exists: ../data/foil/raw/2010/1.gz.parquet
____________________________FOIL_COMBINE_2010_2____________________________
[SYSTEM]: File exists: ../data/foil/raw/2010/2.gz.parquet
____________________________FOIL_COMBINE_2010_3____________________________
[SYSTEM]: File exists: ../data/foil/raw/2010/3.gz.parquet
____________________________FOIL_COMBINE_2010_4____________________________
[SYSTEM]: File exists: ../data/foil/raw/2010/4.gz.parquet
____________________________FOIL_COMBINE_2010_5____________________________
[SYSTEM]: File exists: ../data/foil/raw/2010/5.gz.parquet
____________________________FOIL_COMBINE_2010_6____________________________
[SYSTEM]: File exists: ../data/foil/raw/2010/6.gz.parquet
____________________________FOIL_COMBINE_2010_7____________________________
[SYSTEM]: Start  2010-7


Py4JJavaError: An error occurred while calling o337.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:226)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:178)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:122)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:121)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:963)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:963)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:399)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:848)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 8 in stage 14.0 failed 1 times, most recent failure: Lost task 8.0 in stage 14.0 (TID 293, 192.168.1.15, executor driver): org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:291)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:205)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.fs.FSError: java.io.IOException: No space left on device
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.write(RawLocalFileSystem.java:275)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:57)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.writeChunk(ChecksumFileSystem.java:427)
	at org.apache.hadoop.fs.FSOutputSummer.writeChecksumChunks(FSOutputSummer.java:217)
	at org.apache.hadoop.fs.FSOutputSummer.write1(FSOutputSummer.java:125)
	at org.apache.hadoop.fs.FSOutputSummer.write(FSOutputSummer.java:111)
	at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:57)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.parquet.hadoop.util.HadoopPositionOutputStream.write(HadoopPositionOutputStream.java:50)
	at org.apache.parquet.bytes.BytesInput$ByteArrayBytesInput.writeAllTo(BytesInput.java:449)
	at org.apache.parquet.hadoop.ParquetFileWriter.writeDictionaryPage(ParquetFileWriter.java:346)
	at org.apache.parquet.hadoop.ColumnChunkPageWriteStore$ColumnChunkPageWriter.writeToFileWriter(ColumnChunkPageWriteStore.java:198)
	at org.apache.parquet.hadoop.ColumnChunkPageWriteStore.flushToFileWriter(ColumnChunkPageWriteStore.java:261)
	at org.apache.parquet.hadoop.InternalParquetRecordWriter.flushRowGroupToStore(InternalParquetRecordWriter.java:173)
	at org.apache.parquet.hadoop.InternalParquetRecordWriter.close(InternalParquetRecordWriter.java:114)
	at org.apache.parquet.hadoop.ParquetRecordWriter.close(ParquetRecordWriter.java:165)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.close(ParquetOutputWriter.scala:42)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.releaseResources(FileFormatDataWriter.scala:58)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.commit(FileFormatDataWriter.scala:75)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:275)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:281)
	... 9 more
Caused by: java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.write(RawLocalFileSystem.java:273)
	... 33 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:195)
	... 33 more
Caused by: org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:291)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:205)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.hadoop.fs.FSError: java.io.IOException: No space left on device
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.write(RawLocalFileSystem.java:275)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:57)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.writeChunk(ChecksumFileSystem.java:427)
	at org.apache.hadoop.fs.FSOutputSummer.writeChecksumChunks(FSOutputSummer.java:217)
	at org.apache.hadoop.fs.FSOutputSummer.write1(FSOutputSummer.java:125)
	at org.apache.hadoop.fs.FSOutputSummer.write(FSOutputSummer.java:111)
	at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:57)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.parquet.hadoop.util.HadoopPositionOutputStream.write(HadoopPositionOutputStream.java:50)
	at org.apache.parquet.bytes.BytesInput$ByteArrayBytesInput.writeAllTo(BytesInput.java:449)
	at org.apache.parquet.hadoop.ParquetFileWriter.writeDictionaryPage(ParquetFileWriter.java:346)
	at org.apache.parquet.hadoop.ColumnChunkPageWriteStore$ColumnChunkPageWriter.writeToFileWriter(ColumnChunkPageWriteStore.java:198)
	at org.apache.parquet.hadoop.ColumnChunkPageWriteStore.flushToFileWriter(ColumnChunkPageWriteStore.java:261)
	at org.apache.parquet.hadoop.InternalParquetRecordWriter.flushRowGroupToStore(InternalParquetRecordWriter.java:173)
	at org.apache.parquet.hadoop.InternalParquetRecordWriter.close(InternalParquetRecordWriter.java:114)
	at org.apache.parquet.hadoop.ParquetRecordWriter.close(ParquetRecordWriter.java:165)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.close(ParquetOutputWriter.scala:42)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.releaseResources(FileFormatDataWriter.scala:58)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.commit(FileFormatDataWriter.scala:75)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:275)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:281)
	... 9 more
Caused by: java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.write(RawLocalFileSystem.java:273)
	... 33 more
