# Pre-processing

In [1]:
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Encoders

import java.io.File
import scala.collection.mutable.ListBuffer
import java.lang.Math._

In [2]:
val spark = SparkSession.builder.master("local[*]").appName("SparkPreProcessing").getOrCreate()
import spark.implicits._

spark = org.apache.spark.sql.SparkSession@670a2f78


org.apache.spark.sql.SparkSession@670a2f78

In [3]:
// http://alvinalexander.com/scala/how-to-list-files-in-directory-filter-names-scala/

def getListOfFiles(dir: String):List[File] = {
    val d = new File(dir)
    if (d.exists && d.isDirectory) {
        d.listFiles.filter(_.isFile).toList
    } else {
        List[File]()
    }
}

getListOfFiles: (dir: String)List[java.io.File]


In [4]:
val folder = "./2019/"
val csvFiles = getListOfFiles(folder)

folder = ./2019/
csvFiles = List(./2019/201902-citibike-tripdata.csv, ./2019/201911-citibike-tripdata.csv, ./2019/201903-citibike-tripdata.csv, ./2019/201908-citibike-tripdata.csv, ./2019/201912-citibike-tripdata.csv, ./2019/201906-citibike-tripdata.csv, ./2019/201901-citibike-tripdata.csv, ./2019/201909-citibike-tripdata.csv, ./2019/201904-citibike-tripdata.csv, ./2019/201907-citibike-tripdata.csv, ./2019/201905-citibike-tripdata.csv, ./2019/201910-citibike-tripdata.csv)


List(./2019/201902-citibike-tripdata.csv, ./2019/201911-citibike-tripdata.csv, ./2019/201903-citibike-tripdata.csv, ./2019/201908-citibike-tripdata.csv, ./2019/201912-citibike-tripdata.csv, ./2019/201906-citibike-tripdata.csv, ./2019/201901-citibike-tripdata.csv, ./2019/201909-citibike-tripdata.csv, ./2019/201904-citibike-tripdata.csv, ./2019/201907-citibike-tripdata.csv, ./2019/201905-citibike-tripdata.csv, ./2019/201910-citibike-tripdata.csv)

## Desired dataset

In [19]:
val year = csvFiles(0).getName().split("-")(0).take(4)
val month = csvFiles(0).getName().split("-")(0).takeRight(2)
val t0 = s"${year}-${month}-01 00:00:00.0000"

year = 2019
month = 02
t0 = 2019-02-01 00:00:00.0000


2019-02-01 00:00:00.0000

We want a simple dataset with the following structure:
- latitude
- longitude
- year
- month
- day
- hour
- min
- (weather)
- number of bikes available (target)

It will be the input of our learning algorithm. Then, we also have to store some informations about the stations, in another dataset, with the following structure:
- station_id
- name
- latitude
- longitude

## Generating a single .csv for the stations

This .csv consists of four columns:
- id
- name
- latitude
- longitude

We get all stations from all .csv files within the folder defined above.

In [116]:
val newColumns = Seq("id",
                     "name",
                     "latitude",
                     "longitude"
                    )

val oldStartColumns = Seq("start station id",
                          "start station name",
                          "start station latitude",
                          "start station longitude"
                         )

val oldEndColumns = Seq("end station id",
                        "end station name",
                        "end station latitude",
                        "end station longitude"
                       )

val columnsStartList = oldStartColumns.zip(newColumns).map(f => { col(f._1).as(f._2) })

val columnsEndList = oldEndColumns.zip(newColumns).map(f => { col(f._1).as(f._2) })

newColumns = List(id, name, latitude, longitude)
oldStartColumns = List(start station id, start station name, start station latitude, start station longitude)
oldEndColumns = List(end station id, end station name, end station latitude, end station longitude)
columnsStartList = List(start station id AS `id`, start station name AS `name`, start station latitude AS `latitude`, start station longitude AS `longitude`)
columnsEndList = List(end station id AS `id`, end station name AS `name`, end station latitude AS `latitude`, end station longitude AS `longitude`)


List(end station id AS `id`, end station name AS `name`, end station latitude AS `latitude`, end station longitude AS `longitude`)

In [127]:
var stationsDf = Seq.empty[(String,String,String,String)]
                    .toDF(newColumns:_*)

for (csvFile <- csvFiles) {
    val df = spark.read
                  .option("header", "true")
                  .csv(csvFile.getPath())

    val startStations = df.select(df("start station id"),
                                  df("start station name"),
                                  df("start station latitude"),
                                  df("start station longitude")
                                 )
                           .distinct()
                           .select(columnsStartList:_*)
    
    val endStations = df.select(df("end station id"),
                                df("end station name"),
                                df("end station latitude"),
                                df("end station longitude")
                               )
                       .distinct()
                       .select(columnsEndList:_*)
    
    stationsDf = stationsDf.union(startStations)
                           .union(endStations)
                           .distinct()
}

stationsDf = [id: string, name: string ... 2 more fields]


[id: string, name: string ... 2 more fields]

In [41]:
stationsDf = stationsDf.dropDuplicates("id")

stationsDf = [id: string, name: string ... 2 more fields]


[id: string, name: string ... 2 more fields]

In [42]:
stationsDf.repartition(1)
          .write
          .format("com.databricks.spark.csv")
          .option("header", "true")
          .save("stations")

In [5]:
// Only after running the last cells and renaming the output as "stations.csv"

val stationsDf = spark.read
                      .option("header", "true")
                      .csv("outputs/stations.csv")

stationsDf = [id: string, name: string ... 2 more fields]


[id: string, name: string ... 2 more fields]

In [6]:
stationsDf.count()

1107

## Getting the number of bikes available at the beginning

To determine the number of bikes available at the beginning, we just have to select the first start station of each bike and add +1 to the initial number of bikes available in this station.

In [5]:
// Let's test with a single .csv

val df = spark.read
              .option("header", "true")
              .csv("./2019/201901-citibike-tripdata.csv")

df = [tripduration: string, starttime: string ... 13 more fields]


[tripduration: string, starttime: string ... 13 more fields]

In [6]:
df.printSchema()

root
 |-- tripduration: string (nullable = true)
 |-- starttime: string (nullable = true)
 |-- stoptime: string (nullable = true)
 |-- start station id: string (nullable = true)
 |-- start station name: string (nullable = true)
 |-- start station latitude: string (nullable = true)
 |-- start station longitude: string (nullable = true)
 |-- end station id: string (nullable = true)
 |-- end station name: string (nullable = true)
 |-- end station latitude: string (nullable = true)
 |-- end station longitude: string (nullable = true)
 |-- bikeid: string (nullable = true)
 |-- usertype: string (nullable = true)
 |-- birth year: string (nullable = true)
 |-- gender: string (nullable = true)



In [7]:
val newDf = df.select(df("starttime").as("time"),
                      df("start station id").as("station_id"),
                      df("bikeid").as("bike_id"))

[time: string, station_id: string ... 1 more field]

newDf = [time: string, station_id: string ... 1 more field]


In [8]:
// https://stackoverflow.com/questions/33878370/how-to-select-the-first-row-of-each-group

val bikeIdStationIdPairs = newDf.orderBy("time")
                                .groupBy("bike_id")
                                .agg(first("station_id").alias("station_id"))
                                .orderBy(asc("station_id"))
                                .filter("bike_id != 'NULL'")
                                .groupBy("station_id")
                                .agg(count("*").as("one"))
                                .cache()

bikeIdStationIdPairs = [station_id: string, one: bigint]


[station_id: string, one: bigint]

## Final pre-processing

### Step by step, with only one input file

In [9]:
val t0 = "2019-01-01 00:00:00.0000"
var finalDf = bikeIdStationIdPairs.withColumn("time", lit(t0))

t0 = 2019-01-01 00:00:00.0000
finalDf = [station_id: string, one: bigint ... 1 more field]


[station_id: string, one: bigint ... 1 more field]

In [10]:
finalDf = finalDf.select(finalDf("station_id"),
                         finalDf("time"),
                         finalDf("one"))

finalDf = [station_id: string, time: string ... 1 more field]


[station_id: string, time: string ... 1 more field]

In [11]:
val startDf = df.select(df("start station id").as("station_id"),
                       df("starttime").as("time"))
               .withColumn("one", lit(-1))

val endDf = df.select(df("end station id").as("station_id"),
                        df("stoptime").as("time"))
                .withColumn("one", lit(1))

val dataDf = startDf.union(endDf).sort($"time".asc)

startDf = [station_id: string, time: string ... 1 more field]
endDf = [station_id: string, time: string ... 1 more field]
dataDf = [station_id: string, time: string ... 1 more field]


[station_id: string, time: string ... 1 more field]

In [12]:
// finalDf.show(10)

Syntax Error.: 

In [13]:
// dataDf.show(10)

Syntax Error.: 

In [14]:
finalDf = finalDf.union(dataDf).sort($"time").cache()

finalDf = [station_id: string, time: string ... 1 more field]


[station_id: string, time: string ... 1 more field]

In [15]:
finalDf = finalDf.withColumn("nb_bikes_available", sum("one").over(
    Window.partitionBy("station_id")
          .orderBy(col("time").asc)
    )
)

finalDf = [station_id: string, time: string ... 2 more fields]


[station_id: string, time: string ... 2 more fields]

In [16]:
// finalDf.show(10, false)

Syntax Error.: 

In [17]:
finalDf = finalDf.select(finalDf("station_id"),
                         finalDf("time"),
                         finalDf("nb_bikes_available"))

finalDf = [station_id: string, time: string ... 1 more field]


[station_id: string, time: string ... 1 more field]

In [18]:
// finalDf.show(10, false)

Syntax Error.: 

In [19]:
val stationsDf = spark.read
                      .option("header", "true")
                      .csv("outputs/stations.csv")
                      .cache()

stationsDf = [id: string, name: string ... 2 more fields]


[id: string, name: string ... 2 more fields]

In [20]:
// stationsDf.show(10)

Syntax Error.: 

In [21]:
finalDf = finalDf.join(stationsDf,
                       finalDf("station_id") ===  stationsDf("id"),
                       "inner")

finalDf = [station_id: string, time: string ... 5 more fields]


[station_id: string, time: string ... 5 more fields]

In [22]:
finalDf.show(10)

+----------+--------------------+------------------+---+--------------------+-----------+-----------+
|station_id|                time|nb_bikes_available| id|                name|   latitude|  longitude|
+----------+--------------------+------------------+---+--------------------+-----------+-----------+
|       296|2019-01-01 00:00:...|                29|296|Division St & Bowery|40.71413089|-73.9970468|
|       296|2019-01-01 01:01:...|                28|296|Division St & Bowery|40.71413089|-73.9970468|
|       296|2019-01-01 01:15:...|                29|296|Division St & Bowery|40.71413089|-73.9970468|
|       296|2019-01-01 02:25:...|                30|296|Division St & Bowery|40.71413089|-73.9970468|
|       296|2019-01-01 03:26:...|                31|296|Division St & Bowery|40.71413089|-73.9970468|
|       296|2019-01-01 06:13:...|                32|296|Division St & Bowery|40.71413089|-73.9970468|
|       296|2019-01-01 06:30:...|                31|296|Division St & Bowery|40.71

In [23]:
finalDf = finalDf.withColumn("_tmp", split($"time", "\\ "))
                 .select(finalDf("latitude"),
                         finalDf("longitude"),
                         $"_tmp".getItem(0).as("date"),
                         $"_tmp".getItem(1).as("time"),
                         finalDf("nb_bikes_available"))

finalDf = [latitude: string, longitude: string ... 3 more fields]


[latitude: string, longitude: string ... 3 more fields]

In [24]:
finalDf.show(10)

+-----------+-----------+----------+-------------+------------------+
|   latitude|  longitude|      date|         time|nb_bikes_available|
+-----------+-----------+----------+-------------+------------------+
|40.71413089|-73.9970468|2019-01-01|00:00:00.0000|                29|
|40.71413089|-73.9970468|2019-01-01|01:01:13.0820|                28|
|40.71413089|-73.9970468|2019-01-01|01:15:00.9000|                29|
|40.71413089|-73.9970468|2019-01-01|02:25:48.7100|                30|
|40.71413089|-73.9970468|2019-01-01|03:26:20.0670|                31|
|40.71413089|-73.9970468|2019-01-01|06:13:17.5790|                32|
|40.71413089|-73.9970468|2019-01-01|06:30:54.8570|                31|
|40.71413089|-73.9970468|2019-01-01|07:44:13.5660|                32|
|40.71413089|-73.9970468|2019-01-01|08:11:59.0560|                31|
|40.71413089|-73.9970468|2019-01-01|08:16:00.5430|                30|
+-----------+-----------+----------+-------------+------------------+
only showing top 10 

In [25]:
finalDf = finalDf.withColumn("_tmp", split($"date", "\\-"))
                 .withColumn("_tmp2", split($"time", "\\:"))
                 .select(finalDf("latitude"),
                         finalDf("longitude"),
                         $"_tmp".getItem(0).as("year"),
                         $"_tmp".getItem(1).as("month"),
                         $"_tmp".getItem(2).as("day"),
                         $"_tmp2".getItem(0).as("hour"),
                         $"_tmp2".getItem(1).as("min"),
                         finalDf("nb_bikes_available"))

finalDf = [latitude: string, longitude: string ... 6 more fields]


[latitude: string, longitude: string ... 6 more fields]

In [26]:
finalDf.show(10)

+-----------+-----------+----+-----+---+----+---+------------------+
|   latitude|  longitude|year|month|day|hour|min|nb_bikes_available|
+-----------+-----------+----+-----+---+----+---+------------------+
|40.71413089|-73.9970468|2019|   01| 01|  00| 00|                29|
|40.71413089|-73.9970468|2019|   01| 01|  01| 01|                28|
|40.71413089|-73.9970468|2019|   01| 01|  01| 15|                29|
|40.71413089|-73.9970468|2019|   01| 01|  02| 25|                30|
|40.71413089|-73.9970468|2019|   01| 01|  03| 26|                31|
|40.71413089|-73.9970468|2019|   01| 01|  06| 13|                32|
|40.71413089|-73.9970468|2019|   01| 01|  06| 30|                31|
|40.71413089|-73.9970468|2019|   01| 01|  07| 44|                32|
|40.71413089|-73.9970468|2019|   01| 01|  08| 11|                31|
|40.71413089|-73.9970468|2019|   01| 01|  08| 16|                30|
+-----------+-----------+----+-----+---+----+---+------------------+
only showing top 10 rows



In [None]:
finalDf.repartition(1)
       .write
       .format("com.databricks.spark.csv")
       .option("header", "true")
       .save("outputs/test")

### To run only once

In [5]:
val stationsDf = spark.read
                      .option("header", "true")
                      .csv("outputs/stations.csv")

for (csvFile <- csvFiles) {
    
    val year = csvFile.getName().split("-")(0).take(4)
    val month = csvFile.getName().split("-")(0).takeRight(2)
    val t0 = s"${year}-${month}-01 00:00:00.0000"
    
    // First: we get initial nb of bikes
    
    val df = spark.read
                  .option("header", "true")
                  .csv(csvFile.getPath())
    
    val newDf = df.select(df("starttime").as("time"),
                          df("start station id").as("station_id"),
                          df("bikeid").as("bike_id"))

    val bikeIdStationIdPairs = newDf.orderBy("time")
                                    .groupBy("bike_id")
                                    .agg(first("station_id").alias("station_id"))
                                    .orderBy(asc("station_id"))
                                    .filter("bike_id != 'NULL'")
                                    .groupBy("station_id")
                                    .agg(count("*").as("one"))
                                    .cache()
    
    // Then: we pre-process and join
    
    var finalDf = bikeIdStationIdPairs.withColumn("time", lit(t0))
    
    finalDf = finalDf.select(finalDf("station_id"),
                             finalDf("time"),
                             finalDf("one"))
    
    val startDf = df.select(df("start station id").as("station_id"),
                            df("starttime").as("time"))
                    .withColumn("one", lit(-1))

    val endDf = df.select(df("end station id").as("station_id"),
                          df("stoptime").as("time"))
                  .withColumn("one", lit(1))

    val dataDf = startDf.union(endDf).sort($"time".asc).cache()
    
    finalDf = finalDf.union(dataDf).sort($"time")
    
    finalDf = finalDf.withColumn("nb_bikes_available",
                                 sum("one").over(Window.partitionBy("station_id")
                                           .orderBy(col("time").asc))
                                )
    
    finalDf = finalDf.select(finalDf("station_id"),
                         finalDf("time"),
                         finalDf("nb_bikes_available"))
    
    finalDf = finalDf.join(stationsDf,
                       finalDf("station_id") ===  stationsDf("id"),
                       "inner")

    finalDf = finalDf.withColumn("_tmp", split($"time", "\\ "))
                     .select(finalDf("latitude"),
                             finalDf("longitude"),
                             $"_tmp".getItem(0).as("date"),
                             $"_tmp".getItem(1).as("time"),
                             finalDf("nb_bikes_available"))
    
    finalDf = finalDf.withColumn("_tmp", split($"date", "\\-"))
                     .withColumn("_tmp2", split($"time", "\\:"))
                     .select(finalDf("latitude"),
                             finalDf("longitude"),
                             $"_tmp".getItem(0).as("year"),
                             $"_tmp".getItem(1).as("month"),
                             $"_tmp".getItem(2).as("day"),
                             $"_tmp2".getItem(0).as("hour"),
                             $"_tmp2".getItem(1).as("min"),
                             finalDf("nb_bikes_available"))
    
    finalDf.repartition(1)
              .write
              .format("com.databricks.spark.csv")
              .option("header", "true")
              .save(s"outputs/${csvFile.getName()}")
}

stationsDf = [id: string, name: string ... 2 more fields]


[id: string, name: string ... 2 more fields]

## Dealing with geospatial representations

Reference: https://heartbeat.fritz.ai/working-with-geospatial-data-in-machine-learning-ad4097c7228d

In [80]:
val folder = "./outputs/dataset/Lat_Long"
val csvFiles = getListOfFiles(folder)

folder = ./outputs/dataset/Lat_Long
csvFiles = List(./outputs/dataset/Lat_Long/2019-09.csv, ./outputs/dataset/Lat_Long/2019-08.csv, ./outputs/dataset/Lat_Long/2019-04.csv, ./outputs/dataset/Lat_Long/2019-07.csv, ./outputs/dataset/Lat_Long/2019-11.csv, ./outputs/dataset/Lat_Long/2019-03.csv, ./outputs/dataset/Lat_Long/2019-06.csv, ./outputs/dataset/Lat_Long/2019-05.csv, ./outputs/dataset/Lat_Long/2019-12.csv, ./outputs/dataset/Lat_Long/2019-10.csv, ./outputs/dataset/Lat_Long/2019-01.csv, ./outputs/dataset/Lat_Long/2019-02.csv)


lastException: Throwable = null


List(./outputs/dataset/Lat_Long/2019-09.csv, ./outputs/dataset/Lat_Long/2019-08.csv, ./outputs/dataset/Lat_Long/2019-04.csv, ./outputs/dataset/Lat_Long/2019-07.csv, ./outputs/dataset/Lat_Long/2019-11.csv, ./outputs/dataset/Lat_Long/2019-03.csv, ./outputs/dataset/Lat_Long/2019-06.csv, ./outputs/dataset/Lat_Long/2019-05.csv, ./outputs/dataset/Lat_Long/2019-12.csv, ./outputs/dataset/Lat_Long/2019-10.csv, ./outputs/dataset/Lat_Long/2019-01.csv, ./outputs/dataset/Lat_Long/2019-02.csv)

In [25]:
def computeXCoo(latitude: Long, longitude: Long): Double = {
    return cos(latitude) * cos(longitude)
}

def computeYCoo(latitude: Long, longitude: Long): Double = {
    return cos(latitude) * sin(longitude)
}

def computeZCoo(latitude: Long): Double = {
    return sin(latitude)
}

val computeXCooUdf = udf(computeXCoo _)
val computeYCooUdf = udf(computeYCoo _)
val computeZCooUdf = udf(computeZCoo _)

computeXCooUdf = UserDefinedFunction(<function2>,DoubleType,Some(List(LongType, LongType)))
computeYCooUdf = UserDefinedFunction(<function2>,DoubleType,Some(List(LongType, LongType)))
computeZCooUdf = UserDefinedFunction(<function1>,DoubleType,Some(List(LongType)))


lastException: Throwable = null
computeXCoo: (latitude: Long, longitude: Long)Double
computeYCoo: (latitude: Long, longitude: Long)Double
computeZCoo: (latitude: Long)Double


UserDefinedFunction(<function1>,DoubleType,Some(List(LongType)))

### Step by step

In [61]:
val df = spark.read
              .option("header", "true")
              .csv(csvFiles(0).getPath())

df = [latitude: string, longitude: string ... 6 more fields]


lastException: Throwable = null


[latitude: string, longitude: string ... 6 more fields]

In [62]:
df.printSchema()

root
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- min: string (nullable = true)
 |-- nb_bikes_available: string (nullable = true)



In [63]:
df.show(10)

+-----------+-----------+----+-----+---+----+---+------------------+
|   latitude|  longitude|year|month|day|hour|min|nb_bikes_available|
+-----------+-----------+----+-----+---+----+---+------------------+
|40.71413089|-73.9970468|2019|   01| 01|  00| 00|                31|
|40.71413089|-73.9970468|2019|   09| 01|  00| 41|                30|
|40.71413089|-73.9970468|2019|   09| 01|  00| 59|                31|
|40.71413089|-73.9970468|2019|   09| 01|  01| 05|                30|
|40.71413089|-73.9970468|2019|   09| 01|  01| 15|                31|
|40.71413089|-73.9970468|2019|   09| 01|  02| 13|                30|
|40.71413089|-73.9970468|2019|   09| 01|  02| 13|                29|
|40.71413089|-73.9970468|2019|   09| 01|  02| 36|                30|
|40.71413089|-73.9970468|2019|   09| 01|  02| 48|                29|
|40.71413089|-73.9970468|2019|   09| 01|  02| 55|                28|
+-----------+-----------+----+-----+---+----+---+------------------+
only showing top 10 rows



In [75]:
// https://stackoverflow.com/questions/35227568/applying-function-to-spark-dataframe-column

val newDf = df.withColumn("x_coo", computeXCooUdf($"latitude", $"longitude"))
              .withColumn("y_coo", computeYCooUdf($"latitude", $"longitude"))
              .withColumn("z_coo", computeZCooUdf($"latitude"))
              .select($"x_coo",
                      $"y_coo",
                      $"z_coo",
                      df("year"),
                      df("month"),
                      df("day"),
                      df("hour"),
                      df("min"),
                      df("nb_bikes_available"))

newDf = [x_coo: double, y_coo: double ... 7 more fields]


[x_coo: double, y_coo: double ... 7 more fields]

In [76]:
newDf.show(10)

+------------------+-------------------+------------------+----+-----+---+----+---+------------------+
|             x_coo|              y_coo|             z_coo|year|month|day|hour|min|nb_bikes_available|
+------------------+-------------------+------------------+----+-----+---+----+---+------------------+
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|   01| 01|  00| 00|                31|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|   09| 01|  00| 41|                30|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|   09| 01|  00| 59|                31|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|   09| 01|  01| 05|                30|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|   09| 01|  01| 15|                31|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|   09| 01|  02| 13|                30|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|   09| 01

### To run only once

In [81]:
for (csvFile <- csvFiles) {
    val df = spark.read
                  .option("header", "true")
                  .csv(csvFile.getPath())

    val newDf = df.withColumn("x_coo", computeXCooUdf($"latitude", $"longitude"))
                  .withColumn("y_coo", computeYCooUdf($"latitude", $"longitude"))
                  .withColumn("z_coo", computeZCooUdf($"latitude"))
                  .select($"x_coo",
                          $"y_coo",
                          $"z_coo",
                          df("year"),
                          df("month"),
                          df("day"),
                          df("hour"),
                          df("min"),
                          df("nb_bikes_available"))
    
    newDf.repartition(1)
         .write
         .format("com.databricks.spark.csv")
         .option("header", "true")
         .save(s"outputs/${csvFile.getName()}")
}