In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("taxi-duration-prediction")\
            .config("spark.executor.memory", MAX_MEMORY)\
            .config("spark.driver.memory", MAX_MEMORY).getOrCreate()

In [3]:
trip_files = "/Users/keon/fastcampus/data-engineering/01-spark/data/trips/*"

In [4]:
trips_df = spark.read.csv(f"file:///{trip_files}", inferSchema=True, header=True)

In [5]:
trips_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [6]:
trips_df.createOrReplaceTempView("trips")

In [46]:
query = """
SELECT
    trip_distance,
    total_amount
FROM
    (SELECT
        *,
        TO_DATE(t.tpep_pickup_datetime) AS pickup_date
    FROM
        trips t)
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 4
    AND pickup_date >= '2021-01-01'
    AND pickup_date < '2021-08-01'
"""
data_df = spark.sql(query)
data_df.createOrReplaceTempView("data")

In [41]:
## Play with data
query = """
SELECT 
    *,
    trip_distance / duration / 60 / 60 AS mph
FROM
    data
"""
res = spark.sql(query)

In [47]:
res.describe().show()

+-------+------------------+-----------------+--------------------+
|summary|     trip_distance|         duration|                 mph|
+-------+------------------+-----------------+--------------------+
|  count|          13125582|         13125582|            13125582|
|   mean| 2.882109632929013|903.7218041836164|1.049888189940366...|
| stddev|3.8203201017216077|3557.275401003693|7.961356651144816E-6|
|    min|              0.01|                1|3.250079301934967...|
|    max|             475.5|          1729062|0.005388888888888888|
+-------+------------------+-----------------+--------------------+



In [48]:
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=1)
print(train_df.count())
print(test_df.count())

10500134
2625906


In [49]:
from pyspark.ml.feature import VectorAssembler

In [50]:
vassembler = VectorAssembler(inputCols=['trip_distance'], outputCol='features')
vtrain_df = vassembler.transform(train_df)
vtrain_df.show()

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
only showing top 20 rows



In [51]:
from pyspark.ml.regression import LinearRegression

In [53]:
lr = LinearRegression(
    maxIter=50,
    labelCol="total_amount",
    featuresCol="features",
    elasticNetParam=0.5, 
    regParam=0.01,
)

In [54]:
model = lr.fit(vtrain_df)

In [55]:
from pyspark.ml.evaluation import RegressionEvaluator

In [56]:
vtest_df = vassembler.transform(test_df)

In [57]:
predictions = model.transform(vtest_df).cache()

In [58]:
predictions.show()

+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.3|  [0.01]|9.420298598289232|
|         0.01|         3.8|  [0.01]|9.420298598

In [59]:
from pyspark.sql.types import DoubleType
distance_list = [1.1, 5.5, 10.5, 30.0]
distances_df = spark.createDataFrame(distance_list, DoubleType()).toDF("trip_distance")

In [60]:
distances_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.5|
|         10.5|
|         30.0|
+-------------+



In [61]:
vdistances_df = vassembler.transform(distances_df)

In [62]:
vdistances_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|          1.1|   [1.1]|
|          5.5|   [5.5]|
|         10.5|  [10.5]|
|         30.0|  [30.0]|
+-------------+--------+



In [63]:
model.transform(vdistances_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.666196220094683|
|          5.5|   [5.5]| 25.76890221637357|
|         10.5|  [10.5]|40.658340848508665|
|         30.0|  [30.0]| 98.72715151383554|
+-------------+--------+------------------+



# 성능 평가

In [64]:
model.summary

<pyspark.ml.regression.LinearRegressionTrainingSummary at 0x7fe205918d30>

In [65]:
print("RMSE: ", model.summary.rootMeanSquaredError)

RMSE:  6.264524760859883


In [66]:
print("R2: ", model.summary.r2)
# R2:  0.018565176935511962

R2:  0.767718727019524


In [67]:
train_df.describe().show()

+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|          10500134|          10500134|
|   mean|2.8817834020028044|17.972147144070217|
| stddev| 3.821344671677971|12.998135264013039|
|    min|              0.01|              0.01|
|    max|             475.5|            4973.3|
+-------+------------------+------------------+



In [68]:
spark.stop()