# from pyspark.sql import SparkSession

In [45]:
from pyspark.sql import SparkSession

# Memory setting applied to both the executor and the driver below.
MAX_MEMORY = "5g"

# Build (or reuse) the Spark session used by every later cell.
# SparkSession is imported in this cell because the notebook's only other import of
# it is commented out; without an import, a fresh-kernel run fails here with NameError.
spark = (
    SparkSession.builder.appName("trip_count_by_zone_sql")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

In [46]:
import os

# Glob matching the trip CSV files. It defaults to the original author's local
# directory; set the TRIPS_DATA_GLOB environment variable to an absolute path glob
# to run the notebook on another machine. An unset or empty variable falls back to
# the default.
trip_files = (
    os.environ.get("TRIPS_DATA_GLOB")
    or "/Users/keon/fastcampus/data-engineering/01-spark/data/trips/*"
)

In [47]:
# Load every CSV matched by trip_files into one DataFrame, with the header row as
# column names and inferred column types.
# trip_files is an absolute path that already starts with "/", so the scheme prefix
# is "file://"; the earlier "file:///" prefix produced "file:////Users/...".
trips_df = spark.read.csv(f"file://{trip_files}", inferSchema=True, header=True)

In [48]:
# Print the column names and inferred types of trips_df.
trips_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [7]:
# Register trips_df as the temp view "trips" so it can be referenced from spark.sql.
trips_df.createOrReplaceTempView("trips")

In [63]:
# Build the training set: one row per trip with its distance and its duration.
# The inner query adds pickup_date and duration_sec (dropoff minus pickup, both via
# UNIX_TIMESTAMP). The outer WHERE keeps trips with 0 < total_amount < 5000,
# 0 < trip_distance < 500, passenger_count < 4, duration_sec > 0, and a pickup date
# from 2021-01-01 up to (but not including) 2021-08-01.
query = """
    SELECT 
        trip_distance,
        duration_sec
    FROM 
    (SELECT 
        *,
        TO_DATE(t.tpep_pickup_datetime) as pickup_date,
        UNIX_TIMESTAMP(t.tpep_dropoff_datetime) - UNIX_TIMESTAMP(t.tpep_pickup_datetime) as duration_sec
    FROM 
        trips t)
    WHERE
        total_amount < 5000
        AND total_amount > 0
        AND trip_distance < 500
        AND passenger_count < 4
        AND pickup_date >= '2021-01-01'
        AND pickup_date < '2021-08-01'
        AND trip_distance > 0
        AND duration_sec > 0
"""
# Run the query and also expose the result as the temp view "training".
training = spark.sql(query)
training.createOrReplaceTempView("training")

In [64]:
# Preview the filtered trips. The query result is bound to `training`; `combo_df`
# is not defined by any earlier cell and raised NameError on a fresh kernel.
training.show()

+-------------+------------+
|trip_distance|duration_sec|
+-------------+------------+
|          2.1|         362|
|          0.2|          59|
|         14.7|        1656|
|         10.6|         913|
|         4.94|         992|
|          1.6|         481|
|          4.1|        1020|
|          5.7|        1085|
|          9.1|        1257|
|          2.7|         814|
|         6.11|        1335|
|         1.21|         429|
|          7.4|        1332|
|          1.7|         466|
|         0.81|         133|
|         1.01|         247|
|         0.73|         299|
|         1.17|         297|
|         0.78|         217|
|         1.66|         514|
+-------------+------------+
only showing top 20 rows



# ML

In [65]:
# Split the filtered trips 80/20 into training and test sets with a fixed seed.
# The source is `training` (the query result); `model_df` is not defined by any
# earlier cell and raised NameError on a fresh kernel.
train_df, test_df = training.randomSplit([0.8, 0.2], seed=2019)
print("Train set count: ", train_df.count())
print("Test set count:", test_df.count())

Train set count:  11133546
Test set count: 2784251


In [66]:
from pyspark.ml.feature import VectorAssembler

# VectorAssembler combines several input columns into a single vector column.
# Here the only input is trip_distance, and the result is written to "features".
vassembler = VectorAssembler(
    outputCol="features",
    inputCols=["trip_distance"],
)
vtrain_df = vassembler.transform(train_df)
vtrain_df.show()

+-------------+------------+--------+
|trip_distance|duration_sec|features|
+-------------+------------+--------+
|         0.01|           2|  [0.01]|
|         0.01|           2|  [0.01]|
|         0.01|           2|  [0.01]|
|         0.01|           2|  [0.01]|
|         0.01|           2|  [0.01]|
|         0.01|           2|  [0.01]|
|         0.01|           2|  [0.01]|
|         0.01|           2|  [0.01]|
|         0.01|           2|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           3|  [0.01]|
|         0.01|           4|  [0.01]|
|         0.01|           4|  [0.01]|
+-------------+------------+--------+
only showing top 20 rows



In [67]:
from pyspark.ml.regression import LinearRegression

In [68]:
# Linear regression that predicts duration_sec from the assembled "features" vector,
# capped at 100 iterations.
lr = LinearRegression(
    featuresCol="features",
    labelCol="duration_sec",
    maxIter=100,
)

In [69]:
# Fit the linear regression on the assembled training rows.
model = lr.fit(vtrain_df)

In [70]:
from pyspark.ml.evaluation import RegressionEvaluator

In [71]:
# Assemble the test features, keep only the label and the feature vector, then add
# the model's predictions.
# Only the final scored DataFrame is cached. The earlier version also cached the
# intermediate select() result and then overwrote its only reference, so that cached
# data could never be unpersisted.
transformed_test_df = model.transform(
    vassembler.transform(test_df).select("duration_sec", "features")
).cache()

In [72]:
# Score the test-set predictions against the duration_sec label using RMSE.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="duration_sec")
rmse = evaluator.evaluate(transformed_test_df)
print("RMSE of Prediction on test set:", rmse)

RMSE of Prediction on test set: 3635.8085573768494


In [73]:
# Preview the label, the feature vector and the model's prediction side by side.
transformed_test_df.show()

+------------+--------+-----------------+
|duration_sec|features|       prediction|
+------------+--------+-----------------+
|           2|  [0.01]|577.0987407056483|
|           3|  [0.01]|577.0987407056483|
|           3|  [0.01]|577.0987407056483|
|           3|  [0.01]|577.0987407056483|
|           4|  [0.01]|577.0987407056483|
|           4|  [0.01]|577.0987407056483|
|           4|  [0.01]|577.0987407056483|
|           4|  [0.01]|577.0987407056483|
|           4|  [0.01]|577.0987407056483|
|           4|  [0.01]|577.0987407056483|
|           4|  [0.01]|577.0987407056483|
|           5|  [0.01]|577.0987407056483|
|           5|  [0.01]|577.0987407056483|
|           5|  [0.01]|577.0987407056483|
|           5|  [0.01]|577.0987407056483|
|           6|  [0.01]|577.0987407056483|
|           7|  [0.01]|577.0987407056483|
|           7|  [0.01]|577.0987407056483|
|           7|  [0.01]|577.0987407056483|
|           7|  [0.01]|577.0987407056483|
+------------+--------+-----------------+

In [74]:
from pyspark.sql.types import DoubleType

# Two hypothetical trip distances to score with the fitted model.
distance_list = [100., 1000.]
# Single-column DataFrame of doubles; toDF names the column trip_distance so that
# it matches the input column the assembler was configured with.
distances_df = spark.createDataFrame(distance_list, DoubleType()).toDF('trip_distance')


In [75]:
# Add the "features" vector column to the hypothetical distances.
vdistances_df = vassembler.transform(distances_df)

In [76]:
# Predict a duration for each hypothetical distance and display the result.
model.transform(vdistances_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|        100.0| [100.0]| 12432.59504771289|
|       1000.0|[1000.0]|119142.73282455574|
+-------------+--------+------------------+



In [80]:
# Convert the predicted duration for the 100.0-distance row (in seconds, copied from
# the prediction output above) into hours.
12432.59504771289 / 60 / 60

3.4534986243646917