In [6]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# One memory setting reused for both the executor and the driver config keys.
MAX_MEMORY = "5g"

# Create the session, or reuse the one that is already running.
spark = (
    SparkSession.builder
    .appName("SparkML-test")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

22/03/11 14:06:17 WARN Utils: Your hostname, KRAFTONui-MacBookPro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.17 instead (on interface en0)
22/03/11 14:06:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/11 14:06:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/11 14:06:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Input locations: a glob over the trip CSV files and the taxi zone lookup CSV.
trip_files, zone_file = (
    "/Users/krafton/project/personal/spark-airflow-hands-on/data/trips/*",
    "/Users/krafton/project/personal/spark-airflow-hands-on/data/taxi+_zone_lookup.csv",
)

In [58]:
# trip_files is an absolute path (it starts with "/"), so prefixing it with
# "file://" gives a well-formed file:///... URI; the previous "file:///" prefix
# produced four consecutive slashes.
trips_df = spark.read.csv(f"file://{trip_files}", inferSchema=True, header=True)
# Make the raw trips queryable from Spark SQL under the name "trips".
trips_df.createOrReplaceTempView("trips")

                                                                                

In [59]:
# Build the modelling dataset from the "trips" view:
#   - keep total_amount (used later as the regression label) plus the
#     candidate feature columns, renaming the location id columns;
#   - derive the pickup hour and the full weekday name from
#     tpep_pickup_datetime;
#   - drop rows outside the sanity ranges: amount in (0, 5000), distance in
#     (0, 500), fewer than 4 passengers, pickup date in
#     [2021-01-01, 2021-08-01).
query = """
SELECT 
    passenger_count,
    PULocationID as pickup_location_id,
    DOLocationID as dropoff_location_id,
    trip_distance,
    HOUR(tpep_pickup_datetime) as pickup_time,
    DATE_FORMAT(TO_DATE(tpep_pickup_datetime), 'EEEE') AS day_of_week,
    total_amount
FROM
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 4
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""
# Run the query and register its result as the "data" temp view.
data_df = spark.sql(query)
data_df.createOrReplaceTempView("data")

In [60]:
# data_df and the "data" view are already created by the query cell above, so
# the identical spark.sql(query) / createOrReplaceTempView calls that used to
# be repeated here were redundant and have been dropped.
# Summary statistics (count, mean, stddev, min, max) for the filtered dataset.
data_df.describe().show()



+-------+------------------+------------------+-------------------+------------------+------------------+-----------+------------------+
|summary|   passenger_count|pickup_location_id|dropoff_location_id|     trip_distance|       pickup_time|day_of_week|      total_amount|
+-------+------------------+------------------+-------------------+------------------+------------------+-----------+------------------+
|  count|          13126040|          13126040|           13126040|          13126040|          13126040|   13126040|          13126040|
|   mean|1.2114008489993935|166.21161279411004| 163.70421993228726|2.8820930920520915|14.206110144415224|       null|17.973158757890285|
| stddev|0.5424025151958398| 65.92863093005907|  70.62329603504969| 3.820306480671185| 5.118095829304276|       null|12.975904680786682|
|    min|                 0|                 1|                  1|              0.01|                 0|     Friday|              0.01|
|    max|                 3|             

                                                                                

# Random Data Split for Train and Test

In [61]:
# 80% of the rows go to training and 20% to testing; the seed is fixed at 1.
split_weights = [0.8, 0.2]
train_df, test_df = data_df.randomSplit(split_weights, seed=1)

                                                                                

10500253




2625787


                                                                                

In [63]:
# Directory that receives the train/test parquet exports.
data_dir = '/Users/krafton/project/personal/spark-airflow-hands-on/data/model_data/'

# Overwrite any previous export: with the default save mode the write raises
# when the target directory already exists, which breaks re-running this cell.
train_df.write.mode("overwrite").format("parquet").save(f"{data_dir}/train/")
test_df.write.mode("overwrite").format("parquet").save(f"{data_dir}/test/")

In [66]:
# Rebind both splits to the parquet copies on disk, replacing the frames that
# were derived from the SQL query.
train_path = f"{data_dir}/train/"
test_path = f"{data_dir}/test/"
train_df = spark.read.parquet(train_path)
test_df = spark.read.parquet(test_path)

In [67]:
# Check the column names and types of the training split as read back from parquet.
train_df.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



# 1. Feature Engineering
- pre-processing
  - categorical attributes -> one-hot encoding
  - numerical attributes -> standard scaling

In [123]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Columns handled as categories: each one is indexed, then one-hot encoded.
cat_feats = [
    "pickup_location_id",
    "dropoff_location_id",
    "day_of_week"
]

# Ordered list of pipeline stages; later cells append to it.
stages = []

for cat_col in cat_feats:
    indexed_col = f"{cat_col}_idx"
    # handleInvalid="keep" so values not seen at fit time are indexed rather
    # than raising an error at transform time.
    indexer = StringIndexer(inputCol=cat_col, outputCol=indexed_col, handleInvalid="keep")
    encoder = OneHotEncoder(inputCols=[indexed_col], outputCols=[f"{cat_col}_onehot"])
    stages.extend([indexer, encoder])

In [124]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric columns that are scaled individually.
num_feats = [
    "passenger_count",
    "trip_distance",
    "pickup_time"
]

for num_col in num_feats:
    vector_col = f"{num_col}_vector"
    # Each numeric column is first wrapped into a one-element vector column,
    # which is then fed to its own StandardScaler.
    to_vector = VectorAssembler(inputCols=[num_col], outputCol=vector_col)
    scaler = StandardScaler(inputCol=vector_col, outputCol=f"{num_col}_scaled")
    stages.extend([to_vector, scaler])

In [125]:
# Concatenate every one-hot column and every scaled column, in that order,
# into the single "feature_vector" column consumed by the model.
onehot_cols = [f"{name}_onehot" for name in cat_feats]
scaled_cols = [f"{name}_scaled" for name in num_feats]
assembler_inputs = onehot_cols + scaled_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="feature_vector")
stages.append(assembler)
stages

[StringIndexer_076fc829c6ab,
 OneHotEncoder_37910b92b78a,
 StringIndexer_6226db406e91,
 OneHotEncoder_d4f43edcfa36,
 StringIndexer_c824080a8cc0,
 OneHotEncoder_32fcbb34a904,
 VectorAssembler_954291f2e685,
 StandardScaler_3b989430f56c,
 VectorAssembler_b9a359db1458,
 StandardScaler_71cc879e9f5f,
 VectorAssembler_a968cd8516f3,
 StandardScaler_34fa9f97d180,
 VectorAssembler_a1bb683d1332]

In [145]:
from pyspark.ml import Pipeline

# Preprocessing-only pipeline: learn the indexers and scalers on the training
# split so the same fitted transformations can be applied to other data later.
transform_stages = stages
pipeline = Pipeline().setStages(transform_stages)
fitted_transformer = pipeline.fit(train_df)

                                                                                

In [146]:
# Add the engineered columns (*_idx, *_onehot, *_vector, *_scaled, feature_vector)
# to the training split using the fitted preprocessing pipeline.
vtrain_df = fitted_transformer.transform(train_df)

In [147]:
# Inspect how each weekday name was indexed and one-hot encoded.
(
    vtrain_df
    .select("day_of_week", "day_of_week_idx", "day_of_week_onehot")
    .distinct()
    .show(n=10, truncate=False)
)



+-----------+---------------+------------------+
|day_of_week|day_of_week_idx|day_of_week_onehot|
+-----------+---------------+------------------+
|Saturday   |4.0            |(7,[4],[1.0])     |
|Friday     |0.0            |(7,[0],[1.0])     |
|Thursday   |1.0            |(7,[1],[1.0])     |
|Sunday     |6.0            |(7,[6],[1.0])     |
|Wednesday  |2.0            |(7,[2],[1.0])     |
|Tuesday    |3.0            |(7,[3],[1.0])     |
|Monday     |5.0            |(7,[5],[1.0])     |
+-----------+---------------+------------------+



                                                                                

In [133]:
# Inspect the raw, vectorised and scaled forms of passenger_count side by side.
(
    vtrain_df
    .select("passenger_count", "passenger_count_vector", "passenger_count_scaled")
    .distinct()
    .show(n=10, truncate=False)
)



+---------------+----------------------+----------------------+
|passenger_count|passenger_count_vector|passenger_count_scaled|
+---------------+----------------------+----------------------+
|2              |[2.0]                 |[3.6871269716505233]  |
|0              |[0.0]                 |[0.0]                 |
|1              |[1.0]                 |[1.8435634858252616]  |
|3              |[3.0]                 |[5.530690457475785]   |
+---------------+----------------------+----------------------+



                                                                                

In [150]:
# Peek at a few distinct assembled feature vectors, untruncated.
(
    vtrain_df
    .select("feature_vector")
    .distinct()
    .show(n=10, truncate=False)
)



+--------------------------------------------------------------------------------+
|feature_vector                                                                  |
+--------------------------------------------------------------------------------+
|(533,[47,264,528,531,532],[1.0,1.0,1.0,1.8335617216255875,3.7124963951492775])  |
|(533,[68,330,527,531,532],[1.0,1.0,1.0,0.9691683385735249,4.4940745836017575])  |
|(533,[44,296,528,531,532],[1.0,1.0,1.0,0.23574364992328983,3.321707300923038])  |
|(533,[24,302,527,531,532],[1.0,1.0,1.0,0.7596184275306005,2.930918206696798])   |
|(533,[24,271,524,531,532],[1.0,1.0,1.0,0.1571624332821932,2.1493400182443185])  |
|(533,[60,289,527,531,532],[1.0,1.0,1.0,0.7858121664109661,0.39078909422623975]) |
|(533,[11,273,527,531,532],[1.0,1.0,1.0,0.05238747776073108,0.39078909422623975])|
|(533,[11,285,523,531,532],[1.0,1.0,1.0,0.4190998220858486,2.3447345653574385])  |
|(533,[37,274,527,531,532],[1.0,1.0,1.0,0.6286497331287728,3.7124963951492775])  |
|(53

                                                                                

# 2. LinearRegression Modeling

In [155]:
from pyspark.ml.regression import LinearRegression

In [157]:
# Baseline estimator: no regularisation parameters are set here.
baseline_lr_params = {
    "maxIter": 50,
    "solver": "normal",
    "labelCol": "total_amount",
    "featuresCol": "feature_vector",
}
lr = LinearRegression(**baseline_lr_params)

In [158]:
# Fit the baseline linear regression on the transformed training split
# (label: total_amount, features: feature_vector).
model = lr.fit(vtrain_df)

22/03/11 15:18:33 WARN Instrumentation: [52518dd3] regParam is zero, which might cause numerical instability and overfitting.
22/03/11 15:18:46 WARN Instrumentation: [52518dd3] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/03/11 15:18:46 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/03/11 15:18:46 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [169]:
# R^2 from the fitted model's summary (the model was fitted on vtrain_df).
model.summary.r2

0.8084901250081301

In [170]:
# Root mean squared error from the same model summary, in total_amount units.
model.summary.rootMeanSquaredError

5.648520165266761

# 3. Test

In [163]:
# Apply the preprocessing that was fitted on the training split to the test
# split; nothing is re-fitted here.
vtest_df = fitted_transformer.transform(test_df)

In [165]:
# Score the transformed test split; this adds the "prediction" column.
prediction = model.transform(vtest_df)
# Mark the scored frame for caching so the inspection that follows can reuse it.
prediction.cache()

DataFrame[passenger_count: int, pickup_location_id: int, dropoff_location_id: int, trip_distance: double, pickup_time: int, day_of_week: string, total_amount: double, pickup_location_id_idx: double, pickup_location_id_onehot: vector, dropoff_location_id_idx: double, dropoff_location_id_onehot: vector, day_of_week_idx: double, day_of_week_onehot: vector, passenger_count_vector: vector, passenger_count_scaled: vector, trip_distance_vector: vector, trip_distance_scaled: vector, pickup_time_vector: vector, pickup_time_scaled: vector, feature_vector: vector, prediction: double]

In [167]:
# Compare actual fares with predicted fares for the first few test rows.
(
    prediction
    .select("trip_distance", "day_of_week", "total_amount", "prediction")
    .show(n=5)
)

+-------------+-----------+------------+------------------+
|trip_distance|day_of_week|total_amount|        prediction|
+-------------+-----------+------------+------------------+
|          1.6|   Thursday|        12.3|15.150622157886815|
|          3.3|   Saturday|       23.15|20.815012529756736|
|          4.1|  Wednesday|        16.3|  15.2287766581555|
|          0.4|   Thursday|         5.8|  7.50328020386138|
|         15.4|   Thursday|        65.3| 46.03403283496961|
+-------------+-----------+------------+------------------+
only showing top 5 rows



# 4. Hyperparameter Tuning

In [171]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

In [172]:
# Estimator whose regParam / elasticNetParam are searched by cross-validation.
lr = LinearRegression(
    featuresCol='feature_vector',
    labelCol='total_amount',
    solver="normal",
    maxIter=30,
)

# New list: the preprocessing stages followed by the estimator. The original
# `stages` list is left untouched.
cv_stages = [*stages, lr]

In [173]:
# Single pipeline made of the preprocessing stages followed by the
# LinearRegression estimator (see cv_stages).
cv_pipeline = Pipeline(stages=cv_stages)

In [175]:
# 5 x 5 grid: every elasticNetParam value is combined with every regParam value.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])
    .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])
    .build()
)

In [176]:
# 5-fold cross-validation over the whole pipeline (preprocessing + regression),
# scored with RegressionEvaluator on total_amount using its default metric.
# The seed is fixed so the fold assignment, and therefore the selected
# hyperparameters, is repeatable across kernel restarts; it matches the
# seed=1 already used for randomSplit and sample in this notebook.
cross_val = CrossValidator(
    estimator=cv_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator(labelCol="total_amount"),
    numFolds=5,
    seed=1,
)

In [177]:
# Tune on a 10% sample (without replacement, seed 1) of the training split
# rather than on the full training split.
toy_df = train_df.sample(withReplacement=False, fraction=0.1, seed=1)
cv_model = cross_val.fit(toy_df)

                                                                                

In [178]:
# The last stage of the best pipeline is the fitted LinearRegressionModel.
# Read the winning hyperparameters through its public param getters instead of
# reaching into the private `_java_obj` handle.
best_lr_model = cv_model.bestModel.stages[-1]
alpha = best_lr_model.getElasticNetParam()
reg_param = best_lr_model.getRegParam()
print(alpha)
print(reg_param)

0.1
0.03


# 5. 최종 모델
- hyperparameter 튜닝 후 적용
- 모델 저장하고 불러오기

In [179]:
# Final estimator: same setup as the baseline, plus the regularisation values
# selected by cross-validation (alpha, reg_param).
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=50,
    regParam=reg_param,
    elasticNetParam=alpha,
)

# Refit on the full transformed training split and report its R^2.
model = lr.fit(vtrain_df)
model.summary.r2

                                                                                

0.8084602992197699

In [181]:
# Directory where the fitted regression model is persisted.
model_dir = '/Users/krafton/project/personal/spark-airflow-hands-on/data/model/'
# Overwrite an existing export: a plain model.save(path) raises when the path
# already exists, which breaks re-running the notebook.
model.write().overwrite().save(model_dir)

In [182]:
from pyspark.ml.regression import LinearRegressionModel

# load() is a classmethod, so call it on the class directly instead of first
# constructing an empty LinearRegressionModel instance that is thrown away.
lr_model = LinearRegressionModel.load(model_dir)

In [183]:
# Display the reloaded model's repr as a quick check that loading worked.
lr_model

LinearRegressionModel: uid=LinearRegression_a0df54742b30, numFeatures=533