In [1]:
from pyspark.sql import SparkSession

# Memory setting applied to both the executors and the driver.
MAX_MEMORY = "5g"

# Build the session with a parenthesised method chain; getOrCreate() reuses
# an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("taxi-fare-prediciton")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/05 14:02:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Local directory holding the prepared data set; an HDFS path could be used as well.
data_dir = "/home/ubuntu/working/spark-examples/data/ml_data_taxi"

# Read the train and test splits (in that order) from their parquet sub-directories.
train_df, test_df = (
    spark.read.parquet(f"{data_dir}/{split}/") for split in ("train", "test")
)

                                                                                

In [3]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Ordered list of pipeline stages; the following cells keep appending to it.
stages = []

# Categorical columns that will be one-hot encoded.
cat_features = ["pickup_location_id", "dropoff_location_id", "day_of_week"]

for feature in cat_features:
    index_col = f"{feature}_idx"

    # 1. Index the category values of the column. handleInvalid="keep" tells
    #    the indexer to keep rows with invalid values (e.g. nulls) instead of
    #    raising an error.
    indexer = StringIndexer(inputCol=feature, outputCol=index_col, handleInvalid="keep")

    # 2. One-hot encode the indexed column.
    encoder = OneHotEncoder(inputCols=[index_col], outputCols=[f"{feature}_onehot"])

    stages.extend([indexer, encoder])

stages

[StringIndexer_8de26289538f,
 OneHotEncoder_a2b8abb24f26,
 StringIndexer_f63fb8bc8813,
 OneHotEncoder_0cbde98fb496,
 StringIndexer_bd07ebed515b,
 OneHotEncoder_35d56cb78c6c]

In [4]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric columns that will be standardized.
num_features = ["passenger_count", "trip_distance", "pickup_time"]

# NOTE: this loop appends to `stages`; re-running the cell on its own adds the
# same kind of stages a second time, so reset `stages` first when re-executing.
for feature in num_features:
    vector_col = f"{feature}_vector"

    # Wrap each scalar column in a one-element vector, e.g. 1.5 -> [1.5],
    # which is the input the scaler below works on.
    to_vector = VectorAssembler(inputCols=[feature], outputCol=vector_col)

    # Standard-scale the vectorized column.
    scaler = StandardScaler(inputCol=vector_col, outputCol=f"{feature}_scaled")

    stages.extend([to_vector, scaler])

stages

[StringIndexer_8de26289538f,
 OneHotEncoder_a2b8abb24f26,
 StringIndexer_f63fb8bc8813,
 OneHotEncoder_0cbde98fb496,
 StringIndexer_bd07ebed515b,
 OneHotEncoder_35d56cb78c6c,
 VectorAssembler_60464bdd3272,
 StandardScaler_6632593e6ea4,
 VectorAssembler_b7a09a801998,
 StandardScaler_3c96c5bd2bce,
 VectorAssembler_e0010bfe3fa8,
 StandardScaler_842e7df723c5]

In [5]:
# Only the "_onehot" and "_scaled" columns are needed for the final feature vector.
onehot_cols = [f"{c}_onehot" for c in cat_features]
scaled_cols = [f"{n}_scaled" for n in num_features]
assembler_inputs = onehot_cols + scaled_cols

# Merge all encoded/scaled columns into the single "feature_vector" column.
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="feature_vector")
stages.append(assembler)

stages

[StringIndexer_8de26289538f,
 OneHotEncoder_a2b8abb24f26,
 StringIndexer_f63fb8bc8813,
 OneHotEncoder_0cbde98fb496,
 StringIndexer_bd07ebed515b,
 OneHotEncoder_35d56cb78c6c,
 VectorAssembler_60464bdd3272,
 StandardScaler_6632593e6ea4,
 VectorAssembler_b7a09a801998,
 StandardScaler_3c96c5bd2bce,
 VectorAssembler_e0010bfe3fa8,
 StandardScaler_842e7df723c5,
 VectorAssembler_5f623cce4331]

# 하이퍼 파라미터
- 모델이 데이터로부터 직접 학습하지 않는 파라미터
- 학습 전에 사람이 직접 모델에 지정해 주는 파라미터 (이 노트북에서는 `elasticNetParam`, `regParam`)
- 하이퍼 파라미터에 따라서 모델의 성능이 바뀔 수 있다.
    - 따라서 여러 값을 비교하여 최적의 파라미터를 찾아야 한다.

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

In [7]:
# Base estimator used for hyperparameter tuning: it reads the assembled
# "feature_vector" column and predicts "total_amount".
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=50,
)

# Build a new list (instead of appending in place) so that `stages` itself
# stays preprocessing-only.
cv_stages = [*stages, lr]

In [8]:
# Display the stage list used for tuning: the preprocessing stages followed by the estimator.
cv_stages

[StringIndexer_8de26289538f,
 OneHotEncoder_a2b8abb24f26,
 StringIndexer_f63fb8bc8813,
 OneHotEncoder_0cbde98fb496,
 StringIndexer_bd07ebed515b,
 OneHotEncoder_35d56cb78c6c,
 VectorAssembler_60464bdd3272,
 StandardScaler_6632593e6ea4,
 VectorAssembler_b7a09a801998,
 StandardScaler_3c96c5bd2bce,
 VectorAssembler_e0010bfe3fa8,
 StandardScaler_842e7df723c5,
 VectorAssembler_5f623cce4331,
 LinearRegression_ad26a366029e]

In [9]:
# Pipeline that chains every stage in `cv_stages`; used as the estimator for cross-validation.
cv_pipeline = Pipeline(stages=cv_stages)
cv_pipeline

Pipeline_79f2195c0218

# GridSearch 및 CrossValidation 설정

In [10]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Hyperparameter grid: 5 elasticNetParam values x 5 regParam values = 25 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])
    .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])
    .build()
)

# 5-fold cross-validation over the grid above.
cross_val = CrossValidator(
    # When a Pipeline is used as the estimator, its last stage must be the model.
    estimator=cv_pipeline,
    # Parameter combinations to evaluate (the grid search part).
    estimatorParamMaps=param_grid,
    # RMSE is the evaluator's default metric; spelled out so the selection
    # criterion is visible to the reader.
    evaluator=RegressionEvaluator(labelCol="total_amount", metricName="rmse"),
    numFolds=5,
    # Fixed seed so the fold assignment is reproducible between runs
    # (same seed value as the one used for sampling the training data).
    seed=1,
)

cross_val

CrossValidator_d78ac463937a

# 훈련

In [11]:
# Tuning on the full training set takes too long, so draw a small random
# sample (fraction 0.1, without replacement, fixed seed) to tune on.
toy_df = train_df.sample(withReplacement=False, fraction=0.1, seed=1)
toy_df.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [12]:
# Run the grid search with cross-validation on the sampled data.
# This is the expensive step of the notebook: the pipeline is fitted for every
# parameter combination on every fold.
cv_model = cross_val.fit(toy_df)
cv_model

23/08/05 14:03:46 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/08/05 14:03:46 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/08/05 14:03:49 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/08/05 14:03:49 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
[Stage 230:>                                                        (0 + 2) / 2]

# BestModel 찾기

In [None]:
# Fitted pipeline selected by cross-validation as the best-performing one.
best_model = cv_model.bestModel

# Best Parameter 찾기

In [None]:
# The last stage of the best pipeline is the fitted linear-regression model.
best_lr_model = cv_model.bestModel.stages[-1]

# Read the winning hyperparameters through the model's public getters rather
# than through the private `_java_obj` handle.
best_alpha = best_lr_model.getElasticNetParam()
best_reg_param = best_lr_model.getRegParam()

# 전체 데이터를 대상으로 훈련

In [None]:
# Fit the preprocessing stages on the full training data.
pipeline = Pipeline(stages=stages) # Preprocessing-only pipeline: `stages` does not contain the model stage
fitted_transformer = pipeline.fit(train_df)

In [None]:
# Apply the fitted preprocessing pipeline to the full training data.
vec_train_df = fitted_transformer.transform(train_df)

In [None]:
# Build the final estimator with the best hyperparameters found by cross-validation.
# It is bound to its own name instead of rebinding `lr`: `cv_pipeline` still holds
# the original `lr` instance, so rebinding that name would make a re-run of the
# grid-search cell build its parameter grid from a different estimator.
best_lr = LinearRegression(
    maxIter=50,
    solver='normal',
    labelCol='total_amount',
    featuresCol='feature_vector',
    elasticNetParam=best_alpha,
    regParam=best_reg_param
)

# Train on the preprocessed full training data.
model = best_lr.fit(vec_train_df)

# 튜닝된 모델 저장 및 불러오기

In [None]:
model_dir = "/home/ubuntu/working/spark-examples/taxi_pricing_model"

# Overwrite a previously saved model so that re-running the notebook does not
# fail because the target directory already exists.
# NOTE: only the regression model is saved here; the fitted preprocessing
# pipeline (`fitted_transformer`) is not persisted by this cell.
model.write().overwrite().save(model_dir)

In [None]:
# The saved artifact is a fitted model, so it has to be loaded with the model
# class (LinearRegressionModel), not with the LinearRegression estimator class.
from pyspark.ml.regression import LinearRegressionModel
loaded_model = LinearRegressionModel.load(model_dir)

In [None]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()