In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .appName("Python Spark ml api tutorials") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

In [2]:
staticDataFrame = spark.read.format("csv")\
    .option("header","true")\
    .option("inferSchema", "true")\
    .load("../data/retail-data/by-day/*.csv")

In [3]:
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [6]:
from pyspark.sql.functions import date_format, col

preppedDataFrame = staticDataFrame\
    .na.fill(0)\
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))\
    .coalesce(5)

In [7]:
# train test 데이터셋 분리
# 아직 액션 실행 X , 지연연산, 실행 계획
trainDataFrame = preppedDataFrame\
    .where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame\
    .where("InvoiceDate >= 2011-07-01")

In [8]:
trainDataFrame.count()
testDataFrame.count()

0

- 아래의 예제는 요일을 수치형으로 반환
- 토요일을 6, 월요일은 1
- 이는 암묵적으로 토요일이 월요일보타 더 크다는 것을 의미하므로 잘못됨.

In [9]:
from pyspark.ml.feature import StringIndexer
# 트렌스포메이션 자동화

indexer = StringIndexer()\
    .setInputCol("day_of_week")\
    .setOutputCol("day_of_week_index")

In [19]:
indexer

StringIndexer_5d62d5d8bb25

In [10]:
from pyspark.ml.feature import OneHotEncoder
# 원핫인코더를 활용해서 특정 요일이 해당 요일인지 아닌지 boolean 타입으로 나타냄
encoder = OneHotEncoder()\
    .setInputCol("day_of_week_index")\
    .setOutputCol("day_of_week_encoded")

In [18]:
encoder

OneHotEncoder_912801ad0f95

In [11]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler()\
    .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
    .setOutputCol("features")

In [17]:
vectorAssembler

VectorAssembler_488784c77d5a

In [12]:
from pyspark.ml import Pipeline

transformationPipeline = Pipeline()\
    .setStages([indexer, encoder, vectorAssembler])

In [20]:
transformationPipeline

Pipeline_b291cb029787

In [13]:
# 학습 데이터셋에 변환자(transformer)를 fit시킴
# 학습을 위한 맞춤 파이프라인이 준비됨
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [16]:
fittedPipeline

PipelineModel_7b16b8e3233a

In [14]:
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [15]:
transformedTraining

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string, day_of_week_index: double, day_of_week_encoded: vector, features: vector]

In [21]:
# 동일한 트랜스포메이션을 계속 반복하지 않기 위해
# 모델에 일부 하이퍼파라미터 튜닝값을 적용
transformedTraining.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string, day_of_week_index: double, day_of_week_encoded: vector, features: vector]

In [23]:
# 모델 학습
# 모델 초기화 작업
from pyspark.ml.clustering import KMeans

kmeans = KMeans()\
    .setK(20)\
    .setSeed(1)

In [24]:
# 모델 학습
kmModel = kmeans.fit(transformedTraining)

In [25]:
kmModel.computeCost(transformedTraining)

transformedTest = fittedPipeline.transform(testDataFrame)

kmModel.computeCost(transformedTest)

AttributeError: 'KMeansModel' object has no attribute 'computeCost'