In [1]:
import findspark
findspark.init()

import pyspark.sql.types as tp
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
sc = SparkContext()

url = 'https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv'
sc.addFile(url)
sqlContext = SQLContext(sc)


In [3]:
# 변수에 ".", "-" 이 들어가 있으면 골치 아픈일이 많이 발생한다.....맘편하게 처리를 해주자 
schema_setting = tp.StructType([
    tp.StructField(name='sepal_length', dataType=tp.DoubleType(), nullable=False),
    tp.StructField(name='sepal_width', dataType=tp.DoubleType(), nullable=False),
    tp.StructField(name='petal_length', dataType=tp.DoubleType(), nullable=True),
    tp.StructField(name='petal_width', dataType=tp.DoubleType(), nullable=False),
    tp.StructField(name='variety', dataType=tp.StringType(), nullable=False)
])

data = sqlContext.read.csv(
    SparkFiles.get("iris.csv"), 
    header=True,
    schema=schema_setting
)

In [4]:

# x, y 변수 구분 
column_types = data.dtypes

# string type 변수 확인 
string_columns = [column_tpye[0] for column_tpye in column_types if column_tpye[1] == 'string']

transform_data = data 
for column in string_columns:
    string_indexer = StringIndexer(inputCol=column, outputCol='label')
    transform_data = string_indexer.fit(transform_data).transform(transform_data)
    transform_data = transform_data.drop(column)

x = transform_data.columns
x.remove('label')

In [5]:

# 각각의 x를 하나의 벡터로 생성
vector_assembler = VectorAssembler(inputCols=x, outputCol='features')

# 모델 선언 
logistic_regression = LogisticRegression(featuresCol='features', labelCol='label')

# pipeline 생성
pipeline = Pipeline(stages=[vector_assembler, logistic_regression])

# 모델 생성
model = pipeline.fit(transform_data)

In [6]:
paramGrid = ParamGridBuilder() \
    .addGrid(logistic_regression.aggregationDepth,[2,5,10]) \
    .addGrid(logistic_regression.elasticNetParam,[0.0, 0.5, 1.0]) \
    .addGrid(logistic_regression.fitIntercept,[False, True]) \
    .addGrid(logistic_regression.regParam,[0.01, 0.5, 2.0]) \
.build()

# uses 2/3 of the data for training and 1/3 for testing.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3 # k 
                         )  

# cross validation을 적용하려면 y(target)값의 이름이 반드시 'label'이여야하는 안타까움이 있다.
cvModel = crossval.fit(transform_data)


In [19]:
cvModel.getParam

<bound method Params.getParam of CrossValidatorModel_34bdb7a2aff3>