In [1]:
import findspark
findspark.init()

import import_ipynb
from utils import rename

import pyspark.sql.types as tp
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression

importing Jupyter notebook from utils.ipynb


In [2]:
sc = SparkContext()

url = 'https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv'
sc.addFile(url)
sqlContext = SQLContext(sc)

In [3]:
schema_setting = tp.StructType([
    tp.StructField(name='sepal.length', dataType=tp.DoubleType(), nullable=False),
    tp.StructField(name='sepal.width', dataType=tp.DoubleType(), nullable=False),
    tp.StructField(name='petal.length', dataType=tp.DoubleType(), nullable=True),
    tp.StructField(name='petal.width', dataType=tp.DoubleType(), nullable=False),
    tp.StructField(name='variety', dataType=tp.StringType(), nullable=False)
])

data = sqlContext.read.csv(
    SparkFiles.get("iris.csv"), 
    schema=schema_setting,
    header=True
)

In [4]:
data.show(3)

+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 3 rows



In [5]:
rename_data = rename(data, data.columns, ['x1', 'x2', 'x3', 'x4', 'y'])
rename_data.show(3)

+---+---+---+---+------+
| x1| x2| x3| x4|     y|
+---+---+---+---+------+
|5.1|3.5|1.4|0.2|Setosa|
|4.9|3.0|1.4|0.2|Setosa|
|4.7|3.2|1.3|0.2|Setosa|
+---+---+---+---+------+
only showing top 3 rows



In [6]:
split = rename_data.randomSplit([0.6, 0.4])

train_data = split[0]
test_data = split[1]

In [7]:
# string -> category
string_indexer = StringIndexer(inputCol='y', outputCol='label')

# each x value -> vector
vector_assembler = VectorAssembler(inputCols=['x1', 'x2', 'x3', 'x4'], outputCol='features')

# generage model
logistic_regression = LogisticRegression(featuresCol='features', labelCol='label')

# combine
pipeline = Pipeline(stages=[string_indexer, vector_assembler, logistic_regression])

# create model
model = pipeline.fit(train_data)

In [8]:
pred = model.transform(test_data)
pred.select('label', 'prediction').groupBy('label', 'prediction').count().sort('label').show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0|   17|
|  1.0|       1.0|   21|
|  2.0|       2.0|   23|
|  2.0|       1.0|    1|
+-----+----------+-----+

