In [1]:
import findspark
findspark.init("/opt/cloudera/parcels/SPARK2-2.2.0.cloudera1-1.cdh5.12.0.p0.142354/lib/spark2/")

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('random_forest').getOrCreate()
import pandas



In [8]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pandas.tools.plotting import scatter_matrix
%matplotlib inline 
import matplotlib.pyplot as plt, numpy as np


In [9]:

# En esta sección se carga el dataset (dataframe) de datos
# este codigo puede ser cambiado por el spark.sql si tuviesemos la tabla cargada en Hive o Impala

trainning = spark.read.csv("/user/epinedac/datasets/iris", header=True, nullValue="?", inferSchema=True)
trainning.show()

+------------+-----------+------------+-----------+-------+-----+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|label|
+------------+-----------+------------+-----------+-------+-----+
|         5.1|        3.5|         1.4|        0.2| setosa|    1|
|         4.9|        3.0|         1.4|        0.2| setosa|    1|
|         4.7|        3.2|         1.3|        0.2| setosa|    1|
|         4.6|        3.1|         1.5|        0.2| setosa|    1|
|         5.0|        3.6|         1.4|        0.2| setosa|    1|
|         5.4|        3.9|         1.7|        0.4| setosa|    1|
|         4.6|        3.4|         1.4|        0.3| setosa|    1|
|         5.0|        3.4|         1.5|        0.2| setosa|    1|
|         4.4|        2.9|         1.4|        0.2| setosa|    1|
|         4.9|        3.1|         1.5|        0.1| setosa|    1|
|         5.4|        3.7|         1.5|        0.2| setosa|    1|
|         4.8|        3.4|         1.6|        0.2| setosa|    1|
|         

In [10]:
training_data, test_data = trainning.randomSplit(weights=[0.7, 0.3], seed=12345)

In [11]:
assembler = VectorAssembler(inputCols=["Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width"], outputCol="features")
assem_data = assembler.transform(training_data)

assembler_test = VectorAssembler(inputCols=["Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width"], outputCol="features")
assem_data_test = assembler_test.transform(test_data)

In [12]:

train_scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
train_scaler_model = train_scaler.fit(assem_data)
scaled_data_train = train_scaler_model.transform(assem_data)

test_scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
test_scaler_model = test_scaler.fit(assem_data_test)
scaled_data_test = test_scaler_model.transform(assem_data_test)



In [13]:
rf = RandomForestClassifier(labelCol="label", featuresCol="scaled_features")
rfModel = rf.fit(scaled_data_train)

In [14]:
# Score test Data
predictions = rfModel.transform(scaled_data_test)
predictions.show()

+------------+-----------+------------+-----------+----------+-----+-----------------+--------------------+--------------------+--------------------+----------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|   Species|label|         features|     scaled_features|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+----------+-----+-----------------+--------------------+--------------------+--------------------+----------+
|         4.5|        2.3|         1.3|        0.3|    setosa|    1|[4.5,2.3,1.3,0.3]|[-1.6550481964773...|  [0.0,19.0,1.0,0.0]| [0.0,0.95,0.05,0.0]|       1.0|
|         4.6|        3.1|         1.5|        0.2|    setosa|    1|[4.6,3.1,1.5,0.2]|[-1.5356676708297...|  [0.0,20.0,0.0,0.0]|   [0.0,1.0,0.0,0.0]|       1.0|
|         4.6|        3.4|         1.4|        0.3|    setosa|    1|[4.6,3.4,1.4,0.3]|[-1.5356676708297...|  [0.0,20.0,0.0,0.0]|   [0.0,1.0,0.0,0.0]|       1.0|
|         4.6|        3.6|        

In [16]:
#codigo para obtener la matriz de confusión
predictions.createOrReplaceTempView("rf_modelpd_2")

In [17]:
spark.sql("""
            select label as real, prediction, count(1) as cantidad
              from rf_modelpd_2
              group by label, prediction
              order by 1, 2 asc
          """).show()

+----+----------+--------+
|real|prediction|cantidad|
+----+----------+--------+
|   1|       1.0|      12|
|   2|       2.0|      12|
|   3|       2.0|       3|
|   3|       3.0|      17|
+----+----------+--------+



In [18]:
spark.stop()