# Jonathan Halverson
# Friday, May 13, 2016
# Random forest on wine data

In [9]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

Load the data from a CSV file and form RDDs of the features and labels:

In [10]:
import numpy as np
str_lines = sc.textFile('/Users/jhalverson/data_science/machine_learning/wine.csv')
data_labels = str_lines.map(lambda line: int(line.split(',')[0]) - 1)
data_features = str_lines.map(lambda line: np.array([float(x) for x in line.split(',')[1:]]))
print 'Total records:', data_features.count()

Total records: 178


Form an RDD of LabeledPoints to be passed to the train method of the RF model (note that scaling or standardization is not needed for this method):

In [11]:
# Pair every label with its feature array. Both RDDs are element-wise maps of
# the same parent RDD (str_lines), so their elements line up one-to-one.
labels_and_features = data_labels.zip(data_features)

# Hand the NumPy feature array to LabeledPoint directly. Wrapping it in a list
# ([x[1]]) would build a nested one-row sequence instead of a flat feature vector.
data = labels_and_features.map(lambda pair: LabeledPoint(pair[0], pair[1]))

# Show the first two points as a sanity check.
data.take(2)

[LabeledPoint(0.0, [14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0]),
 LabeledPoint(0.0, [13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0])]

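Perform a random 80/20 train-test split. `randomSplit` does not stratify by class, so the class counts of the training set are checked below to confirm that the split is approximately stratified: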

In [12]:
# Random 80/20 split of the LabeledPoint RDD; the seed fixes the partitioning
# of records between the two subsets.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=1234)

# Request on-disk persistence for the training subset.
train_data.persist(StorageLevel.DISK_ONLY)

# Number of training records per class label, shown as (label, count) pairs.
train_label_counts = train_data.map(lambda point: point.label).countByValue()
train_label_counts.items()

[(0.0, 50), (1.0, 56), (2.0, 41)]

Fit the model to the training data:

In [13]:
# Train a 100-tree random forest classifier for the three wine classes.
# An empty categoricalFeaturesInfo means no feature is declared categorical.
# A fixed seed (the same value used for the train/test split) is passed so that
# the bootstrapping and feature-subset selection, and hence the fitted model,
# are reproducible from run to run.
model = RandomForest.trainClassifier(train_data, numClasses=3, categoricalFeaturesInfo={}, numTrees=100,
                                     featureSubsetStrategy='sqrt', impurity='gini', maxBins=32,
                                     seed=1234)

Form RDDs of the features and labels of the test data:

In [14]:
# Split the held-out LabeledPoints into their true labels and feature vectors.
test_data_labels = test_data.map(lambda point: point.label)
test_data_features = test_data.map(lambda point: point.features)

# Predict a class for every test feature vector; the result is an RDD of
# predictions produced from the features RDD.
predictions = model.predict(test_data_features)

Compute the accuracy of the predictions:

In [15]:
ct = 0
for true, pred in zip(test_data_labels.collect(), predictions.collect()):
    if true == pred: ct += 1
print float(ct) / test_data_labels.count()

1.0


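The accuracy is found to be 100 percent in this run. Note that the test set is small (178 total records minus 147 training records leaves 31 held-out records), so this estimate has high variance.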