In [None]:
from pyspark import SparkContext

In [None]:
# Reuse the SparkContext that is already running when this cell is executed
# again. Calling SparkContext() a second time in the same kernel raises an
# error because only one context may be active at a time; getOrCreate()
# returns the existing one or creates it on first use.
sc = SparkContext.getOrCreate()

# Machine Learning en Spark

## Algoritmos disponibles para:
### - Clasificación (Sup)
### - Regresión (Sup)
### - Collaborative Filtering (N.S.)
### - Clustering (N.S.)




----

## Spark MLlib
La implementación Spark MLlib (disponible desde Spark 0.8) provee funciones de ML para usarse en RDDs. MLlib, al igual que Spark Streaming y Spark SQL, es un componente integral de Spark.


## Spark ML
A partir de Spark 1.2, una librería adicional, Spark ML, se introduce para extender MLlib a DataFrames de Spark SQL. Las APIs, algoritmos y otras funciones están basadas en MLlib. Es la primera elección si trabajamos con Spark SQL.

Los conceptos e implementaciones son intercambiables entre ambas APIs.


----

## Clasificación

### Decision Trees

In [None]:
# Toy RDD holding the integers 1..10, used only to illustrate randomSplit.
data = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# The weights request an approximate 60/40 split. Passing a fixed seed makes
# the split (and therefore the collect() outputs shown in the next cells)
# reproducible after a kernel restart.
training, test = data.randomSplit([0.6, 0.4], seed=42)

In [None]:
# Bring the elements of the `training` split back to the driver so the cell
# output shows which values landed in it.
training.collect()

In [None]:
# Same inspection for the `test` split: collect its elements for display.
test.collect()

In [None]:
from pyspark.mllib.regression import LabeledPoint


# Golf Dataset: https://gerardnico.com/wiki/data_mining/weather
# Numeric codes for the categorical "outlook" feature (feature index 0).
outlook = {"sunny": 0.0, "overcast": 1.0, "rainy": 2.0}

# One tuple per observation: (label, outlook, temperature, humidity, windy).
# The label is 1.0 when the game was played and 0.0 otherwise.
golf_rows = [
    (0.0, "sunny", 85, 85, False),
    (0.0, "sunny", 80, 90, True),
    (1.0, "overcast", 83, 86, False),
    (1.0, "rainy", 70, 96, False),
    (1.0, "rainy", 68, 80, False),
    (0.0, "rainy", 65, 70, True),
    (1.0, "overcast", 64, 65, True),
    (0.0, "sunny", 72, 95, False),
    (1.0, "sunny", 69, 70, False),
    # This observation is a rainy day in the reference dataset
    # (5 sunny / 4 overcast / 5 rainy rows in total).
    (1.0, "rainy", 75, 80, False),
    (1.0, "sunny", 75, 70, True),
    (1.0, "overcast", 72, 90, True),
    (1.0, "overcast", 81, 75, False),
    (0.0, "rainy", 71, 91, True),
]

# Build the MLlib training examples. The windy flag is converted to 0.0/1.0
# explicitly so the feature vector is visibly all-numeric.
labeledpoints = [
    LabeledPoint(label, [outlook[sky], temperature, humidity, float(windy)])
    for label, sky, temperature, humidity, windy in golf_rows
]

# Distribute the examples as an RDD; the classifier cells train on `data`.
data = sc.parallelize(labeledpoints)



In [None]:
from pyspark.mllib.tree import DecisionTree

# Feature 0 is declared categorical with 3 distinct values; this mapping is
# passed to the trainer as categoricalFeaturesInfo.
categorical_features = {0: 3}

# Fit a two-class decision tree on the LabeledPoint RDD.
model = DecisionTree.trainClassifier(
    data=data,
    numClasses=2,
    categoricalFeaturesInfo=categorical_features,
)


In [None]:
# Print the text returned by the model's toDebugString() so the learned
# structure can be read in the cell output.
print(model.toDebugString())

In [None]:
# Display the node count reported by the model.
model.numNodes()

In [None]:
# Ask the model for a prediction on a single four-value feature vector laid
# out like the training rows; 1.0 in the first position is the code the
# `outlook` mapping assigns to overcast.
model.predict([ 1.0,85,85, True])

### Naive Bayes

In [None]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

# Train a Naive Bayes classifier on the same LabeledPoint RDD, passing 1.0 as
# the trainer's lambda_ argument. Note that this rebinds `model`.
model = NaiveBayes.train(data=data, lambda_=1.0)

# Classify one feature vector; being the last expression of the cell, the
# predicted label becomes the cell output.
sample_features = [1.0, 85, 85, True]
model.predict(sample_features)



## Collaborative Filtering (AKA Matrix Factorization)

![](https://upload.wikimedia.org/wikipedia/commons/5/52/Collaborative_filtering.gif)

In [None]:
!wget https://s3.amazonaws.com/sty-spark/movielens/movielens.dat

In [None]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating 

In [None]:
# Read the downloaded ratings file as an RDD of text lines.
data = sc.textFile("movielens.dat")

# Split every line on the tab character. The separator has to be exactly
# '\t': with any other string the line is never split, x[0] is the whole
# line and the int() conversion below fails.
ratings = data.map(lambda x: x.split('\t'))

# Keep the first three fields as a Rating: two integer ids and a float score
# (presumably user id, product id and rating value - verify against the file).
ratings = ratings.map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))



In [None]:
# Peek at the first two raw lines read from movielens.dat.
data.take(2)

In [None]:
# Count the parsed Rating records.
ratings.count()

In [None]:
# Show the Python type of `ratings`.
type(ratings)

In [None]:
# Train the ALS matrix-factorization model. The evaluation cell that follows
# calls model.predictAll(), which only exists on the model returned here; if
# this step is skipped, `model` still refers to the classifier trained
# earlier and that call fails.
rank = 10           # second positional argument of ALS.train
numIterations = 10  # third positional argument of ALS.train

# A fixed seed keeps the factorization reproducible between runs.
model = ALS.train(ratings, rank, numIterations, seed=42)


In [None]:
# Project every rating to its first two fields; these pairs are what the
# model is asked to score.
testdata = ratings.map(lambda rec: (rec[0], rec[1]))

# Key the predicted ratings by their first two fields so they can be joined
# with the observed ones.
predicted = model.predictAll(testdata)
predictions = predicted.map(lambda rec: ((rec[0], rec[1]), rec[2]))

# Key the observed ratings the same way and join; each joined value is the
# pair (observed, predicted).
observed = ratings.map(lambda rec: ((rec[0], rec[1]), rec[2]))
ratesAndPreds = observed.join(predictions)

# Mean of the squared differences between observed and predicted values.
squared_errors = ratesAndPreds.map(lambda kv: (kv[1][0] - kv[1][1]) ** 2)
MSE = squared_errors.mean()

print(" Mean Squared Error = " + str(MSE))



In [None]:
# Show the Python type of `testdata`.
type(testdata)

In [None]:
from pyspark.mllib.clustering import KMeans, KMeansModel 

from numpy import array 

from math import sqrt
import os

# Locate the sample file inside the Spark installation. SPARK_HOME is used
# when it is set; otherwise the original /usr/local/spark location is kept,
# so the notebook behaves as before on that layout.
SPARK_HOME = os.environ.get("SPARK_HOME", "/usr/local/spark")
data = sc.textFile(os.path.join(SPARK_HOME, "data/mllib/kmeans_data.txt"))



In [None]:
# Parse every text line into a NumPy vector of floats; fields are separated
# by single spaces.
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data) with k = 2.
# - initializationMode has to be exactly "random" or "k-means||"; a value
#   with stray whitespace is rejected by Spark.
# - `runs` is not passed: it has no effect since Spark 2.0 and newer PySpark
#   releases no longer accept it.
# - a fixed seed keeps the random initialization reproducible.
clusters = KMeans.train(parsedData, 2, maxIterations=10,
                        initializationMode="random", seed=42)



In [None]:
# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    """Return the squared Euclidean distance from `point` to its cluster center.

    The center is looked up with `clusters.predict(point)`. No square root is
    taken: summing plain distances would not give a sum of *squared* errors,
    which is what the reported metric is named after.
    """
    center = clusters.centers[clusters.predict(point)]
    diff = point - center
    return float(diff.dot(diff))


# Add up the per-point squared distances over the whole data set.
WSSSE = parsedData.map(error).reduce(lambda x, y: x + y)

print(" Within Set Sum of Squared Error = " + str(WSSSE))


In [None]:
import shutil

MODEL_PATH = './my_clusters_model.spark_model'

# Saving into a directory that already exists fails, which would break every
# run of the notebook after the first one. Remove the output of a previous
# run first; ignore_errors makes this a no-op when nothing is there.
# NOTE(review): this only clears a local-filesystem path - confirm the
# notebook is not writing to HDFS or another remote default filesystem.
shutil.rmtree(MODEL_PATH, ignore_errors=True)
clusters.save(path=MODEL_PATH, sc=sc)

In [None]:
# Load the k-means model back from the directory it was saved to, using the
# same SparkContext, and bind it to a new name.

cluster_izm = KMeansModel.load(sc, path='./my_clusters_model.spark_model')

In [None]:
# Show the Python type of the object returned by KMeansModel.load.
type(cluster_izm)