MLlib: RDD-based -- Otros ejemplos
===

* *30 min* | Última modificación: Noviembre 6, 2020

Inicialización de Spark
--

In [1]:
#
# Carga de las librerías de Spark
#
import findspark
from pyspark.sql import SparkSession

from pyspark import SparkConf, SparkContext

findspark.init()

APP_NAME = "spark-app"

conf = SparkConf().setAppName(APP_NAME)
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

Clustering
--

In [2]:
#
# Descarga
#
!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/kmeans_data.txt

--2020-11-07 16:35:55--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/kmeans_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.48.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.48.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 72 [text/plain]
Saving to: ‘kmeans_data.txt’


2020-11-07 16:35:56 (490 KB/s) - ‘kmeans_data.txt’ saved [72/72]



In [3]:
#
# Contenido del archivo
#
!head kmeans_data.txt

0.0 0.0 0.0
0.1 0.1 0.1
0.2 0.2 0.2
9.0 9.0 9.0
9.1 9.1 9.1
9.2 9.2 9.2


In [4]:
#
# Mueve el archivo de datos al hdfs
#
!hdfs dfs -copyFromLocal kmeans_data.txt /tmp/kmeans_data.txt

In [5]:
from math import sqrt

from numpy import array
from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data
data = sc.textFile("/tmp/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(" ")]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

clusters.save(sc, "/tmp/KMeansModel")
sameModel = KMeansModel.load(sc, "/tmp/KMeansModel")

Within Set Sum of Squared Error = 0.6928203230275529


Standard Scaler
--

In [6]:
#
# Descarga
#
!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt

--2020-11-07 16:36:20--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.48.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.48.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_libsvm_data.txt’


2020-11-07 16:36:21 (531 KB/s) - ‘sample_libsvm_data.txt’ saved [104736/104736]



In [7]:
#
# Contenido del archivo
#
!head sample_libsvm_data.txt

0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157:252 158:252 159:252 160:237 182:54 183:227 184:253 185:252 186:239 187:233 188:252 189:57 190:6 208:10 209:60 210:224 211:252 212:253 213:252 214:202 215:84 216:252 217:253 218:122 236:163 237:252 238:252 239:252 240:253 241:252 242:252 243:96 244:189 245:253 246:167 263:51 264:238 265:253 266:253 267:190 268:114 269:253 270:228 271:47 272:79 273:255 274:168 290:48 291:238 292:252 293:252 294:179 295:12 296:75 297:121 298:21 301:253 302:243 303:50 317:38 318:165 319:253 320:233 321:208 322:84 329:253 330:252 331:165 344:7 345:178 346:252 347:240 348:71 349:19 350:28 357:253 358:252 359:195 372:57 373:252 374:252 375:63 385:253 386:252 387:195 400:198 401:253 402:190 413:255 414:253 415:196 427:76 428:246 429:252 430:112 441:253 442:252 443:148 455:85 456:252 457:230 458:25 467:7 468:135 469:253 470:186 471:12 483:85 484:252 485:223 494:7 495:131 496:252 497:225 498:71 511:85 512:252 513:145 521:48 522:165 523:252 524:173 539:86

In [8]:
#
# Mueve el archivo de datos al hdfs
#
!hdfs dfs -copyFromLocal sample_libsvm_data.txt /tmp/sample_libsvm_data.txt

In [9]:
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, "/tmp/sample_libsvm_data.txt")
label = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

scaler1 = StandardScaler().fit(features)
scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

# data1 will be unit variance.
data1 = label.zip(scaler1.transform(features))

# data2 will be unit variance and zero mean.
data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))

data2.collect()[:2]

[(0.0,
  DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1357, -0.1287, -0.1584, -0.1689, -0.1934, -0.1068, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1338, -0.136, -0.2777, -0.4448, -0.5367, -0.6242, -0.1413, 0.8975, 1.6834, 1.0786, 0.2778, -0.2784, -0.1424, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1238, -0.1741, -0.2875, -0.436, -0.6436, -0.804, -0.5563, 0.9701, 1.0756, 1.0633, 1.2936, 1.5017, -0.4409, -0.2643, -0.1677, -0.1, -0.1, 0.0, 0.0, 0.0, 0.0, 0

Normalizer
--

In [10]:
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, "/tmp/sample_libsvm_data.txt")
labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

normalizer1 = Normalizer()
normalizer2 = Normalizer(p=float("inf"))

# Each sample in data1 will be normalized using $L^2$ norm.
data1 = labels.zip(normalizer1.transform(features))

# Each sample in data2 will be normalized using $L^\infty$ norm.
data2 = labels.zip(normalizer2.transform(features))

data2.collect()[:2]

[(0.0,
  SparseVector(692, {127: 0.2, 128: 0.6235, 129: 0.9922, 130: 0.6235, 131: 0.1961, 154: 0.1882, 155: 0.9333, 156: 0.9882, 157: 0.9882, 158: 0.9882, 159: 0.9294, 181: 0.2118, 182: 0.8902, 183: 0.9922, 184: 0.9882, 185: 0.9373, 186: 0.9137, 187: 0.9882, 188: 0.2235, 189: 0.0235, 207: 0.0392, 208: 0.2353, 209: 0.8784, 210: 0.9882, 211: 0.9922, 212: 0.9882, 213: 0.7922, 214: 0.3294, 215: 0.9882, 216: 0.9922, 217: 0.4784, 235: 0.6392, 236: 0.9882, 237: 0.9882, 238: 0.9882, 239: 0.9922, 240: 0.9882, 241: 0.9882, 242: 0.3765, 243: 0.7412, 244: 0.9922, 245: 0.6549, 262: 0.2, 263: 0.9333, 264: 0.9922, 265: 0.9922, 266: 0.7451, 267: 0.4471, 268: 0.9922, 269: 0.8941, 270: 0.1843, 271: 0.3098, 272: 1.0, 273: 0.6588, 289: 0.1882, 290: 0.9333, 291: 0.9882, 292: 0.9882, 293: 0.702, 294: 0.0471, 295: 0.2941, 296: 0.4745, 297: 0.0824, 300: 0.9922, 301: 0.9529, 302: 0.1961, 316: 0.149, 317: 0.6471, 318: 0.9922, 319: 0.9137, 320: 0.8157, 321: 0.3294, 328: 0.9922, 329: 0.9882, 330: 0.6471, 343: 0.0