# Tutorial 1: Configuración y carga de datos en PySpark

## Paso 1: Configuración del entorno de PySpark en Colab

In [1]:
#Bibliotecas para poder trabajar con Spark
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz
!tar xf spark-3.2.2-bin-hadoop3.2.tgz  
#Configuración de Spark con Python
!pip install -q findspark
!pip install pyspark

#Estableciendo variable de entorno
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.2-bin-hadoop3.2"

#Buscando e inicializando la instalación de Spark
import findspark
findspark.init()
findspark.find()

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
[33m0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/88.7 kB 16%] [Connec[0m                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:5 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [85.6 kB]
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:10 http://security.

'/content/spark-3.2.2-bin-hadoop3.2'

# Tutorial 6: Usando ML en PySpark

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one
# under the application name 'EjemploML'.
session_builder = SparkSession.builder.appName('EjemploML')
df_ml = session_builder.getOrCreate()
# NOTE: despite its name, df_ml holds the SparkSession, not a DataFrame.
df_ml

In [3]:
# Load the CSV into a DataFrame: the first row supplies the column names and
# Spark infers each column's type from the data.
csv_path = 'UserCarDataExample.csv'
training_dataset = df_ml.read.csv(
    csv_path,
    header=True,
    inferSchema=True,
)
training_dataset

DataFrame[age: int, selling_price: int, km_driven: int, mileage: double, engine: int, max_power: double, seats: int]

In [4]:
# Display the first rows of the DataFrame as a text table
training_dataset.show()

+---+-------------+---------+-------+------+---------+-----+
|age|selling_price|km_driven|mileage|engine|max_power|seats|
+---+-------------+---------+-------+------+---------+-----+
|  8|       450000|   145500|   23.4|  1248|     74.0|    5|
|  8|       370000|   120000|  21.14|  1498|   103.52|    5|
| 16|       158000|   140000|   17.7|  1497|     78.0|    5|
| 12|       225000|   127000|   23.0|  1396|     90.0|    5|
| 15|       130000|   120000|   16.1|  1298|     88.2|    5|
|  5|       440000|    45000|  20.14|  1197|    81.86|    5|
| 15|        96000|   175000|   17.3|  1061|     57.5|    5|
| 21|        45000|     5000|   16.1|   796|     37.0|    4|
| 11|       350000|    90000|  23.59|  1364|     67.1|    5|
|  9|       200000|   169000|   20.0|  1399|     68.1|    5|
|  8|       500000|    68000|  19.01|  1461|   108.45|    5|
| 17|        92000|   100000|   17.3|   993|     60.0|    5|
| 13|       280000|   140000|   19.3|  1248|     73.9|    5|
| 13|       180000|    9

In [5]:
# Print the DataFrame schema: each column's name, type and nullability
training_dataset.printSchema()

root
 |-- age: integer (nullable = true)
 |-- selling_price: integer (nullable = true)
 |-- km_driven: integer (nullable = true)
 |-- mileage: double (nullable = true)
 |-- engine: integer (nullable = true)
 |-- max_power: double (nullable = true)
 |-- seats: integer (nullable = true)



In [6]:
# List the column names
training_dataset.columns

['age',
 'selling_price',
 'km_driven',
 'mileage',
 'engine',
 'max_power',
 'seats']

Para trabajar con modelos de regresión tenemos que utilizar *VectorAssembler* para convertir las variables independientes en un vector que las incluya

In [7]:
from pyspark.ml.feature import VectorAssembler

# Every column except selling_price (the value to predict) is a model input.
feature_columns = ['age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

# Packs the input columns into a single vector column named "Independent Features".
featassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="Independent Features",
)
featassembler

VectorAssembler_5b3b1a0f82e0

Posteriormente se integran al conjunto de datos que ya estaba cargado utilizando la función *transform()*.

In [8]:
# Append the assembled feature vector as a new column; the original columns are kept.
result = featassembler.transform(training_dataset)
result.show()

+---+-------------+---------+-------+------+---------+-----+--------------------+
|age|selling_price|km_driven|mileage|engine|max_power|seats|Independent Features|
+---+-------------+---------+-------+------+---------+-----+--------------------+
|  8|       450000|   145500|   23.4|  1248|     74.0|    5|[8.0,145500.0,23....|
|  8|       370000|   120000|  21.14|  1498|   103.52|    5|[8.0,120000.0,21....|
| 16|       158000|   140000|   17.7|  1497|     78.0|    5|[16.0,140000.0,17...|
| 12|       225000|   127000|   23.0|  1396|     90.0|    5|[12.0,127000.0,23...|
| 15|       130000|   120000|   16.1|  1298|     88.2|    5|[15.0,120000.0,16...|
|  5|       440000|    45000|  20.14|  1197|    81.86|    5|[5.0,45000.0,20.1...|
| 15|        96000|   175000|   17.3|  1061|     57.5|    5|[15.0,175000.0,17...|
| 21|        45000|     5000|   16.1|   796|     37.0|    4|[21.0,5000.0,16.1...|
| 11|       350000|    90000|  23.59|  1364|     67.1|    5|[11.0,90000.0,23....|
|  9|       2000

Para construir nuestro modelo de regresión debemos seleccionar la columna que integró los valores de las columnas en un vector y la columna que representa la variable dependiente.

In [None]:
# Keep only the assembled feature vector and the target column.
# NOTE(review): the name here is spelled "Independent features" while the
# assembler's outputCol is "Independent Features"; this appears to rely on
# Spark's case-insensitive column resolution — confirm before changing either spelling.
final_data = result.select("Independent features", "selling_price")
final_data.show()

Del conjunto total de datos se puede generar una división entre datos de entrenamiento y de prueba con la función *randomSplit*.

In [10]:
# Hold out roughly 25% of the rows for testing. A fixed seed makes the split
# repeatable from one run to the next.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

Ahora hay que importar *LinearRegression* de la biblioteca de machine learning de PySpark, especificando cuáles son las variables independientes y cuál es la dependiente.

In [11]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator: the assembled vector column holds the inputs and
# selling_price is the value to predict.
price_regression = LinearRegression(
    labelCol='selling_price',
    featuresCol='Independent features',
)
# Fitting on the training split yields the trained model.
model = price_regression.fit(train_data)

Podemos imprimir la matriz de correlación para verificar la congruencia del modelo.

In [12]:
from pyspark.ml.stat import Correlation

# Correlation matrix of the feature vector column, computed over all rows of
# final_data. The result is read from the first row, first column of the
# DataFrame that Correlation.corr returns.
matrix = Correlation.corr(final_data, 'Independent features')
matrix_column = matrix.columns[0]
first_row = matrix.collect()[0]
# Convert the Spark matrix to a NumPy array for display.
cor_np = first_row[matrix_column].toArray()
cor_np



array([[ 1.        ,  0.42854848, -0.32854385, -0.0182631 , -0.2265978 ,
         0.00792303],
       [ 0.42854848,  1.        , -0.17298035,  0.20603073, -0.03815852,
         0.22725939],
       [-0.32854385, -0.17298035,  1.        , -0.57640787, -0.37462089,
        -0.45170047],
       [-0.0182631 ,  0.20603073, -0.57640787,  1.        ,  0.70397453,
         0.61110339],
       [-0.2265978 , -0.03815852, -0.37462089,  0.70397453,  1.        ,
         0.19199918],
       [ 0.00792303,  0.22725939, -0.45170047,  0.61110339,  0.19199918,
         1.        ]])

Obtener el valor del intercepto.

In [13]:
# Intercept (constant term) of the fitted linear regression.
model.intercept

-391341.2508098713

Los coeficientes por cada variable independiente

In [14]:
# One coefficient per entry of the feature vector, in the assembler's input order.
model.coefficients

DenseVector([-39812.2156, -1.3117, 11366.3369, 81.6412, 15732.9486, -60943.5004])

Así como los p-values para determinar la trascendencia de cada variable dentro del modelo.

In [15]:
# p-values for the coefficients. The output has one more entry than there are
# features; per the Spark API docs, the last one belongs to the intercept
# when the model is fitted with an intercept.
model.summary.pValues

[0.0,
 0.0,
 2.3694052520006892e-07,
 0.0017050365053692396,
 0.0,
 1.4776824208695416e-10,
 6.6494779240589e-06]

Obtener indicadores de desempeño como la $R^2$ ajustada (dado que es un problema multivariado), que indica el porcentaje de la variabilidad de la variable dependiente explicado por el modelo.

In [16]:
# Adjusted R^2 taken from the fitted model's summary.
model.summary.r2adj

0.6283657674003497

Podemos realizar predicciones para evaluar el modelo obtenido.

In [17]:
# Score the held-out test split; the resulting summary exposes a DataFrame
# with a prediction column next to each row's actual selling_price.
prediction_result = model.evaluate(test_data)
prediction_result.predictions.show()



+--------------------+-------------+------------------+
|Independent features|selling_price|        prediction|
+--------------------+-------------+------------------+
|[2.0,1500.0,21.21...|       730000| 848108.9959759903|
|[2.0,5000.0,17.0,...|       950000|1215817.2007647231|
|[2.0,5000.0,17.8,...|      1900000|1326085.8682198024|
|[2.0,5000.0,20.89...|       600000| 839880.8817124309|
|[2.0,5000.0,21.21...|       600000| 843518.1095263043|
|[2.0,5000.0,22.05...|       350000|277540.98947731784|
|[2.0,5500.0,26.8,...|      2125000|1508589.3065284884|
|[2.0,6000.0,23.95...|       390000| 625042.6026678474|
|[2.0,7400.0,24.3,...|       925000| 985066.5104603015|
|[2.0,10000.0,18.7...|       737000|1168376.1665771496|
|[2.0,15000.0,15.3...|       415000|  625490.294346991|
|[2.0,15000.0,24.3...|       810000|  975097.728455269|
|[2.0,20000.0,23.2...|       700000| 952554.6284031046|
|[2.0,20102.0,13.6...|      1560000|1686744.9841818397|
|[2.0,30000.0,22.0...|       250999|244748.94340

Mostrar algunos indicadores de desempeño útiles.

In [18]:
# Mean absolute error and mean squared error on the test split, shown as a tuple.
prediction_result.meanAbsoluteError, prediction_result.meanSquaredError

(290596.03928948863, 238375152290.37976)

# Tutorial 7: Utilizando MLlib en PySpark

MLlib es la biblioteca de aprendizaje automático (ML) de Spark. Su objetivo es hacer que el aprendizaje automático práctico sea escalable y fácil. A grandes rasgos, la biblioteca proporciona herramientas como:

- Algoritmos de ML: algoritmos de aprendizaje comunes como clasificación, regresión, agrupamiento y filtrado colaborativo.
- Características: extracción de características, transformación, reducción de dimensionalidad y selección.
- Pipelines: herramientas para construir, evaluar y ajustar ML Pipelines.
- Persistencia: guardar y cargar algoritmos, modelos y Pipelines.
- Utilidades: álgebra lineal, estadística, manejo de datos, etc.

In [None]:
from pyspark import SparkContext
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

# The RDD-based MLlib calls below take a SparkContext. getOrCreate() hands
# back the already-running context when there is one instead of starting another.
sc = SparkContext.getOrCreate()

In [None]:
from pyspark.mllib.util import MLUtils

# Sample dataset in LIBSVM format located inside the unpacked Spark
# distribution (the path is relative to the notebook's working directory).
libsvm_path = 'spark-3.2.2-bin-hadoop3.2/data/mllib/sample_libsvm_data.txt'
data = MLUtils.loadLibSVMFile(sc, libsvm_path)


In [None]:
# Split the data into training and test sets (30% held out for testing).
# The fixed seed keeps the split repeatable across runs.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Peek at the first 10 records (LabeledPoint objects with sparse feature vectors).
data.take(10)

[LabeledPoint(0.0, (692,[127,128,129,130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,253.0,159.0,50.0,48.0,238.0,252.0,252.0,252.0,237.0,54.0,227.0,253.0,252.0,239.0,233.0,252.0,57.0,6.0,10.0,60.0,224.0,252.0,253.0,252.0,202.0,84.0,252.0,253.0,122.0,163.0,252.0,252.0,252.0,253.0,252.0,252.0,96.0,189.0,253.0,167.0,51.0,238.0,253.0,253.0,190.0

In [None]:
# Number of distinct label values the classifier must tell apart.
numClasses = 2
# Mapping that describes categorical features; left empty here.
categoricalFeaturesInfo = {}
# Impurity measure used to choose splits.
impurity = "gini"

# NOTE: this rebinds `model`, replacing the linear regression model from Tutorial 6.
model = DecisionTree.trainClassifier(
    data=trainingData,
    numClasses=numClasses,
    categoricalFeaturesInfo=categoricalFeaturesInfo,
    impurity=impurity,
)

In [None]:
# Predict a label for every test point from its feature vector alone.
predictions = model.predict(testData.map(lambda point: point.features))
# Pair each true label with the corresponding prediction.
true_labels = testData.map(lambda point: point.label)
labelsAndPredictions = true_labels.zip(predictions)
# Test error = share of test points whose prediction differs from the label.
mismatch_count = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1]).count()
testErr = mismatch_count / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

Test Error = 0.03225806451612903
Learned classification tree model:
DecisionTreeModel classifier of depth 2 with 5 nodes
  If (feature 406 <= 126.5)
   If (feature 100 <= 193.5)
    Predict: 0.0
   Else (feature 100 > 193.5)
    Predict: 1.0
  Else (feature 406 > 126.5)
   Predict: 1.0



In [None]:
import shutil

MODEL_PATH = "myDecisionTreeClassificationModel.dt"

# save() fails when the target directory already exists, so clear any copy left
# by a previous run to keep this cell re-runnable.
# NOTE(review): assumes the path lives on the local filesystem, as in Colab — confirm
# before using with a remote filesystem such as HDFS.
shutil.rmtree(MODEL_PATH, ignore_errors=True)
model.save(sc, MODEL_PATH)
# Load the model back from disk to confirm the round trip works.
sameModel = DecisionTreeModel.load(sc, MODEL_PATH)