# Intro ML SUP en BD

Creación de la sesión Spark:

In [1]:
#import SparkSession
import findspark
findspark.init('C:/Spark/spark-2.4.4-bin-hadoop2.7')
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf 
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [2]:
# Crear el spark session object, llamarle "supervised_ml"

supervised_ml=SparkSession.builder.appName('App_supervised_ml').getOrCreate()


## Regression 

Carga de datos, archivo *Linear_regression_dataset.csv*:

In [3]:
# Carga de datos
df=supervised_ml.read.csv("Linear_regression_dataset.csv",header=True, inferSchema=True)


Se invocan las librerias correcpondientes a **LinearRegression**, asi como las de OneHotEncoder, StringIndexer, VectorAssembler:

In [4]:
# Importacion de libs y operaciones
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler 



Se visualizan algunos datos:

In [5]:
print((df.count(), len(df.columns))) 

(1232, 6)


Se muestran los primeros 10 datos:

In [6]:
# primeros 10 datos

df.show(10)


+-----+-----+-----+-----+-----+-----+
|var_1|var_2|var_3|var_4|var_5|label|
+-----+-----+-----+-----+-----+-----+
|  734|  688|   81|0.328|0.259|0.418|
|  700|  600|   94| 0.32|0.247|0.389|
|  712|  705|   93|0.311|0.247|0.417|
|  734|  806|   69|0.315| 0.26|0.415|
|  613|  759|   61|0.302| 0.24|0.378|
|  748|  676|   85|0.318|0.255|0.422|
|  669|  588|   97|0.315|0.251|0.411|
|  667|  845|   68|0.324|0.251|0.381|
|  758|  890|   64| 0.33|0.274|0.436|
|  726|  670|   88|0.335|0.268|0.422|
+-----+-----+-----+-----+-----+-----+
only showing top 10 rows



## Feature Engineering

Creamos un solo vector con todos los features i.e 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', a este le llamaremos "features" y como salida colocamos a 'label':

In [7]:
# Vector Ensamblador


#vectorAssembler = VectorAssembler(inputCols = ['var_1', 'var_2', 'var_3', 'var_4', 'var_5'], outputCol = 'features')
#vhouse_df = vectorAssembler.transform(df)
#vhouse_df = vhouse_df.select(['features', 'label'])
#vhouse_df.show(3)

vectorAssembler = VectorAssembler(inputCols = ['var_1', 'var_2', 'var_3', 'var_4', 'var_5'], outputCol = 'features')
vdf = vectorAssembler.transform(df)
vdf = vdf.select(['features', 'label'])
vdf.show(10)



+--------------------+-----+
|            features|label|
+--------------------+-----+
|[734.0,688.0,81.0...|0.418|
|[700.0,600.0,94.0...|0.389|
|[712.0,705.0,93.0...|0.417|
|[734.0,806.0,69.0...|0.415|
|[613.0,759.0,61.0...|0.378|
|[748.0,676.0,85.0...|0.422|
|[669.0,588.0,97.0...|0.411|
|[667.0,845.0,68.0...|0.381|
|[758.0,890.0,64.0...|0.436|
|[726.0,670.0,88.0...|0.422|
+--------------------+-----+
only showing top 10 rows



In [8]:
# visulizacion de vector ensamblado compuesto por features y label

vdf.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[734.0,688.0,81.0...|0.418|
|[700.0,600.0,94.0...|0.389|
|[712.0,705.0,93.0...|0.417|
|[734.0,806.0,69.0...|0.415|
|[613.0,759.0,61.0...|0.378|
|[748.0,676.0,85.0...|0.422|
|[669.0,588.0,97.0...|0.411|
|[667.0,845.0,68.0...|0.381|
|[758.0,890.0,64.0...|0.436|
|[726.0,670.0,88.0...|0.422|
|[583.0,794.0,55.0...|0.371|
|[676.0,746.0,72.0...|  0.4|
|[767.0,699.0,89.0...|0.433|
|[637.0,597.0,86.0...|0.374|
|[609.0,724.0,69.0...|0.382|
|[776.0,733.0,83.0...|0.437|
|[701.0,832.0,66.0...| 0.39|
|[650.0,709.0,74.0...|0.386|
|[804.0,668.0,95.0...|0.453|
|[713.0,614.0,94.0...|0.404|
+--------------------+-----+
only showing top 20 rows



Partimos a continuación el set de datos en 75% training y 25% testing:

In [9]:
# Particion del data set

splits = vdf.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]


print(f"Size of train Dataset : {train_df.count()}" )
print(f"Size of test Dataset : {test_df.count()}" )

Size of train Dataset : 873
Size of test Dataset : 359


Creamos el Regresor Lineal: 

In [10]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features', labelCol='label', maxIter=10, regParam=0.3, elasticNetParam=0.8)




Entrenamos el modelo de regresión lineal:

In [11]:
# Fit the model, le llamamos lr_model

lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))


Coefficients: [0.0,0.0,0.0,0.0,0.0]
Intercept: 0.39681557846506255


Creamos el dataframe de prediciones (*predictions_df*) a partir del modelo de entrenamiento y el conjunto de datos test: 

In [12]:
predictions_df = lr_model.transform(test_df)

Visualizamos el contenido de *predictions_df*:

In [13]:
# visulizacion de predictions_df

predictions_df.show()



+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[468.0,746.0,52.0...|0.329|0.39681557846506255|
|[498.0,615.0,67.0...|0.318|0.39681557846506255|
|[511.0,576.0,76.0...|0.329|0.39681557846506255|
|[522.0,621.0,72.0...|0.317|0.39681557846506255|
|[533.0,660.0,62.0...| 0.33|0.39681557846506255|
|[541.0,830.0,60.0...| 0.33|0.39681557846506255|
|[552.0,683.0,71.0...|0.335|0.39681557846506255|
|[556.0,675.0,67.0...|0.348|0.39681557846506255|
|[559.0,613.0,75.0...|0.359|0.39681557846506255|
|[569.0,544.0,82.0...|0.343|0.39681557846506255|
|[569.0,776.0,53.0...|0.348|0.39681557846506255|
|[570.0,662.0,73.0...|0.337|0.39681557846506255|
|[573.0,717.0,62.0...|0.345|0.39681557846506255|
|[575.0,864.0,55.0...|0.379|0.39681557846506255|
|[578.0,733.0,62.0...|0.348|0.39681557846506255|
|[579.0,655.0,71.0...|0.357|0.39681557846506255|
|[581.0,724.0,64.0...|0.346|0.39681557846506255|
|[585.0,755.0,59.0..

Ahora, evaluamos el modelo de Regresión Lineal, con los datos de TEST:

In [14]:
# evaluacion del modelo, le llamaremos model_predictions
from pyspark.ml.evaluation import RegressionEvaluator

model_predictions = lr_model.transform(test_df)

model_predictions.select("prediction","label","features").show(5)

lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")


+-------------------+-----+--------------------+
|         prediction|label|            features|
+-------------------+-----+--------------------+
|0.39681557846506255|0.329|[468.0,746.0,52.0...|
|0.39681557846506255|0.318|[498.0,615.0,67.0...|
|0.39681557846506255|0.329|[511.0,576.0,76.0...|
|0.39681557846506255|0.317|[522.0,621.0,72.0...|
|0.39681557846506255| 0.33|[533.0,660.0,62.0...|
+-------------------+-----+--------------------+
only showing top 5 rows



Imprimimos el valor de R2:

In [15]:
# valor de R2

print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(model_predictions))


R Squared (R2) on test data = -0.00313915


Imprimimos el valor del meanSquaredError:

In [16]:
# valor del meanSquaredError

test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)


Root Mean Squared Error (RMSE) on test data = 0.032277


## Regresión con Árboles de Decisión

Importamos la librería *DecisionTreeRegressor*: 

In [17]:
# import lib

from pyspark.ml.regression import DecisionTreeRegressor


Creamos el Regresor DT, le llamaremos *dec_tree*:

In [18]:
# dec_tree

dec_tree = DecisionTreeRegressor(featuresCol ='features', labelCol = 'label')

Entrenamos el modelo:

In [19]:
# Train model, le llamaremos dec_tree_model

dec_tree_model = dec_tree.fit(train_df)
dt_predictions = dec_tree_model.transform(test_df)


Cuánto es la profundidad máxima por defecto, de este algoritmo?

R/ 

Desplegamos las *featureImportances*:

In [20]:
dec_tree_model.featureImportances

SparseVector(5, {0: 0.9703, 1: 0.0099, 2: 0.0031, 3: 0.006, 4: 0.0108})

Evaluamos el modelo con los datos de entrenamiento:

In [21]:
# Make predictions, le llamaremos model_predictions 

model_predictions = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = model_predictions.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 0.0136859


In [23]:
# visualizamos
model_predictions

RegressionEvaluator_4e9495743252

Importamos el **RegressionEvaluator**

In [24]:
# import Evaluator

RegressionEvaluator()


RegressionEvaluator_4151334edaa9

Usando *RegressionEvaluator* calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [26]:
# R2 value of the model on test data 
dt_evaluator = RegressionEvaluator(metricName='r2')
dt_r2 = dt_evaluator.evaluate(model_predictions)
print(f'The r-square value of DecisionTreeRegressor is {dt_r2}')

# RMSE value of the model on test data 




AttributeError: 'RegressionEvaluator' object has no attribute '_jdf'

## RandomForestRegressor

Importamos a *RandomForestRegressor*

In [25]:
# import lib

from pyspark.ml.regression import RandomForestRegressor




Creamos el Regresor RF:

In [None]:
# Regresor 
rf = RandomForestRegressor(labelCol="label", featuresCol="features")



Entrenamos el modelo:

In [None]:
# Train model, le llamaremos rf_model



Desplegamos las *featureImportances*:

In [None]:
# importances 




Desplegamos el numero de arboles (Num of Trees)

In [None]:
# Numero de Trees




Evaluamos el modelo con los datos de entrenamiento, le llamaremos model_predictions:


In [None]:
# model_predictions




Desplegamos los valores del *model_predictions*

In [None]:
model_predictions.show()

Usando *RegressionEvaluator* calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [None]:
# R2 value of the model on test data 





# RMSE value of the model on test data 





## Gradient-Boosted Tree Regressor

Importamos a GBTRegressor


In [48]:
# import

from pyspark.ml.regression import GBTRegressor


Creamos el Regresor GBTR, le llamaremos gbt:


In [49]:
# regresor

gbt = GBTRegressor(featuresCol = 'features', labelCol = 'label', maxIter=10)



Entrenamos el modelo:

In [50]:
# Train model, le llamaremos gbt_model

gbt_model = gbt.fit(train_df)


Desplegamos las featureImportances:

In [51]:
#Importances

gbt_model.featureImportances

SparseVector(5, {0: 0.289, 1: 0.178, 2: 0.2122, 3: 0.1366, 4: 0.1842})

Evaluamos el modelo con los datos de entrenamiento, le llamaremos model_predictions:

In [52]:
# Model

model_predictions = gbt_model.transform(test_df)
model_predictions.select('prediction', 'label', 'features').show(5)





+-------------------+-----+--------------------+
|         prediction|label|            features|
+-------------------+-----+--------------------+
| 0.3229654363045081|0.319|[470.0,509.0,76.0...|
|0.32130134823534245|0.318|[498.0,615.0,67.0...|
| 0.3237376066332866|0.325|[498.0,672.0,61.0...|
|0.33176672221789794|0.317|[510.0,588.0,72.0...|
| 0.3237376066332866|0.339|[513.0,698.0,61.0...|
+-------------------+-----+--------------------+
only showing top 5 rows



Desplegamos los valores del *model_predictions*

In [53]:
# show 

model_predictions.show()


+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[470.0,509.0,76.0...|0.319| 0.3229654363045081|
|[498.0,615.0,67.0...|0.318|0.32130134823534245|
|[498.0,672.0,61.0...|0.325| 0.3237376066332866|
|[510.0,588.0,72.0...|0.317|0.33176672221789794|
|[513.0,698.0,61.0...|0.339| 0.3237376066332866|
|[527.0,569.0,75.0...|0.341| 0.3323506642270911|
|[528.0,652.0,71.0...|0.319|0.32996615015463343|
|[531.0,491.0,89.0...| 0.32|0.32658471665728805|
|[533.0,660.0,62.0...| 0.33| 0.3241382055119172|
|[534.0,609.0,69.0...|0.329|0.33176672221789794|
|[536.0,531.0,83.0...|0.318| 0.3237840499906214|
|[541.0,830.0,60.0...| 0.33|0.32471341450908076|
|[543.0,615.0,76.0...|0.333|  0.333599944561744|
|[543.0,747.0,60.0...|0.342| 0.3250048849062326|
|[550.0,631.0,76.0...|0.318| 0.3317993724984795|
|[550.0,789.0,54.0...|0.359| 0.3253986302674275|
|[552.0,683.0,71.0...|0.335|0.35170174482997885|
|[556.0,674.0,62.0..

Usando RegressionEvaluator calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [55]:
 #Select (prediction, true label) and compute test error
# R2 value of the model on test data 

 gbt_evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")


# RMSE value of the model on test data 

rmse = gbt_evaluator.evaluate(model_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)



Root Mean Squared Error (RMSE) on test data = 0.0140442


 ## Exploracion de datos...

Usaremos el dataset https://archive.ics.uci.edu/ml/datasets/Bank+Marketing 

Indique a grandes razgos de que se trata este dataset:

El dataet contiene datos relacionados con campañas publicitarias realizada vía telefónica, para una institución bancaria portuguesa. El objetivo principal de la clasificación es predecir si el cliente se suscribe o no a depósitos a plazo (variable target_class).

En el link dice que contiene  17 atributos, sin embargo al ver el dataset tiene 20, algunos de ellos son: edad, tipo de trabajo, estado civil, nivel de educación, tiene crédito en incumplimiento, tiene préstamo de vivienda, tiene préstamo personal, tipo de contacto, último mes que se le contacto, día de la semana del último contacto, duración en segundos del último contacto y otros.

Carga de datos, archivo bank_data.csv:


In [3]:
# Load csv Dataset 
spark=SparkSession.builder.appName('App_llamadasBancarias').getOrCreate()

df=spark.read.csv('bank_data.csv',inferSchema=True,header=True)

Determine la cantidad de datos en el dataset:

In [4]:
#number of records


print("Cantidad de registros: ", df.count(), " y la cantidad de columnas: ", len(df.columns)) 


Cantidad de registros:  41188  y la cantidad de columnas:  21


A que dato corresponde cada columna?

In [18]:
# columns values 
#en esta no estoy clara que desea que ejecutemos, así que se incluye: describe, dtypes y take

df.describe()


DataFrame[summary: string, age: string, job: string, marital: string, education: string, default: string, housing: string, loan: string, contact: string, month: string, day_of_week: string, duration: string, campaign: string, pdays: string, previous: string, poutcome: string, emp.var.rate: string, cons.price.idx: string, cons.conf.idx: string, euribor3m: string, nr.employed: string, target_class: string]

In [19]:
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('month', 'string'),
 ('day_of_week', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('emp.var.rate', 'double'),
 ('cons.price.idx', 'double'),
 ('cons.conf.idx', 'double'),
 ('euribor3m', 'double'),
 ('nr.employed', 'double'),
 ('target_class', 'string')]

In [23]:
df.take(1)

[Row(age=56, job='housemaid', marital='married', education='basic.4y', default='no', housing='no', loan='no', contact='telephone', month='may', day_of_week='mon', duration=261, campaign=1, pdays=999, previous=0, poutcome='nonexistent', emp.var.rate=1.1, cons.price.idx=93.994, cons.conf.idx=-36.4, euribor3m=4.857, nr.employed=5191.0, target_class='no')]

Imprima el Schema:

In [9]:
#dataype of input data - Schema

df.printSchema()


root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp.var.rate: double (nullable = true)
 |-- cons.price.idx: double (nullable = true)
 |-- cons.conf.idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr.employed: double (nullable = true)
 |-- target_class: string (nullable = true)



En cuanto a la salida, como es la distrubución de clases?

In [29]:
# YES/NO Class Distribution
 
df.groupBy('target_class').count().show()


+------------+-----+
|target_class|count|
+------------+-----+
|          no|36548|
|         yes| 4640|
+------------+-----+



Una tarea típica, resulta de convertir los valores binarios en 1 y 0, usando como referencia "label", convierta los no/yes en 0/1:

In [32]:
from pyspark.sql import functions as F
from pyspark.sql import *
from pyspark.sql.functions import regexp_replace,col

In [37]:
# Ingrese acá la instrucción: 

df_Nuevo = df.withColumn('target_class', regexp_replace('target_class', 'no', '0'))
df_Nuevo = df_Nuevo.withColumn('target_class', regexp_replace('target_class', 'yes', '1'))


In [38]:
# New 1/0 Class Distribution
 
df_Nuevo.groupBy('target_class').count().show()


+------------+-----+
|target_class|count|
+------------+-----+
|           0|36548|
|           1| 4640|
+------------+-----+



A continuación se presenta un ejercicio de Deep Learning para su revisión...

# Deep Learning 

Importamos las librerias necesarias:

In [None]:
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer

Inicializamos la sesion SPARK:

In [None]:
spark = SparkSession.builder.appName('deep_learning').getOrCreate()

Leemos el dataset:

In [None]:
data = spark.read.csv('dl_data.csv', header=True, inferSchema=True)

In [None]:
data.printSchema()

Renombramos la columna TARGET:

In [None]:
data = data.withColumnRenamed('Orders_Normalized', 'label')

In [None]:
data.printSchema()

Partimos lo datos en Train, Validation y Test:

In [None]:
train, validation, test  = data.randomSplit([0.7, 0.2, 0.1], 1234)

Construimos el Pipeline

In [None]:
categorical_columns = [item[0] for item in data.dtypes if item[1].startswith('string')]
numeric_columns = [item[0] for item in data.dtypes if item[1].startswith('double')]

indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]

featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol="features")

layers = [len(featuresCreator.getInputCols()), 4, 2, 2]

classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)

pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])

Entrenamos...

In [None]:
model = pipeline.fit(train)

Validamos y Evaluamos

In [None]:
train_output_df = model.transform(train)
validation_output_df = model.transform(validation)
test_output_df = model.transform(test)

Llevamos a cabo, algunas predicciones:

In [27]:
train_predictionAndLabels = train_output_df.select("prediction", "label")
validation_predictionAndLabels = validation_output_df.select("prediction", "label")
test_predictionAndLabels = test_output_df.select("prediction", "label")

metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']

for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))
    print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))
    print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))

NameError: name 'train_output_df' is not defined

Puede mejorar el test accuracy del modelo variando alguno de los hyperparametros?