PARTE II - Machine Learning con datos en PostgreSQL:
 
Seleccionar un set de datos de su preferencia, que no corresponda a un conjunto trivial, haciendo uso de PostgreSQL y la librería MLlib llevar a cabo un análisis predictivo o clasificatorio, que resuelva algún problema en el conjunto de datos seleccionado.
El analisis de los datos se hizon con el set de datos del titanic.

Tabla en postgres

-- DROP TABLE public.titanic;

CREATE TABLE public.titanic
(
    passengerid integer,
    survived integer,
    pclass integer,
    "Name" text COLLATE pg_catalog."default",
    sex text COLLATE pg_catalog."default",
    age integer,
    sibsp integer,
    parch integer,
    ticket text COLLATE pg_catalog."default",
    fare integer,
    cabin text COLLATE pg_catalog."default",
    embarked text COLLATE pg_catalog."default"
)
WITH (
    OIDS = FALSE
)
TABLESPACE pg_default;

ALTER TABLE public.titanic
    OWNER to postgres;


Nota: Para realizar la conexión con la base de datos se agrego el driver de conexión de postgre en la siguiente ruta:
C:\Spark\jars

In [137]:
# Crear el spark session object, llamarle "supervised_ml"
spark=SparkSession.builder.appName('supervised_ml').getOrCreate()

In [138]:
# librerias de pyspark
import findspark
findspark.init('C:\Spark')

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf 
from pyspark.sql.types import DateType

# Importacion de libs y operaciones

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [139]:
# configuracion del driver de conexión de postgres
spark = SparkSession \
    .builder \
    .appName("Basic JDBC pipeline") \
    .config("spark.driver.extraClassPath", "C:/Users/JuanPablo/BigDataTEC-master/Projecto/postgresql-42.2.9.jar") \
    .config("spark.executor.extraClassPath", "C:/Users/JuanPablo/BigDataTEC-master/Projecto/postgresql-42.2.9.jar") \
    .getOrCreate()

In [140]:
# parametros de conexión y lectura de tabla de la base de datos

df = spark \
    .read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost/BigData") \
    .option("user", "postgres") \
    .option("password", "admin") \
    .option("dbtable", "titanic") \
    .load()

df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+
|passengerid|survived|pclass|                Name|   sex| age|sibsp|parch|          ticket|fare|cabin|embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|  71|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|   7| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|  53| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|          330877|   8| null|  

In [141]:
from pyspark.sql import SparkSession

df.printSchema()

root
 |-- passengerid: integer (nullable = true)
 |-- survived: integer (nullable = true)
 |-- pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sibsp: integer (nullable = true)
 |-- parch: integer (nullable = true)
 |-- ticket: string (nullable = true)
 |-- fare: integer (nullable = true)
 |-- cabin: string (nullable = true)
 |-- embarked: string (nullable = true)



In [142]:
print((df.count(), len(df.columns))) 

(891, 12)


In [143]:
# primeros 10 datos
df.show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+
|passengerid|survived|pclass|                Name|   sex| age|sibsp|parch|          ticket|fare|cabin|embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|  71|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|   7| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|  53| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|          330877|   8| null|  

In [144]:
df.columns

['passengerid',
 'survived',
 'pclass',
 'Name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked']

In [145]:
my_cols = df.select(['Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'])

In [146]:
my_final_data = my_cols.na.drop()

Trabajamos con datos categoricos

In [147]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [148]:
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

In [149]:
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [105]:
numeric_columns = ['age','sibsp','parch', 'fare']


uso de pipelines

In [150]:
from pyspark.ml import Pipeline

In [174]:
train, test = my_final_data.randomSplit([0.7,.3])


print(f"Size of train Dataset : {train.count()}" )
print(f"Size of test Dataset : {test.count()}" )

Size of train Dataset : 515
Size of test Dataset : 197


## Regresion Lineal

In [172]:
ln = LinearRegression(featuresCol='features',labelCol='Survived')

In [175]:
pipeline_ln = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler,ln])

In [176]:
fit_model_ln = pipeline_ln.fit(train)

In [177]:
predictionRN = fit_model_ln.transform(test)

In [178]:
# visulizacion 
predictionRN.show(10)

+--------+------+------+---+-----+-----+----+--------+--------+-----------+-------------+-------------+--------------------+-------------------+
|Survived|Pclass|   Sex|Age|SibSp|Parch|Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|         prediction|
+--------+------+------+---+-----+-----+----+--------+--------+-----------+-------------+-------------+--------------------+-------------------+
|       0|     1|female|  2|    1|    2| 151|       S|     1.0|        0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,2.0,1.0,...|  1.131407282317133|
|       0|     1|female| 50|    0|    0|  28|       C|     1.0|        1.0|    (1,[],[])|(2,[1],[1.0])|(8,[0,2,5,7],[1.0...| 0.9131422850758324|
|       0|     1|  male| 18|    1|    0| 108|       C|     0.0|        1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,18.0,1.0...|  0.644042424666612|
|       0|     1|  male| 19|    3|    2| 263|       S|     0.0|        0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,19.0,3.0...|0.4034

In [181]:
# valor de R2
ln_evaluator = RegressionEvaluator(labelCol='Survived', metricName='r2')
ln_r2 = lr_evaluator.evaluate(predictionRN)

print(f'Regression Logistica is {ln_r2}')

Regression Logistica is 0.3512060488599934


In [182]:
# valor del meanSquaredError

lr_rmse = lr_evaluator.evaluate(predictionRL)
print(f'RMSE {lr_rmse}')

RMSE 0.020397208803005773


In [194]:
predictionRN.select('Survived','prediction').show()

+--------+-------------------+
|Survived|         prediction|
+--------+-------------------+
|       0|  1.131407282317133|
|       0| 0.9131422850758324|
|       0|  0.644042424666612|
|       0|0.40340751596188673|
|       0| 0.5914912669038905|
|       0| 0.6944346754541765|
|       0| 0.6676805637986138|
|       0|  0.650299072639789|
|       0| 0.5597345890999739|
|       0| 0.4402024310147731|
|       0|0.47157510892046073|
|       0| 0.4355882455192642|
|       0| 0.4010289977585272|
|       0|  0.323087963846455|
|       0|0.38751687486028696|
|       0| 0.3891167003011756|
|       0| 0.2902216641864499|
|       0| 0.3299836835836648|
|       0|0.26631296998750154|
|       0|0.32961541463924393|
+--------+-------------------+
only showing top 20 rows



## Regresion Logistica


In [151]:
lr = LogisticRegression(featuresCol='features',labelCol='Survived')

In [152]:
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler,lr])

In [164]:
fit_model = pipeline.fit(train)

In [165]:
predictionRL = fit_model.transform(test)

In [166]:
# visulizacion 
predictionRL.show(10)

+--------+------+------+---+-----+-----+----+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex|Age|SibSp|Parch|Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+---+-----+-----+----+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female| 25|    1|    2| 151|       S|     1.0|        0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,25.0,1.0...|[-2.8558602408572...|[0.05437918325508...|       1.0|
|       0|     1|  male| 18|    1|    0| 108|       C|     0.0|        1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,18.0,1.0...|[-0.8029421189372...|[0.30939652312014...|       1.0|
|       0|     1|  male| 24|    0|    0|  79|       C|     0.0|        1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.

In [169]:
# valor de R2
lr_evaluator = RegressionEvaluator(labelCol='Survived', metricName='r2')
lr_r2 = lr_evaluator.evaluate(predictionRL)

print(f'Regression Logistica is {lr_r2}')

Regression Lineal is 0.020397208803005773


In [170]:
# valor del meanSquaredError

lr_rmse = lr_evaluator.evaluate(predictionRL)
print(f'RMSE {lr_rmse}')

RMSE 0.020397208803005773


In [158]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [159]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Survived')

In [195]:
predictionRL.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [207]:
AUC = my_eval.evaluate(predictionRL)

In [208]:
AUC

0.8427522349936137

### Regresión con Árboles de Decisión

In [188]:
# import lib
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier

In [189]:
# dec_tree
dec_tree = DecisionTreeClassifier(labelCol='Survived',featuresCol='features')


In [196]:
my_final_data.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: integer (nullable = true)
 |-- Embarked: string (nullable = true)



In [212]:
pipelineTD = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler,dec_tree])


In [213]:
fit_model_DT = pipelineTD.fit(train)

In [214]:
predictionDT = fit_model_DT.transform(test)

In [215]:
# visulizacion 
predictionRL.show(10)

+--------+------+------+---+-----+-----+----+--------+--------+-----------+-------------+-------------+--------------------+-------------------+
|Survived|Pclass|   Sex|Age|SibSp|Parch|Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|         prediction|
+--------+------+------+---+-----+-----+----+--------+--------+-----------+-------------+-------------+--------------------+-------------------+
|       0|     1|female|  2|    1|    2| 151|       S|     1.0|        0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,2.0,1.0,...|  1.131407282317133|
|       0|     1|female| 50|    0|    0|  28|       C|     1.0|        1.0|    (1,[],[])|(2,[1],[1.0])|(8,[0,2,5,7],[1.0...| 0.9131422850758324|
|       0|     1|  male| 18|    1|    0| 108|       C|     0.0|        1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,18.0,1.0...|  0.644042424666612|
|       0|     1|  male| 19|    3|    2| 263|       S|     0.0|        0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,19.0,3.0...|0.4034

In [244]:
# valor de R2
dt_evaluator = RegressionEvaluator(labelCol='Survived', metricName='r2')
dt_r2 = dt_evaluator.evaluate(predictionDT)

print(f'Arbol de desicion is {dt_r2}')

Arbol de desicion is 0.09844614729672219


In [245]:
# valor del meanSquaredError

dt_rmse = dt_evaluator.evaluate(predictionDT)
print(f'RMSE {dt_rmse}')

RMSE 0.09844614729672219


In [219]:
AUC = my_eval.evaluate(predictionDT)

In [220]:
AUC

0.7774052788420605

### Random Forest

In [246]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [247]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="Survived", featuresCol="features", numTrees=20)

In [248]:
pipelineRF = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler,rf])

In [250]:
fit_model_RF = pipelineRF.fit(train)

In [251]:
predictionRF = fit_model_RF.transform(test)

In [252]:
predictionRF.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: integer (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkIndex: double (nullable = false)
 |-- SexVec: vector (nullable = true)
 |-- EmbarkVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [253]:
predictionRF.select("prediction", "Survived", "features").show(5)

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[1.0,0.0,2.0,1.0,...|
|       1.0|       0|(8,[0,2,5,7],[1.0...|
|       1.0|       0|[1.0,1.0,18.0,1.0...|
|       0.0|       0|[1.0,1.0,19.0,3.0...|
|       0.0|       0|[1.0,1.0,21.0,0.0...|
+----------+--------+--------------------+
only showing top 5 rows



In [254]:
# valor de R2
rf_evaluator = RegressionEvaluator(labelCol='Survived', metricName='r2')
rf_r2 = rf_evaluator.evaluate(predictionRF)

print(f'Randomw Forest {rf_r2}')


Randomw Forest 0.09844614729672219


In [255]:
# valor del meanSquaredError

rf_rmse = rf_evaluator.evaluate(predictionRF)
print(f'RMSE {rf_rmse}')

RMSE 0.09844614729672219


In [256]:
AUC = my_eval.evaluate(predictionRF)

In [257]:
AUC

0.7587803320561943

Fin