# Redes Neuronales

In [1]:
# Manual Spark bootstrap, intentionally left disabled. The cells below use
# `sqlContext` without defining it, so it is presumably injected by the
# pyspark shell/kernel — TODO confirm. Uncomment these lines when running in an
# environment where `sc` / `sqlContext` are not predefined.
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [2]:
# Read bd5.csv through the spark-csv data source: the first row is the header
# and column types are inferred from the data.
csv_reader = sqlContext.read.format("com.databricks.spark.csv")
csv_reader = csv_reader.option("header", "true")
bd5 = csv_reader.load("bd5.csv", inferSchema=True)

# Expose the frame to SQL queries under the table name "bd5".
sqlContext.registerDataFrameAsTable(bd5, "bd5")

In [3]:
# Display the (column name, type) pairs of the loaded frame to check what
# schema inference produced.
bd5.dtypes

[('YEAR', 'int'),
 ('MONTH', 'int'),
 ('DAY_OF_MONTH', 'int'),
 ('DAY_OF_WEEK', 'int'),
 ('CRS_DEP_TIME', 'int'),
 ('OP_UNIQUE_CARRIER', 'string'),
 ('TAIL_NUM', 'string'),
 ('ARR_DELAY', 'double'),
 ('DEP_DELAY', 'double'),
 ('ORIGIN', 'string'),
 ('DEST', 'string'),
 ('DISTANCE', 'double'),
 ('CANCELLED', 'double'),
 ('DIVERTED', 'double'),
 ('CARRIER_DELAY', 'double'),
 ('WEATHER_DELAY', 'double'),
 ('NAS_DELAY', 'double'),
 ('SECURITY_DELAY', 'double'),
 ('LATE_AIRCRAFT_DELAY', 'double'),
 ('LogD', 'double'),
 ('Retraso', 'int'),
 ('RetrasoNeto', 'double'),
 ('Horario', 'int')]

In [4]:
from pyspark.ml.feature import StringIndexer

# Encode the carrier code (a string column) as a numeric index so it can be
# used as a model feature. Note: the index starts at 0.
indexer = StringIndexer(
    inputCol='OP_UNIQUE_CARRIER',
    outputCol='IndexUniqueCarrier',
)
carrier_index_model = indexer.fit(bd5)
bd6 = carrier_index_model.transform(bd5)

## Multilayer Perceptron Classifier

In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Columns packed, in this order, into the single 'features' vector.
feature_columns = [
    'DEP_DELAY',
    'DISTANCE',
    'DAY_OF_WEEK',
    'CRS_DEP_TIME',
    'IndexUniqueCarrier',
]
a1 = VectorAssembler(inputCols=feature_columns, outputCol='features')

# The classifier is configured with a double-typed "label" column, so the
# integer 'Retraso' column is cast and renamed accordingly.
label_column = col("Retraso").cast('double').alias("label")
bd7 = a1.transform(bd6).select(label_column, 'features')

## Partición Test - Train

In [6]:
# 70 % / 30 % random split; the fixed seed keeps the partition reproducible.
split_weights = [0.7, 0.3]
bd_train, bd_test = bd7.randomSplit(split_weights, seed=123)

# Report the row count of each partition (train first, then test).
for subset in (bd_train, bd_test):
    print(subset.count())

21278
9188


In [7]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Network topology: 5 input neurons (one per assembled feature column),
# one hidden layer of 5 neurons, and 2 output neurons (one per label value).
layer_sizes = [5, 5, 2]

mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label",
    layers=layer_sizes,
    maxIter=100,
    seed=123,
)


* El número de neuronas de la 1ª capa es igual al número de elementos del vector de features
* El número de neuronas de la última capa es igual al número de labels
* Las neuronas internas tienen función de activación sigmoide
* Las neuronas de la última capa tienen función de activación softmax

In [8]:
# Train the perceptron on the training partition.
model = mlp.fit(dataset=bd_train)

# Score the same training rows to inspect the in-sample fit.
pred = model.transform(dataset=bd_train)

In [9]:
# Number of entries in the fitted model's weight vector. The displayed 42 is
# consistent with layers [5, 5, 2] if every layer carries a bias term:
# (5 + 1) * 5 + (5 + 1) * 2 — TODO confirm against the Spark docs.
model.weights.size

42

In [10]:
# Print the first rows of the training-set predictions
# (label, features, prediction) as a quick sanity check.
pred.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|[-20.0,602.0,3.0,...|       0.0|
|  0.0|[-16.0,641.0,1.0,...|       0.0|
|  0.0|[-16.0,641.0,6.0,...|       0.0|
|  0.0|[-16.0,868.0,6.0,...|       0.0|
|  0.0|[-16.0,888.0,3.0,...|       0.0|
|  0.0|[-15.0,731.0,1.0,...|       0.0|
|  0.0|[-15.0,888.0,4.0,...|       0.0|
|  0.0|[-15.0,888.0,5.0,...|       0.0|
|  0.0|[-15.0,967.0,3.0,...|       0.0|
|  0.0|[-15.0,1464.0,6.0...|       0.0|
|  0.0|[-14.0,236.0,3.0,...|       0.0|
|  0.0|[-14.0,236.0,4.0,...|       0.0|
|  0.0|[-14.0,337.0,1.0,...|       0.0|
|  0.0|[-14.0,337.0,1.0,...|       0.0|
|  0.0|[-14.0,337.0,1.0,...|       0.0|
|  0.0|[-14.0,337.0,2.0,...|       0.0|
|  0.0|[-14.0,414.0,2.0,...|       0.0|
|  0.0|[-14.0,628.0,1.0,...|       0.0|
|  0.0|[-14.0,731.0,5.0,...|       0.0|
|  0.0|[-14.0,868.0,3.0,...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



In [11]:
# Confusion counts on the training predictions: one row per
# (actual label, predicted label) combination.
outcome_counts = pred.groupBy('label', 'prediction').count()
outcome_counts.show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 4383|
|  0.0|       0.0|14806|
|  0.0|       1.0|  580|
|  1.0|       0.0| 1509|
+-----+----------+-----+



In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator as MCCE

# Evaluator comparing the "prediction" column against the "label" column.
# NOTE(review): the "precision" metric name is reportedly unavailable in
# Spark >= 2.0 (replaced by "accuracy") — confirm before upgrading Spark.
evaluator = MCCE(
    predictionCol="prediction",
    labelCol="label",
    metricName="precision",
)

# Metric on the training-set predictions.
evaluator.evaluate(pred)


0.901823479650343

In [13]:
# Score the held-out test partition and compute the same metric on it,
# for comparison with the training-set value above.
pred2 = model.transform(dataset=bd_test)
evaluator.evaluate(dataset=pred2)

0.9006312581628211

# Tuneado Automático de parámetros

## Ejemplo: Regresión Logística

Código válido para Pyspark v2.0.0 o superior

In [14]:

# Disabled example: hyper-parameter search for a logistic regression using a
# single train/validation split. The grid is regParam in {1, 0.1} crossed with
# elasticNetParam in {0.0, 1.0}; 80 % of the rows are used for training.
# NOTE(review): `BCE` is used below but is not imported anywhere in the visible
# notebook. Presumably it is BinaryClassificationEvaluator (by analogy with
# the MCCE alias used earlier) — confirm and add, before uncommenting:
#from pyspark.ml.evaluation import BinaryClassificationEvaluator as BCE

#from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit
    
#from pyspark.ml.classification import LogisticRegression


#lgr = LogisticRegression(maxIter=10, 
     #labelCol="label", 
    # featuresCol="features")
                        
#paramGrid = ParamGridBuilder()\
    #.addGrid(lgr.regParam, [1,0.1]) \
    #.addGrid(lgr.elasticNetParam, [0.0, 1.0])\
    # .build()
    
#tvs = TrainValidationSplit(estimator=lgr,
                          # estimatorParamMaps=paramGrid,
                          # evaluator=BCE(metricName="areaUnderROC"),
                           #trainRatio=0.8)      

In [15]:
# Disabled: fit the train/validation search on the training partition and
# score the test partition. If uncommented, this rebinds `model` and `pred`,
# replacing the perceptron model and its predictions defined earlier.
#model = tvs.fit(bd_train)

#pred = model.transform(bd_test)
#pred.select("features", "label").show() 

In [16]:
# Disabled: area under the ROC curve of the tuned model's predictions.
# `BCE` must be imported first (it is not imported in the visible notebook).
#BCE(metricName="areaUnderROC").evaluate(pred)