# Creación de Nuevas Variables - Feature Extraction

In [1]:
# Spark context / SQL context bootstrap, currently disabled.
# NOTE(review): the next cell uses `sqlContext` without defining it, so it is
# presumably provided by the kernel (e.g. a PySpark shell/notebook) — confirm.
# On a kernel that does not predefine it, uncomment the four lines below.
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [2]:
# Read bd5.csv treating the first row as the header and letting Spark infer
# the column types.
csv_reader = sqlContext.read.format("com.databricks.spark.csv").option("header", "true")
bd5 = csv_reader.load("bd5.csv", inferSchema=True)

# Make the DataFrame queryable through SQL under the table name "bd5".
sqlContext.registerDataFrameAsTable(bd5, "bd5")

## Variables Dummy

In [3]:
# Add one boolean indicator column per Horario value (1, 2 and 3).
horario = bd5.Horario
bd5 = (
    bd5
    .withColumn('Horario1', horario == 1)
    .withColumn('Horario2', horario == 2)
    .withColumn('Horario3', horario == 3)
)

## Variables Discretizadas Binarias

In [4]:
from pyspark.ml.feature import Binarizer

# Binarize DEP_DELAY around the 15.0 threshold: values above it become 1.0 in
# SalidaBin, everything else 0.0.
binarizer = Binarizer(
    threshold=15.0,
    inputCol='DEP_DELAY',
    outputCol='SalidaBin',
)

# Inspect the first row of the transformed DataFrame.
bd5_binarized = binarizer.transform(bd5)
bd5_binarized.head()


Row(YEAR=2016, MONTH=12, DAY_OF_MONTH=1, DAY_OF_WEEK=4, CRS_DEP_TIME=1440, OP_UNIQUE_CARRIER='AA', TAIL_NUM='N011AA', ARR_DELAY=-19.0, DEP_DELAY=-8.0, ORIGIN='LAS', DEST='LAX', DISTANCE=236.0, CANCELLED=0.0, DIVERTED=0.0, CARRIER_DELAY=0.0, WEATHER_DELAY=0.0, NAS_DELAY=0.0, SECURITY_DELAY=0.0, LATE_AIRCRAFT_DELAY=0.0, LogD=2.3729120029701067, Retraso=0, RetrasoNeto=-11.0, Horario=3, Horario1=False, Horario2=False, Horario3=True, SalidaBin=0.0)

In [5]:
# Show the original delay next to its binarized value.
(
    binarizer
    .transform(bd5)
    .select('DEP_DELAY', 'SalidaBin')
    .show()
)

+---------+---------+
|DEP_DELAY|SalidaBin|
+---------+---------+
|     -8.0|      0.0|
|      6.0|      0.0|
|     -5.0|      0.0|
|     -6.0|      0.0|
|     -5.0|      0.0|
|     -5.0|      0.0|
|     -8.0|      0.0|
|     -6.0|      0.0|
|     -3.0|      0.0|
|     -6.0|      0.0|
|    -11.0|      0.0|
|      0.0|      0.0|
|      1.0|      0.0|
|      2.0|      0.0|
|     -9.0|      0.0|
|      5.0|      0.0|
|      7.0|      0.0|
|     -4.0|      0.0|
|     -2.0|      0.0|
|     -7.0|      0.0|
+---------+---------+
only showing top 20 rows



## Variables Discretizadas en Buckets

In [6]:
from pyspark.ml.feature import Bucketizer
# Three buckets over DEP_DELAY: below 0 -> 0.0, [0, 15) -> 1.0, 15 and above -> 2.0.
# Lower bounds are inclusive (a delay of exactly 0.0 lands in bucket 1.0).
delay_splits = [-float("inf"), 0.0, 15.0, float("inf")]
bucketizer = Bucketizer(
    splits=delay_splits,
    inputCol='DEP_DELAY',
    outputCol='SalidaCat',
)

# Show the original delay next to its bucket index.
(
    bucketizer
    .transform(bd5)
    .select('DEP_DELAY', 'SalidaCat')
    .show()
)


+---------+---------+
|DEP_DELAY|SalidaCat|
+---------+---------+
|     -8.0|      0.0|
|      6.0|      1.0|
|     -5.0|      0.0|
|     -6.0|      0.0|
|     -5.0|      0.0|
|     -5.0|      0.0|
|     -8.0|      0.0|
|     -6.0|      0.0|
|     -3.0|      0.0|
|     -6.0|      0.0|
|    -11.0|      0.0|
|      0.0|      1.0|
|      1.0|      1.0|
|      2.0|      1.0|
|     -9.0|      0.0|
|      5.0|      1.0|
|      7.0|      1.0|
|     -4.0|      0.0|
|     -2.0|      0.0|
|     -7.0|      0.0|
+---------+---------+
only showing top 20 rows



Versiones más nuevas de PySpark incluyen otras transformaciones, por ejemplo `QuantileDiscretizer`.

## Expansión polinómica de Variables 
(términos cuadráticos, productos, etc.) 

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PolynomialExpansion

# Pack the two numeric inputs into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=['DEP_DELAY', 'DISTANCE'], outputCol='features')

# Degree-2 expansion of (x, y); the resulting vector holds x, x^2, y, x*y, y^2.
px = PolynomialExpansion(degree=2, inputCol="features", outputCol="Polyn")

# Assemble first, then expand the assembled vector.
bd5_assembled = assembler.transform(bd5)
bd6 = px.transform(bd5_assembled)

# Preview the inputs alongside their expanded vector.
bd6.select('DEP_DELAY', 'DISTANCE', 'Polyn').head(5)

[Row(DEP_DELAY=-8.0, DISTANCE=236.0, Polyn=DenseVector([-8.0, 64.0, 236.0, -1888.0, 55696.0])),
 Row(DEP_DELAY=6.0, DISTANCE=236.0, Polyn=DenseVector([6.0, 36.0, 236.0, 1416.0, 55696.0])),
 Row(DEP_DELAY=-5.0, DISTANCE=236.0, Polyn=DenseVector([-5.0, 25.0, 236.0, -1180.0, 55696.0])),
 Row(DEP_DELAY=-6.0, DISTANCE=236.0, Polyn=DenseVector([-6.0, 36.0, 236.0, -1416.0, 55696.0])),
 Row(DEP_DELAY=-5.0, DISTANCE=651.0, Polyn=DenseVector([-5.0, 25.0, 651.0, -3255.0, 423801.0]))]

## Estandarización de las variables

In [8]:
from pyspark.ml.feature import StandardScaler

# Standardize the 'features' vector: subtract the mean (withMean) and divide
# by the standard deviation (withStd).
scaler = StandardScaler(
    inputCol="features",
    outputCol="stdfeatures",
    withMean=True,
    withStd=True,
)

# Fitting computes the statistics on bd6; transforming applies them to the same data.
scalerModel = scaler.fit(bd6)
bd6std = scalerModel.transform(bd6)

# Compare raw and standardized vectors side by side.
(
    bd6std
    .select('features', 'stdfeatures')
    .show()
)

+-------------+--------------------+
|     features|         stdfeatures|
+-------------+--------------------+
| [-8.0,236.0]|[-0.5061531206197...|
|  [6.0,236.0]|[-0.2251841350618...|
| [-5.0,236.0]|[-0.4459454808573...|
| [-6.0,236.0]|[-0.4660146941114...|
| [-5.0,651.0]|[-0.4459454808573...|
| [-5.0,370.0]|[-0.4459454808573...|
| [-8.0,868.0]|[-0.5061531206197...|
|[-6.0,1464.0]|[-0.4660146941114...|
|[-3.0,1464.0]|[-0.4058070543490...|
|[-6.0,1055.0]|[-0.4660146941114...|
|[-11.0,255.0]|[-0.5663607603821...|
| [0.0,1440.0]|[-0.3455994145866...|
|  [1.0,641.0]|[-0.3255302013325...|
| [2.0,1440.0]|[-0.3054609880783...|
|[-9.0,1055.0]|[-0.5262223338738...|
| [5.0,1055.0]|[-0.2452533483159...|
|  [7.0,370.0]|[-0.2051149218077...|
|[-4.0,1055.0]|[-0.4258762676032...|
|[-2.0,1055.0]|[-0.3857378410949...|
|[-7.0,1464.0]|[-0.4860839073656...|
+-------------+--------------------+
only showing top 20 rows



## Transformación manual

In [10]:
# Build the squared delay and the delay-by-distance product by hand, using
# plain column arithmetic instead of a feature transformer.
dep_delay = bd6.DEP_DELAY
bd7 = (
    bd6
    .withColumn('DepDelay2', dep_delay ** 2)
    .withColumn('DepD_Distance', dep_delay * bd6.DISTANCE)
)