# Reducción de la dimensionalidad

In [1]:
# Bootstrap the Spark entry points only when the kernel does not already
# provide `sqlContext` (e.g. when the notebook was started through the
# pyspark shell), so the notebook also runs top to bottom on a fresh kernel.
try:
    sqlContext
except NameError:
    from pyspark import SparkContext
    from pyspark.sql import SQLContext

    # getOrCreate() reuses a running SparkContext instead of failing on a second one.
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)

In [2]:
# Read bd5.csv through the spark-csv data source, treating the first line as
# the header and letting the reader infer column types.
csv_reader = sqlContext.read.format("com.databricks.spark.csv").option("header", "true")
bd5 = csv_reader.load("bd5.csv", inferSchema=True)

# Expose the DataFrame to SQL queries under the table name "bd5".
sqlContext.registerDataFrameAsTable(bd5, "bd5")

## Reducción de dimensionalidad: PCA

In [3]:
# Inspect the (column name, type) pairs that schema inference produced for bd5.
bd5.dtypes

[('YEAR', 'int'),
 ('MONTH', 'int'),
 ('DAY_OF_MONTH', 'int'),
 ('DAY_OF_WEEK', 'int'),
 ('CRS_DEP_TIME', 'int'),
 ('OP_UNIQUE_CARRIER', 'string'),
 ('TAIL_NUM', 'string'),
 ('ARR_DELAY', 'double'),
 ('DEP_DELAY', 'double'),
 ('ORIGIN', 'string'),
 ('DEST', 'string'),
 ('DISTANCE', 'double'),
 ('CANCELLED', 'double'),
 ('DIVERTED', 'double'),
 ('CARRIER_DELAY', 'double'),
 ('WEATHER_DELAY', 'double'),
 ('NAS_DELAY', 'double'),
 ('SECURITY_DELAY', 'double'),
 ('LATE_AIRCRAFT_DELAY', 'double'),
 ('LogD', 'double'),
 ('Retraso', 'int'),
 ('RetrasoNeto', 'double'),
 ('Horario', 'int')]

In [4]:
from pyspark.ml.feature import VectorAssembler

# Numeric columns packed, in this order, into the single `features` vector
# column that the PCA and scaling steps consume.
feature_cols = ['DEP_DELAY', 'DISTANCE', 'DAY_OF_WEEK',
                'CRS_DEP_TIME', 'Horario', 'LogD']

a1 = VectorAssembler(inputCols=feature_cols, outputCol='features')

# bd6 = bd5 plus the assembled `features` column.
bd6 = a1.transform(bd5)

In [5]:
# Display the DataFrame repr to confirm the assembled `features` column was appended.
bd6

DataFrame[YEAR: int, MONTH: int, DAY_OF_MONTH: int, DAY_OF_WEEK: int, CRS_DEP_TIME: int, OP_UNIQUE_CARRIER: string, TAIL_NUM: string, ARR_DELAY: double, DEP_DELAY: double, ORIGIN: string, DEST: string, DISTANCE: double, CANCELLED: double, DIVERTED: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, LogD: double, Retraso: int, RetrasoNeto: double, Horario: int, features: vector]

## PCA sin estandarización

In [6]:
from pyspark.ml.feature import PCA

# PCA estimator with 2 components, applied to the raw (unscaled)
# `features` vector and written to `pca_features`.
pca = PCA(
    k=2,
    inputCol='features',
    outputCol='pca_features',
)

In [7]:
# Fit the PCA on the assembled vectors, then project the same rows.
model = pca.fit(bd6)
bd6pca = model.transform(bd6)

# Preview each input vector next to its projection.
(bd6pca
 .select('features', 'pca_features')
 .show())

+--------------------+--------------------+
|            features|        pca_features|
+--------------------+--------------------+
|[-8.0,236.0,4.0,1...|[-377.10827923344...|
|[6.0,236.0,4.0,12...|[-294.90822681198...|
|[-5.0,236.0,4.0,1...|[-206.02454360753...|
|[-6.0,236.0,4.0,8...|[-122.11787625733...|
|[-5.0,651.0,4.0,2...|[-280.57987586646...|
|[-5.0,370.0,4.0,1...|[-333.12511833861...|
|[-8.0,868.0,4.0,1...|[-14.500940557834...|
|[-6.0,1464.0,4.0,...|[984.861368360190...|
|[-3.0,1464.0,4.0,...|[863.108091742342...|
|[-6.0,1055.0,4.0,...|[367.342882157714...|
|[-11.0,255.0,4.0,...|[-444.09198996144...|
|[0.0,1440.0,4.0,1...|[510.132577158998...|
|[1.0,641.0,4.0,22...|[-323.85625864009...|
|[2.0,1440.0,4.0,1...|[901.259605069138...|
|[-9.0,1055.0,4.0,...|[447.556161089783...|
|[5.0,1055.0,4.0,1...|[369.354242249321...|
|[7.0,370.0,4.0,20...|[-507.97131150795...|
|[-4.0,1055.0,4.0,...|[669.631895215097...|
|[-2.0,1055.0,4.0,...|[608.341328002556...|
|[-7.0,1464.0,4.0,...|[1313.8957

In [8]:
# Extract the first and second principal components into scalar columns.

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, FloatType


def _component(index):
    """Return a UDF that reads element `index` of a vector column as a double."""
    # DoubleType keeps the full 64-bit value of float(v[index]); declaring
    # FloatType would round every score to 32-bit precision.
    return udf(lambda v: float(v[index]), DoubleType())


# p1 / p2 pick positions 0 and 1 of a vector column.
p1 = _component(0)
p2 = _component(1)

bd6pca = (
    bd6pca
    .withColumn('pca1', p1('pca_features'))
    .withColumn('pca2', p2('pca_features'))
)


In [9]:
# Summary statistics (count, mean, stddev, min, max) of the two extracted components.
bd6pca.select('pca1','pca2').describe().show()

+-------+------------------+-------------------+
|summary|              pca1|               pca2|
+-------+------------------+-------------------+
|  count|             30466|              30466|
|   mean|300.93254488836135|-1601.4748862329209|
| stddev| 557.2501211351527| 495.72119764117116|
|    min|        -731.57623|         -2935.9211|
|    max|          1751.152|         -487.57773|
+-------+------------------+-------------------+



## PCA con estandarización

In [10]:
from pyspark.ml.feature import StandardScaler

# Standardize the `features` vector (withMean and withStd both enabled) and
# write the result to `scaledFeatures`.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)
scalerModel = scaler.fit(bd6)
bd6std = scalerModel.transform(bd6)

# Compare raw and standardized vectors side by side.
bd6std.select('features', 'scaledFeatures').show()

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[-8.0,236.0,4.0,1...|[-0.5061531206197...|
|[6.0,236.0,4.0,12...|[-0.2251841350618...|
|[-5.0,236.0,4.0,1...|[-0.4459454808573...|
|[-6.0,236.0,4.0,8...|[-0.4660146941114...|
|[-5.0,651.0,4.0,2...|[-0.4459454808573...|
|[-5.0,370.0,4.0,1...|[-0.4459454808573...|
|[-8.0,868.0,4.0,1...|[-0.5061531206197...|
|[-6.0,1464.0,4.0,...|[-0.4660146941114...|
|[-3.0,1464.0,4.0,...|[-0.4058070543490...|
|[-6.0,1055.0,4.0,...|[-0.4660146941114...|
|[-11.0,255.0,4.0,...|[-0.5663607603821...|
|[0.0,1440.0,4.0,1...|[-0.3455994145866...|
|[1.0,641.0,4.0,22...|[-0.3255302013325...|
|[2.0,1440.0,4.0,1...|[-0.3054609880783...|
|[-9.0,1055.0,4.0,...|[-0.5262223338738...|
|[5.0,1055.0,4.0,1...|[-0.2452533483159...|
|[7.0,370.0,4.0,20...|[-0.2051149218077...|
|[-4.0,1055.0,4.0,...|[-0.4258762676032...|
|[-2.0,1055.0,4.0,...|[-0.3857378410949...|
|[-7.0,1464.0,4.0,...|[-0.486083

In [11]:
from pyspark.ml.feature import PCA

# PCA estimator with 2 components, this time applied to the standardized
# `scaledFeatures` vector and written to `pca_scaledfeatures`.
pca2 = PCA(
    k=2,
    inputCol='scaledFeatures',
    outputCol='pca_scaledfeatures',
)

In [12]:
# Fit the PCA on the standardized vectors, then project the same rows.
model2 = pca2.fit(bd6std)
bd6pca2 = model2.transform(bd6std)

# Preview the projected vectors.
(bd6pca2
 .select('pca_scaledfeatures')
 .show())


+--------------------+
|  pca_scaledfeatures|
+--------------------+
|[-2.1481167921052...|
|[-2.0580632199285...|
|[-1.6945582244580...|
|[-1.5824805413590...|
|[-0.5458187260638...|
|[-1.8750035509122...|
|[-0.5990634736882...|
|[1.69894829802062...|
|[1.53427565410245...|
|[0.32786666838261...|
|[-2.3865997873638...|
|[0.58471397283816...|
|[-0.6262974930119...|
|[1.56173170587968...|
|[0.43770513210056...|
|[0.31602276092994...|
|[-2.1216234060703...|
|[1.18761526357020...|
|[0.87239689347422...|
|[2.36627932235609...|
+--------------------+
only showing top 20 rows



In [13]:
# Split the standardized-PCA vector into scalar columns with the p1/p2 UDFs
# defined earlier in the notebook.
bd6pca2 = (
    bd6pca2
    .withColumn('pca1', p1('pca_scaledfeatures'))
    .withColumn('pca2', p2('pca_scaledfeatures'))
)

# Summary statistics of both components, pulled into pandas for display.
bd6pca2.select('pca1', 'pca2').describe().toPandas()

Unnamed: 0,summary,pca1,pca2
0,count,30466.0,30466.0
1,mean,1.4992589980211242e-10,-2.3862638117704308e-11
2,stddev,1.4247512515634866,1.2476208654224723
3,min,-3.9466622,-9.316214
4,max,3.2699375,2.831249


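Nota: las componentes obtenidas tienen media prácticamente nula, pero su desviación típica no es 1 (≈1.42 y ≈1.25 en la tabla anterior); si se necesita varianza unitaria, también se pueden estandarizar de nuevo.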

In [14]:
# Correlate the original features with the two principal components obtained
# from the standardized data.
# The table is computed on a 10% sample drawn without replacement; the fixed
# seed keeps the sample (and therefore the table) stable across re-runs on
# the same data.
SAMPLE_SEED = 42

pdf6 = (
    bd6pca2
    .sample(False, 0.1, seed=SAMPLE_SEED)
    .select('DEP_DELAY', 'DISTANCE', 'DAY_OF_WEEK',
            'CRS_DEP_TIME', 'Horario', 'LogD', 'pca1', 'pca2')
    .toPandas()
)

pdf6.corr()

Unnamed: 0,DEP_DELAY,DISTANCE,DAY_OF_WEEK,CRS_DEP_TIME,Horario,LogD,pca1,pca2
DEP_DELAY,1.0,0.00607,0.025261,0.159572,0.119988,0.002001,-0.090915,-0.319393
DISTANCE,0.00607,1.0,-0.024106,-0.075962,-0.083686,0.957575,0.926214,-0.342577
DAY_OF_WEEK,0.025261,-0.024106,1.0,-0.008006,-0.009551,-0.021388,-0.027716,-0.007033
CRS_DEP_TIME,0.159572,-0.075962,-0.008006,1.0,0.591198,-0.0858,-0.386719,-0.794837
Horario,0.119988,-0.083686,-0.009551,0.591198,1.0,-0.083825,-0.384042,-0.786727
LogD,0.002001,0.957575,-0.021388,-0.0858,-0.083825,1.0,0.928296,-0.336689
pca1,-0.090915,0.926214,-0.027716,-0.386719,-0.384042,0.928296,1.0,0.00489
pca2,-0.319393,-0.342577,-0.007033,-0.794837,-0.786727,-0.336689,0.00489,1.0
