In [1]:
# Read the on-time CSV data set from HDFS (first row is the header, column
# types are inferred), register it as SQL table "bd", and display its row count.
bd = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load("hdfs:///tmp/dcd/OnTimeDB", inferSchema=True)
)
sqlContext.registerDataFrameAsTable(bd, "bd")
bd.count()


                                                                                

30466

In [5]:
# Inspect the inferred schema: one (column name, Spark type) pair per column.
bd.dtypes

[('Year', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('CRSDepTime', 'int'),
 ('UniqueCarrier', 'string'),
 ('TailNum', 'string'),
 ('ArrDelay', 'double'),
 ('DepDelay', 'double'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('Distance', 'double'),
 ('Cancelled', 'double'),
 ('Diverted', 'double'),
 ('CarrierDelay', 'double'),
 ('WeatherDelay', 'double'),
 ('NASDelay', 'double'),
 ('SecurityDelay', 'double'),
 ('LateAircraftDelay', 'double'),
 ('LogD', 'double'),
 ('Retraso', 'int'),
 ('RetrasoNeto', 'double'),
 ('Horario', 'int')]

In [2]:
from pyspark.ml.feature import StringIndexer

# Encode the carrier code as a numeric index (indices start at 0) and append it
# to the data as the column 'IndexUniqueCarrier'.
indexer = StringIndexer(
    inputCol='UniqueCarrier',
    outputCol='IndexUniqueCarrier',
)
bd1 = indexer.fit(bd).transform(bd)


                                                                                

In [3]:
# Show the number of rows per carrier together with the index assigned to it,
# ordered by that index.
(
    bd1.groupBy('UniqueCarrier', 'IndexUniqueCarrier')
    .count()
    .sort('IndexUniqueCarrier')
    .show()
)


[Stage 8:>                                                          (0 + 2) / 2]

+-------------+------------------+-----+
|UniqueCarrier|IndexUniqueCarrier|count|
+-------------+------------------+-----+
|           AA|               0.0| 8853|
|           UA|               1.0| 6112|
|           WN|               2.0| 5395|
|           DL|               3.0| 4239|
|           VX|               4.0| 1703|
|           NK|               5.0| 1581|
|           F9|               6.0| 1295|
|           OO|               7.0| 1166|
|           B6|               8.0|  121|
|           EV|               9.0|    1|
+-------------+------------------+-----+



                                                                                

In [6]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.functions import col


In [7]:
# Pack the five predictor columns into a single vector column named 'features'.
a1 = VectorAssembler(
    inputCols=[
        'DepDelay',
        'Distance',
        'DayOfWeek',
        'CRSDepTime',
        'IndexUniqueCarrier',
    ],
    outputCol='features',
)

# Keep only the target ('Retraso' cast to double, renamed 'label') and the
# assembled feature vector.
bd2 = (
    a1.transform(bd1)
    .select(col("Retraso").cast('double').alias("label"), 'features')
)

# Add 'label2', the StringIndexer-assigned index of 'label'; it is the column
# the classifier is later fitted against.
stringIndexer = StringIndexer(inputCol='label', outputCol='label2')
sI = stringIndexer.fit(bd2)
bd2 = sI.transform(bd2)
bd2.dtypes


[('label', 'double'), ('features', 'vector'), ('label2', 'double')]

In [8]:
# Preview the first 5 rows of the label / features / label2 frame.
bd2.show(5)


+-----+--------------------+------+
|label|            features|label2|
+-----+--------------------+------+
|  0.0|[-5.0,1235.0,4.0,...|   0.0|
|  0.0|[5.0,1235.0,5.0,8...|   0.0|
|  0.0|[-3.0,1235.0,6.0,...|   0.0|
|  0.0|[-7.0,1235.0,7.0,...|   0.0|
|  0.0|[-6.0,1235.0,1.0,...|   0.0|
+-----+--------------------+------+
only showing top 5 rows



In [9]:
# Split the rows at random into roughly 70% for training and 30% for testing.
# The seed is fixed so the split is requested the same way on every run.
bd_train, bd_test = bd2.randomSplit([0.7, 0.3], seed=123)

# Report the size of each partition.
print("Renglones de la BD Train: ", bd_train.count())
print("Renglones de la BD Test: ", bd_test.count())


                                                                                

Renglones de la BD Train:  21219
Renglones de la BD Test:  9247


In [10]:
from pyspark.ml.classification import DecisionTreeClassifier as DTC

# Fit a decision tree of depth at most 5 on the training split, using the
# indexed column 'label2' as the target.
rt = DTC(labelCol='label2', maxDepth=5)
model = rt.fit(bd_train)

# NOTE(review): predictions are generated for the same rows the tree was fitted
# on, so anything computed from `pred` describes the fit on the training data,
# not held-out performance; `bd_test` is not scored here. Confirm this is intended.
pred = model.transform(bd_train)



                                                                                

In [11]:
# Display the leading rows of the prediction frame (show() with its default row limit).
pred.show()

+-----+--------------------+------+---------------+--------------------+----------+
|label|            features|label2|  rawPrediction|         probability|prediction|
+-----+--------------------+------+---------------+--------------------+----------+
|  0.0|[-21.0,868.0,6.0,...|   0.0|[13665.0,985.0]|[0.93276450511945...|       0.0|
|  0.0|[-20.0,1440.0,6.0...|   0.0|[13665.0,985.0]|[0.93276450511945...|       0.0|
|  0.0|[-19.0,1440.0,3.0...|   0.0|[13665.0,985.0]|[0.93276450511945...|       0.0|
|  0.0|[-18.0,602.0,5.0,...|   0.0|[13665.0,985.0]|[0.93276450511945...|       0.0|
|  0.0|[-17.0,888.0,6.0,...|   0.0|[13665.0,985.0]|[0.93276450511945...|       0.0|
|  0.0|[-17.0,1440.0,1.0...|   0.0|[13665.0,985.0]|[0.93276450511945...|       0.0|
|  0.0|[-17.0,1744.0,1.0...|   0.0|[13665.0,985.0]|[0.93276450511945...|       0.0|
|  0.0|[-16.0,641.0,6.0,...|   0.0|[13665.0,985.0]|[0.93276450511945...|       0.0|
|  0.0|[-16.0,868.0,6.0,...|   0.0|[13665.0,985.0]|[0.93276450511945...|    

In [12]:
# Count how many rows share each distinct probability vector and list them from
# the least to the most frequent (at most 50 rows are shown).
(
    pred.groupBy('probability')
    .count()
    .sort('count')
    .show(50)
)



+--------------------+-----+
|         probability|count|
+--------------------+-----+
|           [1.0,0.0]|    2|
|[0.65116279069767...|   43|
|[0.35526315789473...|   76|
|[0.25984251968503...|  127|
|[0.66666666666666...|  144|
|[0.50531914893617...|  188|
|[0.09150326797385...|  306|
|[0.46683673469387...|  392|
|[0.27331887201735...|  461|
|[0.70650032829940...| 1523|
|[0.01118838826731...| 3307|
|[0.93276450511945...|14650|
+--------------------+-----+



                                                                                

In [13]:
# Confusion matrix: number of rows for every (actual, predicted) class pair.
# The actual class is read from 'label2' because the tree was fitted with
# labelCol='label2', so 'prediction' is expressed in that column's index space;
# the raw 'label' column only agrees with it when the indexer happens to map
# each label value to the same number.
pred.groupBy('label2', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 4235|
|  0.0|       1.0|  434|
|  1.0|       0.0| 1588|
|  0.0|       0.0|14962|
+-----+----------+-----+



In [14]:
# Produce a few summary statistics to get an idea of how the predictions turned out.

# A prediction is a success when it equals the indexed label. Comparing the two
# columns directly, rather than listing the 0.0/0.0 and 1.0/1.0 pairs, keeps the
# count correct for any number of classes.
numSuccesses = pred.where("prediction = label2").count()
numInspections = pred.count()

print ("Se realizaron", numInspections, "inspeciones y existen", numSuccesses, "predicciones existosas")
# Skip the success rate when no rows were scored, to avoid dividing by zero.
if numInspections:
    print ("Esta es una tasa de éxito del", str((float(numSuccesses) / float(numInspections)) * 100) + "%")


Se realizaron 21219 inspeciones y existen 19197 predicciones existosas
Esta es una tasa de éxito del 90.47080446769404%
