# Indice

[1.1. Creating the spark session and context](#1.1.)

### 1.1. Creating the spark session and context [Indice](#Indice)

In [1]:
# Import the PySpark module
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [2]:
# Create the SparkSession first so that master and appName actually apply:
# builder.getOrCreate() reuses a SparkContext that already exists, and an
# already-running context keeps the settings it was started with.
spark = SparkSession.builder.master('local[*]').appName('test').getOrCreate()

# Expose the session's underlying SparkContext under the usual `sc` name
sc = spark.sparkContext

In [3]:
# Report the version of Spark that the active session is running on
print(spark.version)

2.4.4


### Read data from CSV file (inferring schema) [Indice](#Indice)

In [4]:
# Load the flights data, letting Spark infer column types and treating 'NA' as null.
# Schema inference is not the best choice for large data sets.
flights = spark.read.csv(
    'flights.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

### Exploring data

In [5]:
# Count the rows, then report the total
record_count = flights.count()
print("The data contain %d records." % record_count)

The data contain 50000 records.


In [6]:
# Display the first five rows as a text table
flights.show(n=5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [7]:
# Summary statistics per column, pulled into pandas and flipped so that
# each source column becomes a row
flights.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
mon,50000,5.2351,3.437758623534696,0,11
dom,50000,15.66196,8.772488135606777,1,31
dow,50000,2.95236,1.966033503314405,0,6
carrier,50000,,,AA,WN
flight,50000,2054.31344,2182.4715300582875,1,6941
org,50000,,,JFK,TUS
mile,50000,882.40112,701.232785607705,67,4243
depart,50000,14.130952600000064,4.694052286573998,0.25,23.98
duration,50000,151.76582,87.04507290261697,30,560


In [8]:
# Check column data types: a list of (column name, type name) pairs
flights.dtypes

[('mon', 'int'),
 ('dom', 'int'),
 ('dow', 'int'),
 ('carrier', 'string'),
 ('flight', 'int'),
 ('org', 'string'),
 ('mile', 'int'),
 ('depart', 'double'),
 ('duration', 'int'),
 ('delay', 'int')]

### Column manipulation

In [9]:
# Import the functions module under an alias rather than importing `round`
# by name, which would replace Python's built-in round() in the notebook
from pyspark.sql import functions as F

# Convert 'mile' to 'km' (factor 1.60934, rounded to 0 decimals) and drop 'mile'
flights_km = flights.withColumn('km', F.round(flights.mile * 1.60934, 0)).drop('mile')

# Create 'label' column indicating whether flight delayed (1) or not (0):
# a delay of 15 or more counts as delayed; a null delay yields a null label
flights_km = flights_km.withColumn('label', (flights_km.delay >= 15).cast('integer'))

# NOTE: `flights` is rebound to the converted frame, which no longer has a
# 'mile' column, so re-running this cell without re-reading the CSV fails
# on `flights.mile`.
flights = flights_km

# Check first five records
flights_km.show(5)

+---+---+---+-------+------+---+------+--------+-----+------+-----+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|    km|label|
+---+---+---+-------+------+---+------+--------+-----+------+-----+
| 11| 20|  6|     US|    19|JFK|  9.48|     351| null|3465.0| null|
|  0| 22|  2|     UA|  1107|ORD| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|   226|SFO|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|   419|ORD| 10.33|     195|   -5|1989.0|    0|
|  4|  2|  5|     AA|   325|ORD|  8.92|      65| null| 415.0| null|
+---+---+---+-------+------+---+------+--------+-----+------+-----+
only showing top 5 rows



In [10]:
# Keep only the rows that have a label (where() is an alias of filter())
flights = flights.where(flights['label'].isNotNull())

### Indexing categorical data

In [11]:
from pyspark.ml.feature import StringIndexer

# --- carrier -> carrier_idx ---
# Configure the indexer, let it learn the carrier categories from the data,
# then append the numeric index column.
indexer = StringIndexer(outputCol='carrier_idx', inputCol='carrier')
indexer_model = indexer.fit(flights)
flights_indexed = indexer_model.transform(flights)

# --- org -> org_idx ---
# Same three steps for the other categorical feature, chained on the
# already-indexed frame.
flights_indexed = (
    StringIndexer(outputCol='org_idx', inputCol='org')
    .fit(flights_indexed)
    .transform(flights_indexed)
)

In [12]:
# Continue with the indexed frame under the working name `flights`
flights = flights_indexed
# Inspect the first row to confirm the new *_idx columns are present
flights.head()

Row(mon=0, dom=22, dow=2, carrier='UA', flight=1107, org='ORD', depart=16.33, duration=82, delay=30, km=509.0, label=1, carrier_idx=0.0, org_idx=0.0)

### Assembling columns

In [13]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the predictor columns, in this order, into a single
# vector column named 'features'
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'mon',
        'dom',
        'dow',
        'carrier_idx',
        'org_idx',
        'km',
        'depart',
        'duration',
    ],
)

# Append the 'features' column to the flights data
flights_assembled = assembler.transform(flights)

In [14]:
#flights_assembled.head()

In [15]:
# Continue with the assembled frame (now carrying 'features') as `flights`
flights = flights_assembled

In [16]:
# Inspect the first row, including the assembled 'features' vector
flights.head()

Row(mon=0, dom=22, dow=2, carrier='UA', flight=1107, org='ORD', depart=16.33, duration=82, delay=30, km=509.0, label=1, carrier_idx=0.0, org_idx=0.0, features=DenseVector([0.0, 22.0, 2.0, 0.0, 0.0, 509.0, 16.33, 82.0]))

### Train/test split

In [17]:
# Random 80:20 split into training and testing sets, with a fixed seed
flights_train, flights_test = flights.randomSplit(weights=[0.8, 0.2], seed=17)

# Sanity check: the training set should hold roughly 80% of the records
train_rows = flights_train.count()
total_rows = flights.count()
training_ratio = train_rows / total_rows
print(training_ratio)

0.7980732423121092


In [18]:
# Inspect the first row of the training set
flights_train.head()

Row(mon=0, dom=1, dow=2, carrier='AA', flight=3, org='JFK', depart=12.0, duration=370, delay=11, km=3983.0, label=0, carrier_idx=1.0, org_idx=2.0, features=DenseVector([0.0, 1.0, 2.0, 1.0, 2.0, 3983.0, 12.0, 370.0]))

### Build a Decision Tree

In [19]:
# Decision tree classifier from Spark ML
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the classifier only; it is not fitted in this cell
tree = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='label',
)

In [20]:
# Fit the decision tree to the training data
tree_model = tree.fit(flights_train)

In [21]:
# Score the testing data with the fitted tree
prediction = tree_model.transform(flights_test)

# Show label, predicted class and class probabilities for five rows,
# without truncating the probability vectors
prediction.select('label', 'prediction', 'probability').show(n=5, truncate=False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.35975805932694166,0.6402419406730584]|
|1    |1.0       |[0.35975805932694166,0.6402419406730584]|
|1    |0.0       |[0.7702888583218707,0.2297111416781293] |
|1    |1.0       |[0.35975805932694166,0.6402419406730584]|
|0    |1.0       |[0.35975805932694166,0.6402419406730584]|
+-----+----------+----------------------------------------+
only showing top 5 rows



### Evaluate the Decision Tree

In [22]:
# Create a confusion matrix: one aggregate over (label, prediction), displayed once
confusion = prediction.groupBy('label', 'prediction').count()
confusion.show()

# Calculate the elements of the confusion matrix from that same aggregate
# instead of launching a separate filter/count job for each element.
# Keys are (label, prediction) pairs; an absent combination counts as 0.
# In Python 0 == 0.0 and 1 == 1.0 (with equal hashes), so the integer labels
# and double predictions can be looked up with plain integer keys.
cell_counts = {
    (row['label'], row['prediction']): row['count']
    for row in confusion.collect()
}
TN = cell_counts.get((0, 0), 0)
TP = cell_counts.get((1, 1), 0)
FN = cell_counts.get((1, 0), 0)
FP = cell_counts.get((0, 1), 0)

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP) / (TN + TP + FN + FP)
print(accuracy)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1139|
|    0|       0.0| 2343|
|    1|       1.0| 3726|
|    0|       1.0| 2287|
+-----+----------+-----+

0.6391785150078989


In [23]:
# Shut down Spark by stopping the session; `spark` cannot be used after this
# NOTE(review): per the PySpark docs this also stops the underlying SparkContext — confirm
spark.stop()