In [1]:
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import *

# SparkSession is not exported by the pyspark.ml wildcard imports above, so it
# has to be imported explicitly; without this the next line raises NameError on
# a kernel that does not pre-load it.
from pyspark.sql import SparkSession

# Obtain the session used by every later cell, then display it.
spark = SparkSession.builder.getOrCreate()
spark

In [2]:
# Read the three input tables; header=True takes column names from the first row.
flights, airports, planes = (
    spark.read.csv(csv_path, header=True)
    for csv_path in ('flights_small.csv', 'airports.csv', 'planes.csv')
)

In [3]:
# Give the planes table's `year` a distinct name so it stays distinguishable
# from the flights table's `year` after the join.
planes = planes.withColumnRenamed('year', 'plane_year')

# Left outer join on tail number: every flight row is kept, with plane
# attributes attached where the tail number matches.
model_data = flights.join(
    planes,
    on="tailnum",
    how="leftouter",
)
model_data

DataFrame[tailnum: string, year: string, month: string, day: string, dep_time: string, dep_delay: string, arr_time: string, arr_delay: string, carrier: string, flight: string, origin: string, dest: string, air_time: string, distance: string, hour: string, minute: string, plane_year: string, type: string, manufacturer: string, model: string, engines: string, seats: string, speed: string, engine: string]

In [4]:
# Display the Python type of the joined result.
type(model_data)

pyspark.sql.dataframe.DataFrame

In [5]:
# Every column is a string after loading; convert the ones used numerically
# for modelling to integers, one column at a time.
for int_col in ("arr_delay", "air_time", "month", "plane_year"):
    model_data = model_data.withColumn(int_col, model_data[int_col].cast("integer"))
model_data

DataFrame[tailnum: string, year: string, month: int, day: string, dep_time: string, dep_delay: string, arr_time: string, arr_delay: int, carrier: string, flight: string, origin: string, dest: string, air_time: int, distance: string, hour: string, minute: string, plane_year: int, type: string, manufacturer: string, model: string, engines: string, seats: string, speed: string, engine: string]

In [6]:
# plane_age: flight year minus the plane's year
with_age = model_data.withColumn(
    "plane_age", model_data["year"] - model_data["plane_year"]
)

# is_late: boolean flag, true when the arrival delay is positive
with_flag = with_age.withColumn("is_late", with_age["arr_delay"] > 0)

# label: the same flag as an integer
with_label = with_flag.withColumn("label", with_flag["is_late"].cast("integer"))

# Keep only rows where all four listed columns are present
model_data = with_label.where("arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL")

In [7]:
# Carrier: string -> numeric index -> one-hot vector
carr_indexer = StringIndexer(
    inputCol="carrier",
    outputCol="carrier_index",
)
carr_encoder = OneHotEncoder(
    inputCol="carrier_index",
    outputCol="carrier_fact",
)

# Destination: string -> numeric index -> one-hot vector
dest_indexer = StringIndexer(
    inputCol="dest",
    outputCol="dest_index",
)
dest_encoder = OneHotEncoder(
    inputCol="dest_index",
    outputCol="dest_fact",
)

In [8]:
# Combine the model inputs into a single vector column named "features".
feature_columns = ["month", "air_time", "carrier_fact", "dest_fact", "plane_age"]
vec_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [9]:
# Stage order matters: each encoder consumes its indexer's output column, and
# the assembler consumes both encoders' output columns.
pipeline_stages = [
    dest_indexer,
    dest_encoder,
    carr_indexer,
    carr_encoder,
    vec_assembler,
]
flights_pipe = Pipeline(stages=pipeline_stages)

In [10]:
# Fit the preprocessing pipeline on the full dataset, then apply it to that
# same dataset to add the encoded columns and the "features" vector.
fitted_pipe = flights_pipe.fit(model_data)
piped_data = fitted_pipe.transform(model_data)

In [11]:
# Split the data into training (60%) and test (40%) sets.
# A fixed seed pins the random assignment so that re-running the notebook on
# the same input produces the same split and comparable metrics.
# NOTE(review): repeatability also assumes the upstream data and its
# partitioning are unchanged between runs.
training, test = piped_data.randomSplit([.6, .4], seed=42)

In [12]:
# Scores binary predictions by area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
)

# Logistic regression estimator with its default settings; the regularisation
# hyperparameters are explored by the cross-validation grid.
lr = LogisticRegression()

In [13]:
# Hyperparameter grid: regParam from 0 up to (not including) 0.1 in steps of
# 0.01, crossed with elasticNetParam values 0 and 1.
# Built in one chain so `grid` only ever refers to the final list of parameter
# maps rather than first a builder and then a list.
grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, np.arange(0, .1, .01))
    .addGrid(lr.elasticNetParam, [0, 1])
    .build()
)

# Cross-validate `lr` over every grid entry, scoring with `evaluator`.
# The explicit seed pins the fold assignment so model selection is repeatable.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    seed=42,
)

# Fit cross-validation models on the training split only
models = cv.fit(training)

# Extract the best-scoring model found by the search
best_lr = models.bestModel

print(best_lr)

LogisticRegressionModel: uid = LogisticRegression_fabce495dda5, numClasses = 2, numFeatures = 81


In [14]:
# Score the held-out test rows with the selected model
test_results = best_lr.transform(test)

# Compute the evaluator's metric on those predictions and report it
test_auc = evaluator.evaluate(test_results)
print(test_auc)

0.6893143311969976
