In [0]:
# Show the SparkContext and the Spark version it reports.
# NOTE(review): `sc` is not defined in this notebook; presumably the hosting
# runtime injects it -- confirm before running elsewhere.
print(sc, sc.version, sep="\n")

In [0]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one and
# only builds a new session otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
print(spark)

In [0]:
# List what the session catalog currently exposes in the active database.
catalog_tables = spark.catalog.listTables()
print(catalog_tables)

In [0]:
# Peek at the first ten rows of the `flights` table through the SQL interface.
sample_query = "FROM flights SELECT * LIMIT 10"
flights10 = spark.sql(sample_query)
flights10.show()

In [0]:
# Aggregate in Spark (number of flights per origin/dest pair), then pull the
# aggregated result onto the driver as a pandas DataFrame for local inspection.
route_count_query = "SELECT origin,dest,COUNT(*) as N FROM flights GROUP BY origin,dest"
flight_counts = spark.sql(route_count_query)
flight_counts_pd = flight_counts.toPandas()
flight_counts_pd.head()

Unnamed: 0,origin,dest,N
0,SEA,RNO,8
1,SEA,DTW,98
2,SEA,CLE,2
3,SEA,LAX,450
4,PDX,SEA,144


In [0]:
import pandas as pd
import numpy as np

# Build a tiny pandas DataFrame of ten random values and copy it into Spark.
pd_temp = pd.DataFrame(np.random.random(10))
print(pd_temp)
spark_temp = spark.createDataFrame(pd_temp)
spark_temp.show()

# Creating a Spark DataFrame does not register it anywhere: the catalog
# listing is unchanged at this point.
print(spark.catalog.listTables())  # table is not added to catalog

# Register the DataFrame as a global temporary view named "stemp".
spark_temp.createOrReplaceGlobalTempView("stemp")

# Global temp views live in the reserved `global_temp` database, so they do
# not appear when listing the current database. List `global_temp` explicitly
# to confirm the view was registered.
print(spark.catalog.listTables("global_temp"))

In [0]:
# Read the airports lookup file; header=True takes the column names from the
# first line of the CSV instead of generating default ones.
airports = spark.read.csv(
    path="/FileStore/tables/airports.csv",
    header=True,
)
airports.show()

In [0]:
# Load the catalog's `flights` table as a DataFrame and derive `duration_hrs`
# as air_time divided by 60.
flights = spark.table("flights")
flights = flights.withColumn(
    "duration_hrs",
    flights["air_time"] / 60,
)
flights.show()

In [0]:
# The same "distance > 1000" filter expressed two ways:
#   1. as a SQL statement against the catalog table `flights`
#   2. as a Column expression on the `flights` DataFrame built in this notebook
#      (which additionally carries the derived `duration_hrs` column).
long_flights1 = spark.sql("SELECT * FROM flights WHERE distance >1000")
long_flights2 = flights.where(flights["distance"] > 1000)

long_flights1.show()
long_flights2.show()

In [0]:
# Column projection by name.
selected1 = flights.select("origin", "dest", "tailnum")

# Column projection by Column object, restricted to SEA -> PDX flights.
selected2 = (
    flights
    .filter(flights["origin"] == "SEA")
    .filter(flights["dest"] == "PDX")
    .select(flights["origin"], flights["dest"], flights["carrier"])
)

selected1.show()
selected2.show()

In [0]:
from pyspark.sql.types import IntegerType

# Make air_time an integer column for the numeric aggregations further down.
flights = flights.withColumn("air_time", flights["air_time"].cast(IntegerType()))

# Average speed = distance / duration in hours.
# The division must be parenthesised before `.alias(...)`: attribute access
# binds tighter than `/`, so `a / b.alias("x")` would only rename `b` and the
# resulting column would not be called "avg_speed".
avg_speed_col = (flights["distance"] / flights["duration_hrs"]).alias("avg_speed")

# Kept as a single-column DataFrame, as before, now with the intended name.
avg_speed = flights.select(avg_speed_col)

# Add the derived column directly in the projection. Joining `avg_speed` with
# no join condition would pair every flight with every speed value (a
# Cartesian product) instead of attaching each flight's own speed.
speed1 = flights.select("origin", "dest", "tailnum", avg_speed_col)
speed2 = flights.selectExpr(
    "origin",
    "dest",
    "tailnum",
    "distance/duration_hrs as avg_speed",
)
speed2.show()

In [0]:
# groupBy() with no columns treats the whole (filtered) frame as one group,
# which lets the GroupedData helpers compute a single overall value.

# Smallest `distance` among flights departing PDX.
(
    flights
    .filter(flights["origin"] == "PDX")
    .groupBy()
    .min("distance")
    .show()
)

# Largest `air_time` among flights departing SEA.
(
    flights
    .filter(flights["origin"] == "SEA")
    .groupBy()
    .max("air_time")
    .show()
)

In [0]:
# Mean air_time per carrier, for flights departing SEA only.
(
    flights
    .filter(flights["origin"] == "SEA")
    .groupBy(flights["carrier"])
    .avg("air_time")
    .show()
)

# Total air_time accumulated by each tail number, across all origins.
(
    flights
    .groupBy(flights["tailnum"])
    .sum("air_time")
    .show()
)

In [0]:
# Keep the GroupedData objects around so several aggregations can reuse them.
by_plane = flights.groupBy("tailnum")
by_origin = flights.groupBy("origin")

# Number of flights recorded for each plane.
by_plane.count().show()

# Mean air_time for each origin airport.
by_origin.avg("air_time").show()

In [0]:
from pyspark.sql import functions as F

# dep_delay is cast to integer before the numeric aggregations below.
flights = flights.withColumn(
    "dep_delay",
    flights["dep_delay"].cast(IntegerType()),
)

# One group per (dest, month) pair.
by_month_dest = flights.groupBy("dest", "month")

# Built-in shortcut for the mean departure delay per group.
by_month_dest.avg("dep_delay").show()

# There is no stddev shortcut on GroupedData, so go through agg() with a
# function from pyspark.sql.functions.
by_month_dest.agg(F.stddev("dep_delay")).show()

In [0]:
# Rename the airports key column so both frames share the join key `dest`.
airports = airports.withColumnRenamed("faa", "dest")

# Left outer join: every flight is kept, with airport columns filled in where
# the destination code is found in `airports`.
flights_with_airports = flights.join(airports, "dest", "leftouter")
flights_with_airports.show()

In [0]:
# Load the planes table and rename its `year` column to `plane_year` so it
# does not clash with the `year` column referenced from the flights side.
planes = spark.table("planes").withColumnRenamed("year", "plane_year")

# Left outer join with `planes` on the left: every plane is kept, and flight
# columns are null for planes that have no matching tailnum in `flights`.
model_data = planes.join(flights, "tailnum", "leftouter")
model_data.show()

In [0]:
from pyspark.sql.types import IntegerType

# Cast the columns used for modelling to integers, one after another.
for numeric_col in ("arr_delay", "air_time", "month", "plane_year"):
    model_data = model_data.withColumn(
        numeric_col,
        model_data[numeric_col].cast(IntegerType()),
    )

# Show the resulting (column, type) pairs to confirm the casts.
model_data.dtypes

In [0]:
# plane_age = `year` column minus the plane's `plane_year`.
model_data = model_data.withColumn(
    "plane_age",
    model_data["year"] - model_data["plane_year"],
)
model_data.show()

In [0]:
# Boolean target: the flight arrived late when arr_delay is positive.
model_data = model_data.withColumn("is_late", model_data["arr_delay"] > 0)

# Integer copy of the target under the column name `label`, followed by
# removal of rows with a null in any of the four listed columns.
model_data = (
    model_data
    .withColumn("label", model_data["is_late"].cast(IntegerType()))
    .filter("arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL")
)

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Two-step encoding of the categorical `carrier` column:
# string -> numeric index -> one-hot vector.
carr_indexer = StringIndexer(
    inputCol="carrier",
    outputCol="carrier_index",
)
carr_encoder = OneHotEncoder(
    inputCol="carrier_index",
    outputCol="carrier_fact",
)

In [0]:
# Same two-step encoding for the categorical `dest` column:
# string -> numeric index -> one-hot vector.
dest_indexer = StringIndexer(
    inputCol="dest",
    outputCol="dest_index",
)
dest_encoder = OneHotEncoder(
    inputCol="dest_index",
    outputCol="dest_fact",
)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Concatenate the numeric columns and the one-hot vectors into the single
# `features` vector column; the order here fixes the order inside the vector.
feature_columns = ["month", "air_time", "carrier_fact", "dest_fact", "plane_age"]
vec_assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

In [0]:
from pyspark.ml import Pipeline

# Stage order matters: each encoder consumes its indexer's output, and the
# assembler consumes both encoders' outputs.
flights_pipe = Pipeline(
    stages=[
        dest_indexer,
        dest_encoder,
        carr_indexer,
        carr_encoder,
        vec_assembler,
    ]
)

In [0]:
# Fit the feature pipeline on the full modelling frame and apply it to the
# same frame, producing the encoded columns and `features`.
piped_data = (
    flights_pipe
    .fit(model_data)
    .transform(model_data)
)

In [0]:
# NOTE(review): this cell repeats the fit/transform of the preceding cell and
# rebinds `piped_data` to a result built from the same inputs; it looks like a
# leftover duplicate -- confirm whether it can be dropped.
piped_data = flights_pipe.fit(dataset=model_data).transform(dataset=model_data)

In [0]:
# 60/40 train/test split. The seed is fixed so that re-running the notebook
# yields the same partition; without it every run trains and evaluates on
# different rows and the reported metric cannot be compared between runs.
training, test = piped_data.randomSplit([.6, .4], seed=42)

In [0]:
from pyspark.ml.classification import LogisticRegression

# Estimator with library-default settings; hyperparameters are supplied later
# through the parameter grid.
lr = LogisticRegression()

In [0]:
import pyspark.ml.evaluation as evals

# Score binary classifiers by area under the ROC curve.
evaluator = evals.BinaryClassificationEvaluator(
    metricName="areaUnderROC",
)

In [0]:
import pyspark.ml.tuning as tune

# Search space: regParam over np.arange(0, .1, .01) crossed with
# elasticNetParam in {0, 1}. addGrid() returns the builder, so the calls chain.
grid = (
    tune.ParamGridBuilder()
    .addGrid(lr.regParam, np.arange(0, .1, .01))
    .addGrid(lr.elasticNetParam, [0, 1])
    .build()
)

In [0]:
# Cross-validated grid search over `grid`, scored with `evaluator`.
# The seed pins the random assignment of rows to folds so that the selected
# hyperparameters are reproducible across runs.
cv = tune.CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    seed=42,
)

In [0]:
# Fit the logistic regression directly on the training split, without going
# through the cross-validator.
# NOTE(review): despite the name, this is not the tuned model -- that one is
# `best_lr` (lower-case), taken from the cross-validator in the next cell.
Best_lr = lr.fit(dataset=training)
Best_lr

In [0]:
# Run the cross-validated grid search on the training split and keep the
# model the cross-validator reports as best.
models = cv.fit(dataset=training)
best_lr = models.bestModel
best_lr

In [0]:
# Score the held-out split with the selected model and report the evaluator's
# metric (configured above as areaUnderROC).
test_results = best_lr.transform(dataset=test)
test_metric = evaluator.evaluate(test_results)
print(test_metric)