In [8]:
#import findspark 
#findspark.init()

import pyspark
import pandas as pd
import numpy as np

### Configuración de sesión de spark

In [None]:
# Get the active SparkContext, or start one if none exists, so this cell also
# works on a fresh kernel where no `sc` global has been injected.
sc = pyspark.SparkContext.getOrCreate()

# Verify SparkContext
print(sc)

# Print Spark version
print(sc.version)

### Creación de la sesión de spark

In [None]:
# Import SparkSession from pyspark.sql
from pyspark.sql import SparkSession

# Create my_spark: getOrCreate() returns the already-running session if there
# is one, otherwise it builds a new one.
my_spark = SparkSession.builder.getOrCreate()

# The cells below refer to the session as `spark`; bind that name to the same
# session so they do not rely on a global provided by the environment.
spark = my_spark

# Print my_spark
print(my_spark)

### Lista de tablas en el catalogo

In [None]:
# List every table currently registered in the session catalog
catalog_tables = spark.catalog.listTables()
print(catalog_tables)

### Consulta datos en el catalogo

In [None]:
# SQL: number of rows in the flights table for every (origin, dest) pair
query = "SELECT origin, dest, COUNT(*) as N FROM flights GROUP BY origin, dest"

# Execute the query against the catalog, then display the result
flight_counts = spark.sql(query)
flight_counts.show()

### Conversión de spark dataframe a dataframe pandas

In [None]:
# Collect the Spark result into a local pandas DataFrame
pd_counts = flight_counts.toPandas()

# Print the head of pd_counts
print(pd_counts.head())

# Show the Spark DataFrame the pandas copy was made from. The original line
# called .show() on `flights10`, a name that no cell visible here defines.
flight_counts.show()

### Creación de tabla en el catalogo de tablas de spark

In [None]:
# Build a one-column pandas DataFrame holding 10 random floats
random_values = np.random.random(10)
pd_temp = pd.DataFrame(random_values)

# Turn the pandas DataFrame into a Spark DataFrame
spark_temp = spark.createDataFrame(pd_temp)

# Catalog contents before the view is registered
tables_before = spark.catalog.listTables()
print(tables_before)

# Register spark_temp in the catalog as the temporary view "temp"
spark_temp.createOrReplaceTempView("temp")

# Catalog contents after the view is registered
tables_after = spark.catalog.listTables()
print(tables_after)

### Leyendo información de un archivo

In [None]:
# Location of the airports CSV file
file_path = "/usr/local/share/datasets/airports.csv"

# Load the CSV, using its first row as the column names
airports = spark.read.csv(path=file_path, header=True)

# Display the loaded data
airports.show()

### Operaciones con Columnas (Creación)

In [None]:
# Fetch the catalog table "flights" as a Spark DataFrame
flights = spark.table("flights")

# Display its first rows
flights.show()

##### Operaciones con Columnas (Creación I)

In [None]:
# Add duration_hrs (air_time divided by 60) to flights; withColumn returns a
# new DataFrame, so the name is rebound.
duration_hrs_col = flights.air_time / 60
flights = flights.withColumn("duration_hrs", duration_hrs_col)

# Add plane_age (year minus plane_year) to model_data.
# NOTE(review): model_data is first assigned in the Join section further down,
# so on a top-to-bottom run this statement needs that cell to have run first.
plane_age_col = model_data.year - model_data.plane_year
model_data = model_data.withColumn("plane_age", plane_age_col)

##### Operaciones con Columnas (Creación II)

In [None]:
# Select air_time / 60 as its own one-column DataFrame.
# The original assigned this result to `flights.air_time`, which stores a
# DataFrame as an attribute on `flights`. Later `flights.air_time` lookups
# would then return that DataFrame instead of the air_time Column and break
# expressions such as `flights.air_time / 60`. A separate name avoids that.
air_time_hrs = flights.select(flights.air_time / 60)

##### Operaciones con Columnas (Creación III)

In [None]:
# Column expression: distance divided by (air_time / 60), named "avg_speed"
speed_expr = flights.distance / (flights.air_time / 60)
avg_speed = speed_expr.alias("avg_speed")

# DataFrame-API version: three columns by name plus the computed column
speed1 = flights.select("origin", "dest", "tailnum", avg_speed)

# SQL-expression version of the same selection
speed2 = flights.selectExpr(
    "origin",
    "dest",
    "tailnum",
    "distance/(air_time/60) as avg_speed",
)

### Operaciones con Columnas (Renombrar)

In [None]:
# Name a computed column with .alias() (the selection result is not stored)
duration_col = (flights.air_time / 60).alias("duration_hrs")
flights.select(duration_col)

# The SQL `as` keyword is the equivalent of .alias() (result also not stored)
flights.selectExpr("air_time/60 as duration_hrs")

# Rename an existing column: "faa" becomes "dest" in airports
airports = airports.withColumnRenamed("faa", "dest")

### Selección de columnas

In [None]:
# First selection: columns referenced by name
selected1 = flights.select("tailnum", "origin", "dest")

# Second selection: columns referenced as Column attributes of flights
temp = flights.select(
    flights.origin,
    flights.dest,
    flights.carrier,
)

### Filtrado de Columnas

In [None]:
# Filter using a SQL condition string
flights.filter("air_time > 120").show()

# Keep only the model_data rows where none of the four listed columns is NULL.
# NOTE(review): model_data is first assigned in the Join section further down,
# so on a top-to-bottom run this statement needs that cell to have run first.
model_data = model_data.filter(
    "arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL"
)

# Same condition as the first filter, expressed as a boolean Column
long_air_time = flights.air_time > 120
flights.filter(long_air_time).show()

The difference between .select() and .withColumn() methods is that .select() returns only the columns you specify, while .withColumn() returns all the columns of the DataFrame in addition to the one you defined.

In [None]:
# Boolean Column: origin equals "SEA"
filterA = flights.origin == "SEA"

# Boolean Column: dest equals "PDX"
filterB = flights.dest == "PDX"

# Apply filterA to temp, then filterB to the result
temp_from_sea = temp.filter(filterA)
selected2 = temp_from_sea.filter(filterB)

### Agregación de columnas

In [None]:
# Shortest flight from PDX in terms of distance: an empty groupBy() aggregates
# over all the filtered rows at once
pdx_flights = flights.filter(flights.origin == 'PDX')
pdx_flights.groupBy().min("distance").show()

# Longest flight from SEA in terms of air time
sea_flights = flights.filter(flights.origin == 'SEA')
sea_flights.groupBy().max("air_time").show()

In [None]:
# Display flights. show() prints the table itself and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line.
flights.show()

# Average air time of Delta ("DL") flights departing from SEA
flights.filter(flights.carrier == 'DL').filter(flights.origin == 'SEA').groupBy().avg('air_time').show()

# Total hours in the air
flights.withColumn("duration_hrs", flights.air_time/60).groupBy().sum("duration_hrs").show()

In addition to the GroupedData methods you've already seen, there is also the .agg() method. This method lets you pass an aggregate column expression that uses any of the aggregate functions from the pyspark.sql.functions submodule.

In [None]:
# Aggregate functions for .agg() come from pyspark.sql.functions
import pyspark.sql.functions as F

# Group flights by month and destination
by_month_dest = flights.groupBy("month", "dest")

# Average departure delay for each (month, dest) group
by_month_dest.avg("dep_delay").show()

# Standard deviation of departure delay for each group, passed through .agg()
dep_delay_std = F.stddev("dep_delay")
by_month_dest.agg(dep_delay_std).show()

### Join

In PySpark, joins are performed using the DataFrame method .join(). This method takes three arguments. The first is the second DataFrame that you want to join with the first one. The second argument, on, is the name of the key column(s) as a string. The names of the key column(s) must be the same in each table. The third argument, how, specifies the kind of join to perform. In this course we'll always use the value how="leftouter".

In [None]:
# Examine the data. show() prints the table and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
airports.show()

# Rename the faa column so both DataFrames share the key column name "dest"
airports = airports.withColumnRenamed("faa", "dest")

# Left outer join of flights with airports on "dest"
flights_with_airports = flights.join(airports, "dest", how="leftouter")

# Left outer join of flights with planes on "tailnum".
# NOTE(review): `planes` is not assigned in any cell visible here; confirm it
# is loaded before this cell runs.
model_data = flights.join(planes, on='tailnum', how="leftouter")

# Examine the new DataFrame
flights_with_airports.show()

### Cast

In [None]:
# General pattern for changing a column's type:
#   dataframe = dataframe.withColumn("col", dataframe.col.cast("new_type"))
# It is kept as a comment because `dataframe`, "col" and "new_type" are
# placeholders; executing the line as code fails.

# Concrete use of the pattern: cast arr_delay to integer
model_data = model_data.withColumn("arr_delay", model_data.arr_delay.cast("integer"))

### One hot encoding - Binarization

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# StringIndexer reading "carrier" and writing "carrier_index"
carr_indexer = StringIndexer(
    inputCol="carrier",
    outputCol="carrier_index",
)
print(type(carr_indexer))

# OneHotEncoder reading "carrier_index" and writing "carrier_fact"
carr_encoder = OneHotEncoder(
    inputCol="carrier_index",
    outputCol="carrier_fact",
)
print(type(carr_encoder))

### Make a VectorAssembler

In [None]:
from pyspark.ml.feature import VectorAssembler

# Input columns that are combined into the single "features" vector column
feature_cols = ["month", "air_time", "carrier_fact", "dest_fact", "plane_age"]

# Make a VectorAssembler
vec_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [None]:
# Import Pipeline
from pyspark.ml import Pipeline

# vec_assembler reads a "dest_fact" column, but only the carrier
# indexer/encoder pair is defined above. Build the matching pair for "dest",
# mirroring the carrier stages.
dest_indexer = StringIndexer(inputCol="dest", outputCol="dest_index")
dest_encoder = OneHotEncoder(inputCol="dest_index", outputCol="dest_fact")

# Chain the stages in dependency order: indexers feed encoders, and the
# encoders feed the assembler. The "Test vs Train" cell fits and applies
# flights_pipe, which was previously never defined.
flights_pipe = Pipeline(stages=[dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler])

### Test vs Train

In Spark it's important to make sure you split the data after all the transformations. This is because operations like StringIndexer don't always produce the same index even when given the same list of strings.

In [None]:
# Fit the pipeline on model_data and apply the fitted stages to the same data
piped_data = flights_pipe.fit(model_data).transform(model_data)

# Split the data into training and test sets using weights 0.6 / 0.4.
# A fixed seed makes the random split repeatable across runs; without it every
# run trains and evaluates on different rows.
training, test = piped_data.randomSplit([.6, .4], seed=42)

### What is logistic regression?

The model you'll be fitting in this chapter is called a logistic regression. This model is very similar to a linear regression, but instead of predicting a numeric variable, it predicts the probability (between 0 and 1) of an event.

To use this as a classification algorithm, all you have to do is assign a cutoff point to these probabilities. If the predicted probability is above the cutoff point, you classify that observation as a 'yes' (in this case, the flight being late), if it's below, you classify it as a 'no'!

In [None]:
# The LogisticRegression estimator lives in pyspark.ml.classification
from pyspark.ml.classification import LogisticRegression

# Estimator created with no arguments, i.e. all parameters at their defaults.
# NOTE(review): the default label column is presumably "label"; no cell visible
# here creates a column with that name, so confirm it exists in model_data.
lr = LogisticRegression()

### Create the evaluator - Cross fold validation

In [None]:
# Evaluation classes are accessed through the `evals` alias
import pyspark.ml.evaluation as evals

# Binary-classification evaluator configured with the "areaUnderROC" metric
evaluator = evals.BinaryClassificationEvaluator(
    metricName="areaUnderROC",
)

The submodule pyspark.ml.tuning is imported as tune. The CrossValidator is created by passing it the logistic regression Estimator lr, the parameter grid, and the evaluator created in the previous section.

In [None]:
import pyspark.ml.tuning as tune
# Create the CrossValidator
cv = tune.CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)

### Tuning a model

In [None]:
# Import the tuning submodule
import pyspark.ml.tuning as tune

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

### Fit the model(s)

In [None]:
# Run the cross validation on the training set
models = cv.fit(training)

# Keep the best model found by the cross validation
best_lr = models.bestModel

In [None]:
# Fit lr directly on the training set. This rebinds best_lr, so the value
# assigned from models.bestModel in the previous cell is replaced.
# NOTE(review): confirm this cell is meant as an alternative to the
# cross-validation fit rather than something to run after it.
best_lr = lr.fit(training)

### Making predictions

In [None]:
# Apply the fitted model to the test set
test_results = best_lr.transform(test)

# Score the predictions with the evaluator and print the metric value
test_metric = evaluator.evaluate(test_results)
print(test_metric)