In [0]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session named 'test' on the local master,
# using all available cores ('local[*]').
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)
print(spark.version)
# Call spark.stop() once the analysis is finished to shut the session down.

In [0]:
# Read the flights CSV: the first row is the header and column types are inferred.
flights = spark.read.csv(
    '/FileStore/tables/flights3.csv',
    header=True,
    inferSchema=True,
)

# Quick sanity checks: record count, first five rows, inferred column types.
print("The data contain %d records." % flights.count())
flights.show(5)
print(flights.dtypes)

In [0]:
# Remove the 'flight' column.
flights_drop_column = flights.drop('flight')

# Report how many records have no delay value (printed so the count is
# visible; a bare expression in the middle of a cell is never displayed).
print(flights_drop_column.filter('delay IS NULL').count())

# Keep only the records whose delay is known. The predicate must be
# IS NOT NULL: filtering on IS NULL would keep exactly the rows that the
# dropna() below throws away, leaving an empty DataFrame.
flights_valid_delay = flights_drop_column.filter('delay IS NOT NULL')

# Drop any remaining row that has a missing value in any column.
flights_none_missing = flights_valid_delay.dropna()
print(flights_none_missing.count())

In [0]:
# NOTE: this import shadows Python's built-in round() for the rest of the notebook.
from pyspark.sql.functions import round

# Add 'km' (mile * 1.60934, rounded to 0 decimals) and remove 'mile'.
# Re-running this cell on its own fails, because 'mile' is gone afterwards.
flights = flights.withColumn(
    'km',
    round(flights.mile * 1.60934, 0),
).drop('mile')

# Integer label: 1 when delay >= 15, otherwise 0.
flights = flights.withColumn(
    'label',
    (flights.delay >= 15).cast('integer'),
)

flights.show(5)

In [0]:
from pyspark.ml.feature import StringIndexer

# Map the 'carrier' strings to numeric indices in 'carrier_idx'.
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')
indexer_model = indexer.fit(flights)
flights_indexed = indexer_model.transform(flights)

# Same treatment for 'org', written to 'org_idx'.
flights_indexed = (
    StringIndexer(inputCol='org', outputCol='org_idx')
    .fit(flights_indexed)
    .transform(flights_indexed)
)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Pack the predictor columns into a single 'features' vector column.
# The order of inputCols fixes the position of each value in the vector.
assembler = VectorAssembler(
    inputCols=[
        'mon', 'dom', 'dow',
        'carrier_idx', 'org_idx',
        'km', 'depart', 'duration',
    ],
    outputCol='features',
)
flights_assembled = assembler.transform(flights_indexed)

# Show the assembled vectors next to 'delay' without truncating the output.
flights_assembled.select('features', 'delay').show(5, truncate=False)

In [0]:
# Discard rows with any missing value, then split 80/20 with a fixed seed.
flights_assembled = flights_assembled.dropna()
flights_train, flights_test = flights_assembled.randomSplit([0.8, 0.2], seed=17)

# Share of the split data that landed in the training set. The denominator
# has to be the frame that was actually split: `flights` still contains the
# rows removed by dropna() above, which would understate the ratio.
training_ratio = flights_train.count() / flights_assembled.count()
print(training_ratio)

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

# Make sure the training split has no rows with missing values.
flights_train = flights_train.dropna()

# Fit a decision tree with default settings on the training split.
Tree = DecisionTreeClassifier()
Tree_model = Tree.fit(flights_train)

# Score the test split and compare labels with predictions and probabilities.
prediction = Tree_model.transform(flights_test)
prediction.select(
    'label',
    'prediction',
    'probability',
).show()

In [0]:
# Confusion matrix as counts per (label, prediction) pair.
prediction.groupBy('label', 'prediction').count().show()

# Individual confusion-matrix cells: correct predictions first, then errors.
TP = prediction.filter('prediction = 1 AND label = prediction').count()
TN = prediction.filter('prediction = 0 AND label = prediction').count()
FP = prediction.filter('prediction = 1 AND label != prediction').count()
FN = prediction.filter('prediction = 0 AND label != prediction').count()

# Accuracy = correct predictions / all predictions.
accuracy = (TN + TP) / (TN + TP + FN + FP)
print(accuracy)

In [0]:
from pyspark.ml.classification import LogisticRegression

# Fit a logistic regression with default settings on the training split.
logistic = LogisticRegression()
logistic_model = logistic.fit(flights_train)

# Score the test split and show the (label, prediction) counts.
prediction = logistic_model.transform(flights_test)
(
    prediction
    .groupBy('label', 'prediction')
    .count()
    .show()
)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

# Confusion-matrix cells for the current `prediction` DataFrame.
TP = prediction.filter('prediction = 1 AND label = prediction').count()
TN = prediction.filter('prediction = 0 AND label = prediction').count()
FP = prediction.filter('prediction = 1 AND label != prediction').count()
FN = prediction.filter('prediction = 0 AND label != prediction').count()

# Precision: share of predicted positives that are correct.
# Recall: share of actual positives that were found.
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print(precision, recall)

# Weighted precision, selected by overriding metricName at evaluation time.
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(
    prediction,
    {multi_evaluator.metricName: "weightedPrecision"},
)

# Area under the ROC curve.
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(
    prediction,
    {binary_evaluator.metricName: "areaUnderROC"},
)

print(weighted_precision, auc)

In [0]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode 'org_idx' into 'org_dummy'. `onehot` is first the estimator
# and is then rebound to the fitted model.
onehot = OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy'])
onehot = onehot.fit(flights_assembled)
flights_onehot = onehot.transform(flights_assembled)

# One row per distinct origin, ordered by its index.
(
    flights_onehot
    .select('org', 'org_idx', 'org_dummy')
    .distinct()
    .sort('org_idx')
    .show()
)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# flights_train / flights_test carry a 'features' vector that was assembled
# with 'duration' as one of its inputs. Regressing 'duration' on that vector
# leaks the label into the predictors, so rebuild 'features' without it.
# The remaining inputs keep their order, so coefficient positions 0-6 still
# refer to the same columns.
regression_assembler = VectorAssembler(
    inputCols=['mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart'],
    outputCol='features',
)
regression_train = regression_assembler.transform(flights_train.drop('features'))
regression_test = regression_assembler.transform(flights_test.drop('features'))

# Fit an ordinary linear regression of 'duration' on the leak-free features.
regression = LinearRegression(labelCol='duration').fit(regression_train)

# Score the test split and compare actual with predicted durations.
predictions = regression.transform(regression_test)
predictions.select('duration', 'prediction').show(10)

# Evaluator result on the test split (default metric); shown as the cell output.
RegressionEvaluator(labelCol='duration').evaluate(predictions)

In [0]:
# Intercept and coefficient vector of the fitted regression.
inter = regression.intercept
print(inter)
coefs = regression.coefficients
print(coefs)

# Coefficients follow the order of the VectorAssembler inputs that built the
# 'features' vector. 'km' is not the first input there (index 0 is 'mon'),
# so look its position up instead of assuming 0.
# `assembler` must still be the one that produced flights_train's features.
km_index = assembler.getInputCols().index('km')
minutes_per_km = regression.coefficients[km_index]
print(minutes_per_km)

# Presumably 'duration' is in minutes, making this km per hour - TODO confirm.
avg_speed = 60 / minutes_per_km
print(avg_speed)

In [0]:
from pyspark.ml.feature import Bucketizer, OneHotEncoder

# Bin 'depart' into eight buckets of width 3 covering 0 to 24.
buckets = Bucketizer(
    splits=[0, 3, 6, 9, 12, 15, 18, 21, 24],
    inputCol='depart',
    outputCol='depart_bucket',
)
bucketed = buckets.transform(flights)
bucketed.select('depart', 'depart_bucket').show(5)

# One-hot encode the bucket index into 'depart_dummy'.
onehot = OneHotEncoder(
    inputCols=['depart_bucket'],
    outputCols=['depart_dummy'],
)
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select(
    'depart',
    'depart_bucket',
    'depart_dummy',
).show(5)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Print the evaluator result for `predictions`; a bare expression in the
# middle of a cell is evaluated but never displayed.
print(RegressionEvaluator(labelCol='duration').evaluate(predictions))

# NOTE(review): `regression` was fit on flights_train's 'features' vector,
# whose assembler inputs are mon, dom, dow, carrier_idx, org_idx, km, depart,
# duration. Index 6 is therefore the 'depart' coefficient and index 3 the
# 'carrier_idx' coefficient, not departure-bucket or origin dummies. The
# eve/night/ogg/jfk names below look like they belong to a model trained on
# one-hot features - confirm which model these figures should come from.
avg_eve_ogg = regression.intercept
print(avg_eve_ogg)

avg_night_ogg = regression.intercept + regression.coefficients[6]
print(avg_night_ogg)

avg_night_jfk = regression.intercept + regression.coefficients[6] + regression.coefficients[3]
print(avg_night_jfk)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# flights_train / flights_test carry a 'features' vector that includes
# 'duration', which is also the label here. Rebuild 'features' without it
# so the model cannot read its own target.
regression_assembler = VectorAssembler(
    inputCols=['mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart'],
    outputCol='features',
)
regression_train = regression_assembler.transform(flights_train.drop('features'))
regression_test = regression_assembler.transform(flights_test.drop('features'))

# Regularised linear regression: regParam=1 with elasticNetParam=1 (pure L1).
regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1).fit(regression_train)

# RMSE on the held-out split.
prediction = regression.transform(regression_test)
rmse = RegressionEvaluator(labelCol='duration').evaluate(prediction)
print("The test RMSE is", rmse)

coeffs = regression.coefficients
print(coeffs)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# flights_train / flights_test carry a 'features' vector that includes
# 'duration', which is also the label here. Rebuild 'features' without it
# so the model cannot read its own target.
regression_assembler = VectorAssembler(
    inputCols=['mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart'],
    outputCol='features',
)
regression_train = regression_assembler.transform(flights_train.drop('features'))
regression_test = regression_assembler.transform(flights_test.drop('features'))

# Regularised linear regression: regParam=1 with elasticNetParam=1 (pure L1).
regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1).fit(regression_train)

# RMSE on the held-out split.
rmse = RegressionEvaluator(labelCol='duration').evaluate(regression.transform(regression_test))
print("The test RMSE is", rmse)

coeffs = regression.coefficients
print(coeffs)

# Count the coefficients that the penalty shrank to exactly zero.
zero_coeff = sum([beta == 0 for beta in regression.coefficients])
print("Number of coefficients equal to 0:", zero_coeff)

In [0]:
# Pipeline stages (not yet fitted); they rely on classes imported in earlier cells.

# 1. Index the origin strings.
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# 2. One-hot encode the origin index and 'dow'.
onehot = OneHotEncoder(
    inputCols=['org_idx', 'dow'],
    outputCols=['org_dummy', 'dow_dummy'],
)

# 3. Assemble 'km' and the dummy vectors into 'features'.
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy', 'dow_dummy'],
    outputCol='features',
)

# 4. Linear regression with 'duration' as the label.
regression = LinearRegression(labelCol='duration')

In [0]:
from pyspark.ml import Pipeline

# Reload the raw data so the pipeline starts from un-transformed columns.
flights = spark.read.csv(
    '/FileStore/tables/flights3.csv',
    header=True,
    inferSchema=True,
)

# `round` here is pyspark.sql.functions.round, imported in an earlier cell.
flights = flights.withColumn(
    'km',
    round(flights.mile * 1.60934, 0),
).drop('mile')

# 80/20 split with a fixed seed.
train, test = flights.randomSplit([0.8, 0.2], seed=17)

# Chain the stages; `pipeline` is then rebound to the fitted PipelineModel.
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
pipeline = pipeline.fit(train)
predictions = pipeline.transform(test)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

# Empty grid: cross-validation evaluates the estimator's current settings only.
params = ParamGridBuilder().build()

regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# flights_train / flights_test carry a 'features' vector that includes
# 'duration', which is also the label here. Rebuild 'features' without it
# so the cross-validated model cannot read its own target. `regression`
# keeps its default features column so other cells can still reuse it.
regression_assembler = VectorAssembler(
    inputCols=['mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart'],
    outputCol='features',
)
regression_train = regression_assembler.transform(flights_train.drop('features'))
regression_test = regression_assembler.transform(flights_test.drop('features'))

# 5-fold cross-validation; `cv` is rebound to the fitted CrossValidatorModel.
cv = CrossValidator(estimator=regression, estimatorParamMaps=params, evaluator=evaluator,numFolds=5)
cv = cv.fit(regression_train)
pred = cv.transform(regression_test)

In [0]:
# Stages for a smaller model: 'km' plus the one-hot encoded origin.
indexer = StringIndexer(inputCol='org', outputCol='org_idx')
onehot = OneHotEncoder(
    inputCols=['org_idx'],
    outputCols=['org_dummy'],
)
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy'],
    outputCol='features',
)

# Cross-validate the whole pipeline so the feature stages are re-fit per fold.
# No numFolds is given, so the library default applies.
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
)

In [0]:
# Grid over regularisation strength and elastic-net mixing (4 x 3 combinations).
params = (
    ParamGridBuilder()
    .addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
print('Number of models to be tested: ', len(params))

# 5-fold cross-validation of the pipeline over that grid.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
)

In [0]:
# Run the grid search and keep the best fitted pipeline.
model = cv.fit(train)
best_model = model.bestModel
print(best_model.stages)

# Parameters of the winning regression stage (index 3 is the last pipeline
# stage). Printed explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(best_model.stages[3].extractParamMap())

# Evaluate the best pipeline on the held-out split; shown as the cell output.
predictions = best_model.transform(test)
evaluator.evaluate(predictions)

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Fit a single decision tree and a gradient-boosted ensemble, both with defaults.
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare the two models on the test split. The scores are printed because
# bare expressions followed by other statements are never displayed.
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(tree.transform(flights_test)))
print(evaluator.evaluate(gbt.transform(flights_test)))

# Ensemble size and per-feature importances of the boosted model.
print(gbt.getNumTrees)
print(gbt.featureImportances)

In [0]:
from pyspark.ml.classification import RandomForestClassifier

forest = RandomForestClassifier()

# Grid over the feature-subset strategy and the tree depth (4 x 3 combinations).
params = (
    ParamGridBuilder()
    .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2'])
    .addGrid(forest.maxDepth, [2, 5, 10])
    .build()
)

# 5-fold cross-validation of the forest over that grid.
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(
    estimator=forest,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
)

In [0]:
# Run the grid search; `cv` is rebound from the CrossValidator to its fitted
# model, so this cell cannot be re-run without re-running the previous one.
cv = cv.fit(flights_train)

# Mean cross-validated metric for every parameter combination in the grid.
print(cv.avgMetrics)

# Best mean metric across the grid.
print(max(cv.avgMetrics))

# Description and value of maxDepth on the best model.
print(cv.bestModel.explainParam('maxDepth'))

# Description and value of featureSubsetStrategy on the best model.
print(cv.bestModel.explainParam('featureSubsetStrategy'))

# Metric of the best model on the test split.
print(
    evaluator.evaluate(
        cv.transform(flights_test)
    )
)