In [1]:
import pyspark
pyspark.__version__

'3.5.1'

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create SparkSession object
spark = SparkSession.builder.master("local[1]").appName('test').getOrCreate()

24/08/06 07:26:18 WARN Utils: Your hostname, codespaces-648539 resolves to a loopback address: 127.0.0.1; using 10.0.3.203 instead (on interface eth0)
24/08/06 07:26:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/06 07:26:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [31]:
# Terminate the cluster
spark.stop()

In [5]:
spark.version

'3.5.1'

### Read a csv file

In [40]:
# Read data from CSV file
df = spark.read.csv('flights.csv',
                         sep=',',
                         header=True,
                         inferSchema=True,
                         nullValue='NA')

# Get number of records
print("The data contain %d records." % df.count())

# View the first five records
display(df.show(5))

# Check column data types
print(df.dtypes)

The data contain 15001 records.
+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 10| 10|  1|     OO|  5836|ORD| 157|  8.18|      51|   27|
|  1|  4|  1|     OO|  5866|ORD| 466|  15.5|     102| NULL|
| 11| 22|  1|     OO|  6016|ORD| 738|  7.17|     127|  -19|
|  2| 14|  5|     B6|   199|JFK|2248| 21.17|     365|   60|
|  5| 25|  3|     WN|  1675|SJC| 386| 12.92|      85|   22|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



None

[('mon', 'int'), ('dom', 'int'), ('dow', 'int'), ('carrier', 'string'), ('flight', 'int'), ('org', 'string'), ('mile', 'int'), ('depart', 'double'), ('duration', 'int'), ('delay', 'int')]


In [30]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
# from pyspark.sql.types import *

# Specify column names and types
schema = StructType([
    StructField("mon", FloatType()),
    StructField("dom", IntegerType()),
    StructField("dow", IntegerType()),
    StructField("carrier", StringType()),
    StructField("depart", IntegerType()),
    StructField("flight", StringType())
])

# Load data from a delimited file
df = spark.read.csv('flights.csv', sep=',', header=True, schema=schema)

# Print schema of DataFrame
display(df.printSchema())

# View the first five records
display(df.show(5))

root
 |-- mon: float (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- depart: integer (nullable = true)
 |-- flight: string (nullable = true)



None

+----+---+---+-------+------+------+
| mon|dom|dow|carrier|depart|flight|
+----+---+---+-------+------+------+
|10.0| 10|  1|     OO|  5836|   ORD|
| 1.0|  4|  1|     OO|  5866|   ORD|
|11.0| 22|  1|     OO|  6016|   ORD|
| 2.0| 14|  5|     B6|   199|   JFK|
| 5.0| 25|  3|     WN|  1675|   SJC|
+----+---+---+-------+------+------+
only showing top 5 rows



24/08/06 06:20:56 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 10, schema size: 6
CSV file: file:///workspaces/data_science_materials/pyspark/flights.csv


None

### Drop Column & Remove Nulls

In [41]:
# Remove the 'flight' column
df_drop_column = df.drop('flight')
print(df_drop_column.count())

# Number of records with missing 'delay' values
print(df_drop_column.filter('delay IS NULL').count())

# Remove records with missing 'delay' values
df_valid_delay = df_drop_column.filter('delay IS NOT NULL')
print(df_valid_delay.count())

# Remove records with missing values in any column and get the number of remaining rows
df_none_missing = df_valid_delay.dropna()
print(df_none_missing.count())

15001
915
14086
14086


In [42]:
# Import the required function
from pyspark.sql.functions import round

# Convert 'mile' to 'km' and drop 'mile' column (1 mile is equivalent to 1.60934 km)
df_indexed = df_none_missing.withColumn('km', round(df.mile * 1.60934, 0)) \
                    .drop('mile')

# Create 'label' column indicating whether flight delayed (1) or not (0)
df_indexed = df_indexed.withColumn('label', (df_indexed.delay >= 15).cast('integer'))

# Check first five records
df_indexed.show(5)

+---+---+---+-------+---+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|
+---+---+---+-------+---+------+--------+-----+------+-----+
| 10| 10|  1|     OO|ORD|  8.18|      51|   27| 253.0|    1|
| 11| 22|  1|     OO|ORD|  7.17|     127|  -19|1188.0|    0|
|  2| 14|  5|     B6|JFK| 21.17|     365|   60|3618.0|    1|
|  5| 25|  3|     WN|SJC| 12.92|      85|   22| 621.0|    1|
|  3| 28|  1|     B6|LGA| 13.33|     182|   70|1732.0|    1|
+---+---+---+-------+---+------+--------+-----+------+-----+
only showing top 5 rows



### Categorical to Numerical Encoding

In [43]:
from pyspark.ml.feature import StringIndexer

# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(df_indexed)

# Indexer creates a new column with numeric index values
df_indexed = indexer_model.transform(df_indexed)

# Repeat the process for the other categorical feature
df_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(df_indexed).transform(df_indexed)
df_indexed.show(5)

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
| 10| 10|  1|     OO|ORD|  8.18|      51|   27| 253.0|    1|        2.0|    0.0|
| 11| 22|  1|     OO|ORD|  7.17|     127|  -19|1188.0|    0|        2.0|    0.0|
|  2| 14|  5|     B6|JFK| 21.17|     365|   60|3618.0|    1|        4.0|    2.0|
|  5| 25|  3|     WN|SJC| 12.92|      85|   22| 621.0|    1|        3.0|    4.0|
|  3| 28|  1|     B6|LGA| 13.33|     182|   70|1732.0|    1|        4.0|    3.0|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
only showing top 5 rows



### Extracting train and test features

In [44]:
# Import the necessary class
from pyspark.ml.feature import VectorAssembler

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration', 'label', 'delay'
], outputCol='features')

# Consolidate predictor columns
df_assembled = assembler.transform(df_indexed)

# Check the resulting column
# df_assembled = df_assembled.select('features', 'delay')
df_assembled.select('features', 'delay').show(5, truncate=False)

+---------------------------------------------------+-----+
|features                                           |delay|
+---------------------------------------------------+-----+
|[10.0,10.0,1.0,2.0,0.0,253.0,8.18,51.0,1.0,27.0]   |27   |
|[11.0,22.0,1.0,2.0,0.0,1188.0,7.17,127.0,0.0,-19.0]|-19  |
|[2.0,14.0,5.0,4.0,2.0,3618.0,21.17,365.0,1.0,60.0] |60   |
|[5.0,25.0,3.0,3.0,4.0,621.0,12.92,85.0,1.0,22.0]   |22   |
|[3.0,28.0,1.0,4.0,3.0,1732.0,13.33,182.0,1.0,70.0] |70   |
+---------------------------------------------------+-----+
only showing top 5 rows



### Train-test Split

In [49]:
# Split into training and testing sets in a 80:20 ratio
df_train, df_test = df_assembled.randomSplit([0.8, 0.2], seed=43)
print(df_train.count(), df_test.count())

# Check that training set has around 80% of records
training_ratio = df_train.count() / df_assembled.count()
print(training_ratio)
df_train.show(5, truncate=False)

11320 2766
0.8036348147096408
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+------------------------------------------------+
|mon|dom|dow|carrier|org|depart|duration|delay|km    |label|carrier_idx|org_idx|features                                        |
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+------------------------------------------------+
|0  |1  |2  |AA     |ORD|8.42  |155     |83   |1617.0|1    |1.0        |0.0    |[0.0,1.0,2.0,1.0,0.0,1617.0,8.42,155.0,1.0,83.0]|
|0  |1  |2  |AA     |ORD|15.25 |115     |20   |941.0 |1    |1.0        |0.0    |[0.0,1.0,2.0,1.0,0.0,941.0,15.25,115.0,1.0,20.0]|
|0  |1  |2  |AA     |ORD|15.5  |90      |25   |649.0 |1    |1.0        |0.0    |[0.0,1.0,2.0,1.0,0.0,649.0,15.5,90.0,1.0,25.0]  |
|0  |1  |2  |AA     |ORD|16.0  |135     |54   |1395.0|1    |1.0        |0.0    |[0.0,1.0,2.0,1.0,0.0,1395.0,16.0,135.0,1.0,54.0]|
|0  |1  |2  |AA     |ORD|21.5  |65      |133  |415.0 |1    |

## Decision Tree

### Model Fit

In [48]:
# Import the Decision Tree Classifier class
from pyspark.ml.classification import DecisionTreeClassifier

# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier()
tree_model = tree.fit(df_assembled)

# Create predictions for the testing data and take a look at the predictions
prediction = tree_model.transform(df_assembled)
prediction.select('label', 'prediction', 'probability').show(5, False)

                                                                                

+-----+----------+-----------+
|label|prediction|probability|
+-----+----------+-----------+
|1    |1.0       |[0.0,1.0]  |
|0    |0.0       |[1.0,0.0]  |
|1    |1.0       |[0.0,1.0]  |
|1    |1.0       |[0.0,1.0]  |
|1    |1.0       |[0.0,1.0]  |
+-----+----------+-----------+
only showing top 5 rows



### Confusion matrix

In [51]:
# Create a confusion matrix
prediction.groupBy('label', 'prediction').count().show()

# Calculate the elements of the confusion matrix
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label = 1').count()
FP = prediction.filter('prediction = 1 AND label = 0').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TP + TN)/ (TP + TN + FP + FN)
print(accuracy)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0| 6924|
|    1|       1.0| 7162|
+-----+----------+-----+

1.0


## Logistic Regression

### Model fit & test

In [52]:
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

# Create a classifier object and train on training data
logistic = LogisticRegression().fit(df_train)

# Create predictions for the testing data and show confusion matrix
prediction = logistic.transform(df_test)

# Create a confusion matrix
prediction.groupBy('label', 'prediction').count().show()

24/08/06 08:47:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/06 08:47:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0| 1380|
|    1|       1.0| 1386|
+-----+----------+-----+



In [61]:
# Calculate the elements of the confusion matrix
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label = 1').count()
FP = prediction.filter('prediction = 1 AND label = 0').count()

In [62]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"})

precision = 1.00
recall    = 1.00


## Turning Text into Tables