In [1]:
#import modules
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# Build (or reuse, if one already exists) the SparkSession used by every cell below.
appName = "Classification in Spark"
spark = (
    SparkSession.builder
    .appName(appName)
    # Placeholder configuration entry, kept exactly as in the original cell.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

2024-03-19 09:31:52,010 WARN util.Utils: Your hostname, BDS-2023 resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
2024-03-19 09:31:52,050 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2024-03-19 09:32:16,279 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Explicit schema for the flights CSV: one (column name, Spark type) pair per
# field, each declared with nullable=False.
flightSchema = StructType([
    StructField(fieldName, fieldType, False)
    for fieldName, fieldType in [
        ("DayofMonth", IntegerType()),
        ("DayOfWeek", IntegerType()),
        ("Carrier", StringType()),
        ("OriginAirportID", IntegerType()),
        ("DestAirportID", IntegerType()),
        ("DepDelay", IntegerType()),
        ("ArrDelay", IntegerType()),
    ]
])

# Read the CSV from HDFS with the schema defined above; the file's first line
# is treated as a header row.
# Update this path to match the dataset's actual location on HDFS.
caminho_no_hdfs = 'hdfs:///user/hduser/dataset/flights.csv'
flightDataFrame = spark.read.schema(flightSchema).csv(caminho_no_hdfs, header=True)

# Display the first 3 rows.
flightDataFrame.show(n=3)


                                                                                

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



In [10]:
from pyspark.sql.functions import col

# Binary target column "Late": the comparison ArrDelay > 15 cast to an integer.
lateColumn = (col("ArrDelay") > 15).cast("Integer").alias("Late")

# Keep the five numeric predictor columns from the loaded flights DataFrame
# and append the derived target.
data = flightDataFrame.select(
    "DayofMonth",
    "DayOfWeek",
    "OriginAirportID",
    "DestAirportID",
    "DepDelay",
    lateColumn,
)
data.show(n=3)


                                                                                

+----------+---------+---------------+-------------+--------+----+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|        19|        5|          11433|        13303|      -3|   0|
|        19|        5|          14869|        12478|       0|   0|
|        19|        5|          14057|        14869|      -4|   0|
+----------+---------+---------------+-------------+--------+----+
only showing top 3 rows



In [11]:
# Split the data: 70% for training, 30% for testing.
# A fixed seed makes the random split reproducible; without one, every run draws
# a different partition, so row counts and downstream results change on each
# Restart & Run All.
dividedData = data.randomSplit([0.7, 0.3], seed=42)
trainingData, testingData = dividedData  # index 0 = training, index 1 = testing

# Count both partitions (each count triggers a Spark job) and report the sizes.
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

                                                                                

Training data rows: 1890699 ; Testing data rows: 811519


In [12]:
# Assembler that packs the five numeric predictor columns into a single
# vector column named "features".
assembler = VectorAssembler(
    inputCols=[
        "DayofMonth",
        "DayOfWeek",
        "OriginAirportID",
        "DestAirportID",
        "DepDelay",
    ],
    outputCol="features",
)

# Training frame: only the feature vector and the target, with "Late"
# renamed to "label".
trainingAssembled = assembler.transform(trainingData)
trainingDataFinal = trainingAssembled.select(
    "features", col("Late").alias("label"))
trainingDataFinal.show(2, truncate=False)

                                                                                

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|0    |
|[1.0,1.0,10140.0,10821.0,8.0] |0    |
+------------------------------+-----+
only showing top 2 rows



In [13]:
# Logistic-regression estimator: reads "features" / "label", runs at most
# 10 iterations, regularization parameter 0.3.
classifier = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)

# Fit the estimator on the assembled training data.
model = classifier.fit(trainingDataFinal)
print ("Classifier model is trained!")

                                                                                

Classifier model is trained!


In [14]:
# Assemble the test split with the same assembler used for training. The
# target is exposed as "trueLabel" here rather than "label".
testingAssembled = assembler.transform(testingData)
testingDataFinal = testingAssembled.select(
    "features", col("Late").alias("trueLabel"))
testingDataFinal.show(n=3)

[Stage 29:>                                                         (0 + 1) / 1]

+--------------------+---------+
|            features|trueLabel|
+--------------------+---------+
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
+--------------------+---------+
only showing top 3 rows



                                                                                

In [15]:
# Score the test frame with the fitted logistic-regression model.
prediction = model.transform(testingDataFinal)

# Narrowed view: features, predicted class, class probabilities, true label.
predictionFinal = prediction.select(
    ["features", "prediction", "probability", "trueLabel"])

# Show the narrowed view first, then the model's full output.
predictionFinal.show(3, truncate=False)
prediction.show(3, truncate=False)

                                                                                

+------------------------------+----------+----------------------------------------+---------+
|features                      |prediction|probability                             |trueLabel|
+------------------------------+----------+----------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-2.0]|0.0       |[0.8291991375897401,0.17080086241025993]|0        |
|[1.0,1.0,10140.0,11259.0,-2.0]|0.0       |[0.8292782877184747,0.17072171228152533]|0        |
|[1.0,1.0,10140.0,11259.0,-1.0]|0.0       |[0.8272920742950405,0.17270792570495952]|0        |
+------------------------------+----------+----------------------------------------+---------+
only showing top 3 rows



                                                                                

+------------------------------+---------+----------------------------------------+----------------------------------------+----------+
|features                      |trueLabel|rawPrediction                           |probability                             |prediction|
+------------------------------+---------+----------------------------------------+----------------------------------------+----------+
|[1.0,1.0,10140.0,10397.0,-2.0]|0        |[1.5799620099279985,-1.5799620099279985]|[0.8291991375897401,0.17080086241025993]|0.0       |
|[1.0,1.0,10140.0,11259.0,-2.0]|0        |[1.5805209723899332,-1.5805209723899332]|[0.8292782877184747,0.17072171228152533]|0.0       |
|[1.0,1.0,10140.0,11259.0,-1.0]|0        |[1.566555928937473,-1.566555928937473]  |[0.8272920742950405,0.17270792570495952]|0.0       |
+------------------------------+---------+----------------------------------------+----------------------------------------+----------+
only showing top 3 rows



In [16]:
# Accuracy = rows whose predicted class equals the true label / all scored rows.
isCorrect = predictionFinal["prediction"] == predictionFinal["trueLabel"]
correctPrediction = predictionFinal.where(isCorrect).count()
totalData = predictionFinal.count()
print(
    "correct prediction:", correctPrediction,
    ", total data:", totalData,
    ", accuracy:", correctPrediction/totalData,
)



correct prediction: 669341 , total data: 811519 , accuracy: 0.8248001587147066


                                                                                

In [17]:
from pyspark.ml.classification import RandomForestClassifier

# Random-forest estimator: 3 trees, maximum depth 5, fixed seed, reading the
# same "features" / "label" columns as the training frame.
# The unfitted estimator gets its own name so that `model2` only ever refers to
# the fitted model (previously the same name was bound to the estimator first
# and then overwritten with the fitted model).
rfClassifier = RandomForestClassifier(
    numTrees=3, maxDepth=5, seed=42, labelCol="label", featuresCol="features")

# Fit on the assembled training data; `model2` is the fitted model.
model2 = rfClassifier.fit(trainingDataFinal)
print ("Model is trained!")

2024-03-19 11:11:12,658 WARN memory.MemoryStore: Not enough space to cache rdd_151_5 in memory! (computed 19.7 MiB so far)
2024-03-19 11:11:12,867 WARN memory.MemoryStore: Not enough space to cache rdd_151_0 in memory! (computed 12.7 MiB so far)
2024-03-19 11:11:13,705 WARN memory.MemoryStore: Not enough space to cache rdd_151_7 in memory! (computed 19.7 MiB so far)
2024-03-19 11:11:13,785 WARN memory.MemoryStore: Not enough space to cache rdd_151_3 in memory! (computed 19.7 MiB so far)
2024-03-19 11:11:13,803 WARN memory.MemoryStore: Not enough space to cache rdd_151_6 in memory! (computed 12.7 MiB so far)
2024-03-19 11:11:13,806 WARN storage.BlockManager: Persisting block rdd_151_7 to disk instead.
2024-03-19 11:11:13,816 WARN storage.BlockManager: Persisting block rdd_151_3 to disk instead.
2024-03-19 11:11:13,827 WARN storage.BlockManager: Persisting block rdd_151_0 to disk instead.
2024-03-19 11:11:13,826 WARN storage.BlockManager: Persisting block rdd_151_5 to disk instead.
2024-

Model is trained!


In [18]:
# Score the test frame with the random-forest model.
# NOTE: this rebinds `prediction`, `predictionFinal`, `correctPrediction` and
# `totalData`, replacing any values those names held before this cell ran.
prediction = model2.transform(testingDataFinal)
predictionFinal = prediction.select(
    ["features", "prediction", "probability", "trueLabel"])
predictionFinal.show(3, truncate=False)

# Accuracy = rows whose predicted class equals the true label / all scored rows.
isCorrect = predictionFinal["prediction"] == predictionFinal["trueLabel"]
correctPrediction = predictionFinal.where(isCorrect).count()
totalData = predictionFinal.count()
print(
    "correct prediction:", correctPrediction,
    ", total data:", totalData,
    ", accuracy", correctPrediction/totalData,
)

                                                                                

+------------------------------+----------+----------------------------------------+---------+
|features                      |prediction|probability                             |trueLabel|
+------------------------------+----------+----------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-2.0]|0.0       |[0.9302961833555337,0.06970381664446638]|0        |
|[1.0,1.0,10140.0,11259.0,-2.0]|0.0       |[0.9302961833555337,0.06970381664446638]|0        |
|[1.0,1.0,10140.0,11259.0,-1.0]|0.0       |[0.9302961833555337,0.06970381664446638]|0        |
+------------------------------+----------+----------------------------------------+---------+
only showing top 3 rows



[Stage 61:>                                                         (0 + 0) / 1]

correct prediction: 752020 , total data: 811519 , accuracy 0.9266819384389029


                                                                                