In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook under the application name "RandomForest".
spark = (
    SparkSession.builder
    .appName("RandomForest")
    .getOrCreate()
)

In [4]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [5]:
# Exploratory load of the flights CSV: the first row is treated as the header
# and column types are inferred from the data.
my_data = spark.read.csv(
    "C:/Users/User/Desktop/Data/flights.csv",
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the first five rows of the inferred-schema frame.
my_data.show(n=5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



In [7]:
# Print the column names and the types Spark inferred from the CSV.
my_data.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)



In [8]:
# Explicit schema for the flights CSV, used instead of type inference.
# Every field is declared non-nullable (False); note that the printSchema()
# output for the frame loaded with this schema still reports nullable = true.
flightschema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in [
        ("DayofMonth", IntegerType()),
        ("DayofWeek", IntegerType()),
        ("Carrier", StringType()),
        ("OriginAirportID", IntegerType()),
        ("DestAirportID", IntegerType()),
        ("DepDelay", IntegerType()),
        ("ArrDelay", IntegerType()),
    ]
])

In [9]:
# Reload the same CSV, this time applying the hand-written flightschema
# rather than inferring types; the first row is still treated as a header.
df = spark.read.csv(
    "C:/Users/User/Desktop/Data/flights.csv",
    header=True,
    schema=flightschema,
)

In [10]:
# Preview the first five rows of the explicitly-typed frame.
df.show(n=5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayofWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



In [11]:
# Print the schema of the frame loaded with flightschema.
df.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayofWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)



# Select the important columns as classification features and convert the arrival delay into a binary class
* late
* not late

In [12]:
# Keep the five predictor columns and derive the target column "Late":
# the boolean (ArrDelay > 15) cast to an integer (1 for true, 0 for false).
df1 = df.select(
    "DayofMonth",
    "DayofWeek",
    "originAirportID",
    "DestAirportID",
    "DepDelay",
    (col("ArrDelay") > 15).cast("Int").alias("Late"),
)

In [13]:
# Preview the selected columns and the derived Late flag (show's default of 20 rows).
df1.show(n=20)

+----------+---------+---------------+-------------+--------+----+
|DayofMonth|DayofWeek|originAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|        19|        5|          11433|        13303|      -3|   0|
|        19|        5|          14869|        12478|       0|   0|
|        19|        5|          14057|        14869|      -4|   0|
|        19|        5|          15016|        11433|      28|   1|
|        19|        5|          11193|        12892|      -6|   0|
|        19|        5|          10397|        15016|      -1|   0|
|        19|        5|          15016|        10397|       0|   0|
|        19|        5|          10397|        14869|      15|   1|
|        19|        5|          10397|        10423|      33|   1|
|        19|        5|          11278|        10397|     323|   1|
|        19|        5|          14107|        13487|      -7|   0|
|        19|        5|          11433|        11298|      22| 

# Dividing Data into Train and Test

In [14]:
# Roughly 70/30 split of the un-assembled frame. The seed makes the partition
# (and the counts shown below) repeatable across runs on the same input;
# without it every execution draws a different split.
# NOTE: train_data/test_data are reassigned further down, after the feature
# vector has been assembled, so this split is only inspected via the counts.
train_data, test_data = df1.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Number of rows that landed in the training portion of the split above.
train_data.count()

1891501

In [16]:
# Number of rows that landed in the test portion of the split above.
test_data.count()

810717

# Preparing Data

In [17]:
# Combine the five numeric predictor columns into a single vector column
# named "features".
assembler = VectorAssembler(
    inputCols=[
        "DayofMonth",
        "DayofWeek",
        "originAirportID",
        "DestAirportID",
        "DepDelay",
    ],
    outputCol="features",
)

In [18]:
# Append the assembled "features" vector column to df1.
tran_data = assembler.transform(dataset=df1)

In [19]:
# Preview the frame with the new "features" column appended.
tran_data.show(n=5)

+----------+---------+---------------+-------------+--------+----+--------------------+
|DayofMonth|DayofWeek|originAirportID|DestAirportID|DepDelay|Late|            features|
+----------+---------+---------------+-------------+--------+----+--------------------+
|        19|        5|          11433|        13303|      -3|   0|[19.0,5.0,11433.0...|
|        19|        5|          14869|        12478|       0|   0|[19.0,5.0,14869.0...|
|        19|        5|          14057|        14869|      -4|   0|[19.0,5.0,14057.0...|
|        19|        5|          15016|        11433|      28|   1|[19.0,5.0,15016.0...|
|        19|        5|          11193|        12892|      -6|   0|[19.0,5.0,11193.0...|
+----------+---------+---------------+-------------+--------+----+--------------------+
only showing top 5 rows



# Final DataSet

In [20]:
# Final modelling frame: the assembled feature vector plus the target, with
# "Late" renamed to "label".
# It is rebuilt from df1 rather than re-selected from tran_data itself, so the
# cell is idempotent: the self-referential form fails on a second run because
# the "Late" column no longer exists once tran_data has been overwritten.
tran_data = (
    assembler.transform(df1)
    .select("features", col("Late").alias("label"))
)

In [21]:
# Preview the two-column (features, label) modelling frame.
tran_data.show(n=5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[19.0,5.0,11433.0...|    0|
|[19.0,5.0,14869.0...|    0|
|[19.0,5.0,14057.0...|    0|
|[19.0,5.0,15016.0...|    1|
|[19.0,5.0,11193.0...|    0|
+--------------------+-----+
only showing top 5 rows



In [22]:
# Roughly 70/30 train/test split of the modelling frame. A fixed seed (the same
# value the classifier uses) keeps the partition, and therefore the accuracies
# reported below, repeatable across runs on the same input; an unseeded
# randomSplit draws a different partition on every execution.
train_data, test_data = tran_data.randomSplit([0.7, 0.3], seed=42)

In [23]:
# Number of rows in the training portion of the modelling split.
train_data.count()

1891085

In [24]:
# Number of rows in the test portion of the modelling split.
test_data.count()

811133

In [25]:
# Peek at two training rows to confirm the (features, label) layout.
train_data.show(n=2)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,1.0,10140.0,...|    0|
|[1.0,1.0,10140.0,...|    0|
+--------------------+-----+
only showing top 2 rows



# Training Data

In [27]:
# Configure the random-forest classifier (the variable is called `lr`, but it
# holds a RandomForestClassifier): 3 trees, depth limit 5, fixed seed.
lr = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    numTrees=3,
    maxDepth=5,
    seed=42,
)

In [28]:
# Fit the configured classifier on the training split, then report completion.
lrmodel = lr.fit(dataset=train_data)
print("Model is trained")

Model is trained


In [29]:
# Score the training rows and display the first ten, including the
# rawPrediction, probability and prediction columns added by the model.
(
    lrmodel
    .transform(train_data)
    .show(n=10)
)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    1|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    1|[0.14076666798791...|[0.04692222266263...|       1.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.

In [31]:
# Keep the scored training frame so the correct predictions can be counted next.
train_pred = lrmodel.transform(dataset=train_data)
train_pred.show(n=5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    0|[2.78219040214858...|[0.92739680071619...|       0.0|
|[1.0,1.0,10140.0,...|    1|[2.78219040214858...|[0.92739680071619...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [32]:
# Count the training rows whose predicted class equals the true label.
correct_prediction = (
    train_pred
    .where(col("label") == col("prediction"))
    .count()
)

In [33]:
# Training accuracy = correctly predicted rows / total training rows.
print(
    "Accuracy for training-data :,",
    correct_prediction / train_data.count(),
)

Accuracy for training-data :, 0.9263481017511112


# Testing Data -- Random Forest

In [34]:
# Score the held-out test split with the fitted model.
test = lrmodel.transform(dataset=test_data)

In [35]:
# Count the test rows whose predicted class equals the true label.
correct_prediction_test = (
    test
    .where(col("label") == col("prediction"))
    .count()
)

In [36]:
# Test accuracy = correctly predicted rows / total test rows.
print(
    "Accuracy for test-data :,",
    correct_prediction_test / test_data.count(),
)

Accuracy for test-data :, 0.926432533259034
