In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("LogisticRegression").getOrCreate()

In [3]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [8]:
my_data=spark.read.csv("C:/Users/User/Desktop/Data/flights.csv", header=True, inferSchema=True)

In [9]:
my_data.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



In [10]:
my_data.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)



In [19]:
flightschema=StructType([
   StructField ("DayofMonth", IntegerType(),False),
    StructField("DayofWeek", IntegerType(),False),
    StructField("Carrier", StringType(),False),
   StructField ("OriginAirportID", IntegerType(),False),
    StructField("DestAirportID", IntegerType(),False),
    StructField("DepDelay", IntegerType(),False),
    StructField("ArrDelay", IntegerType(),False)
]
)

In [20]:
df=spark.read.csv("C:/Users/User/Desktop/Data/flights.csv", header=True, schema=flightschema)

In [21]:
df.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayofWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



In [22]:
df.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayofWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)



# select Some import data for Classification features and change arrival delay into binary class 
* late
* not late

In [27]:
df1=df.select("DayofMonth","DayofWeek","originAirportID","DestAirportID","DepDelay",\
              ((col("ArrDelay") > 15).cast("Int").alias("Late")))

In [29]:
df1.show()

+----------+---------+---------------+-------------+--------+----+
|DayofMonth|DayofWeek|originAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|        19|        5|          11433|        13303|      -3|   0|
|        19|        5|          14869|        12478|       0|   0|
|        19|        5|          14057|        14869|      -4|   0|
|        19|        5|          15016|        11433|      28|   1|
|        19|        5|          11193|        12892|      -6|   0|
|        19|        5|          10397|        15016|      -1|   0|
|        19|        5|          15016|        10397|       0|   0|
|        19|        5|          10397|        14869|      15|   1|
|        19|        5|          10397|        10423|      33|   1|
|        19|        5|          11278|        10397|     323|   1|
|        19|        5|          14107|        13487|      -7|   0|
|        19|        5|          11433|        11298|      22| 

# Dividing Data into Train and Test

In [30]:
train_data,test_data=df1.randomSplit([0.7,0.3])

In [31]:
train_data.count()

1892514

In [32]:
test_data.count()

809704

# Preparing Data

In [38]:
# Vector Assembler
assembler=VectorAssembler(inputCols=["DayofMonth","DayofWeek","originAirportID","DestAirportID","DepDelay"]\
                          ,outputCol="features")

Exception ignored in: <function JavaWrapper.__del__ at 0x000001C1139B0160>
Traceback (most recent call last):
  File "C:\spark\python\pyspark\ml\wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'


In [39]:
tran_data=assembler.transform(df1)

In [40]:
tran_data.show(5)

+----------+---------+---------------+-------------+--------+----+--------------------+
|DayofMonth|DayofWeek|originAirportID|DestAirportID|DepDelay|Late|            features|
+----------+---------+---------------+-------------+--------+----+--------------------+
|        19|        5|          11433|        13303|      -3|   0|[19.0,5.0,11433.0...|
|        19|        5|          14869|        12478|       0|   0|[19.0,5.0,14869.0...|
|        19|        5|          14057|        14869|      -4|   0|[19.0,5.0,14057.0...|
|        19|        5|          15016|        11433|      28|   1|[19.0,5.0,15016.0...|
|        19|        5|          11193|        12892|      -6|   0|[19.0,5.0,11193.0...|
+----------+---------+---------------+-------------+--------+----+--------------------+
only showing top 5 rows



# Final DataSet

In [44]:
tran_data=tran_data.select("features",tran_data["Late"].alias("label"))

In [45]:
tran_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[19.0,5.0,11433.0...|    0|
|[19.0,5.0,14869.0...|    0|
|[19.0,5.0,14057.0...|    0|
|[19.0,5.0,15016.0...|    1|
|[19.0,5.0,11193.0...|    0|
+--------------------+-----+
only showing top 5 rows



In [46]:
train_data,test_data=tran_data.randomSplit([0.7,0.3])

In [47]:
train_data.count()

1891046

In [48]:
test_data.count()

811172

In [49]:
train_data.show(2)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,1.0,10140.0,...|    0|
|[1.0,1.0,10140.0,...|    0|
+--------------------+-----+
only showing top 2 rows



# Training Data

In [51]:
lr=LogisticRegression(featuresCol="features",labelCol="label",predictionCol="prediction",maxIter=10, regParam=0.3)

In [52]:
lrmodel=lr.fit(train_data)
print("Model is trained")

Model is trained


In [55]:
lrmodel.transform(train_data).show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,10140.0,...|    0|[1.59393660791932...|[0.83116923559442...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.42734379461425...|[0.80648711078906...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.58243677587768...|[0.82954934807100...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.55446760727218...|[0.82555805675526...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.54048302296942...|[0.82353493157554...|       0.0|
|[1.0,1.0,10140.0,...|    1|[1.24680675261159...|[0.77674660533494...|       0.0|
|[1.0,1.0,10140.0,...|    1|[1.05102257237304...|[0.74097121279383...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.56854731550140...|[0.82757641798513...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.55456273119865...|[0.82557175531333...|       0.0|
|[1.0,1.0,10140.

In [56]:
lrmodel.coefficients

DenseVector([0.0005, -0.0042, -0.0, -0.0, 0.014])

In [62]:
# Grab the Correct prediction
train_pred=lrmodel.transform(train_data)
train_pred.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,10140.0,...|    0|[1.59393660791932...|[0.83116923559442...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.42734379461425...|[0.80648711078906...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.58243677587768...|[0.82954934807100...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.55446760727218...|[0.82555805675526...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.54048302296942...|[0.82353493157554...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [65]:
correct_prediction=train_pred.filter(train_pred["label"]==train_pred["prediction"]).count()

In [69]:
print("Accuracy for training-data :,",correct_prediction/(train_data.count()))

Accuracy for training-data :, 0.8244696321506721


# testing Data

In [72]:
test=lrmodel.transform(test_data).show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,10140.0,...|    0|[1.56596743931381...|[0.82720797476176...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.56845219157493...|[0.82756284397806...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.54048302296942...|[0.82353493157554...|       0.0|
|[1.0,1.0,10140.0,...|    1|[1.24680675261159...|[0.77674660533494...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.59651648410691...|[0.83153095247597...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.48463980968488...|[0.81527237390204...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.68044128518279...|[0.84296295570456...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.68044128518279...|[0.84296295570456...|       0.0|
|[1.0,1.0,10140.0,...|    0|[1.59653377936627...|[0.83153337530701...|       0.0|
|[1.0,1.0,10140.

In [75]:
correct_prediction_test=test.filter(test["label"]==test["prediction"]).count()

AttributeError: 'NoneType' object has no attribute 'filter'

In [74]:
print("Accuracy for test-data :,",correct_prediction_test/(test_data.count()))

Accuracy for test-data :, 0.8244696321506721
