<a href="https://colab.research.google.com/github/kuldeep27396/SparkOptimization/blob/main/optimization_google_colab_part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#install pyspark library
!pip install pyspark

In [None]:
#import pyspark
import pyspark

In [None]:
#import sparksession 
from pyspark.sql import SparkSession

In [None]:
#creating a sparksession object and providing appName 
spark=SparkSession.builder.appName("optimization").getOrCreate()

In [None]:
#To create a dataframe from external datasets (the "header" option is set to "true")
AirlineDF = spark.read.option("header", "true").csv("/Users/krishnapratap/Desktop/partation/data/*")

# Use take() instead of collect() to reduce execution time


In [None]:
%time AirlineDF.take(5)

In [None]:
%time AirlineDF.show(5)

In [None]:
# calculate total no. of flight by Unique Carriers
%time AirlineDF.select("UniqueCarrier").groupby("UniqueCarrier").count().show()

In [None]:
# registerTempTable() has been deprecated since Spark 2.0;
# createOrReplaceTempView() is the supported equivalent and can be re-run safely.
AirlineDF.createOrReplaceTempView("AirlineTable")

In [None]:
#how to see all columns with datatype
spark.sql("describe AirlineTable").show()

In [None]:
# calculate total no. of flight by Unique Carriers
%time spark.sql("select UniqueCarrier,count(UniqueCarrier) from AirlineTable group by UniqueCarrier").show()

In [None]:
#calculate total columns in dataframe
len(AirlineDF.columns)

# Follow process for optimization

# When data is huge otherwise not

# Use coalesce() instead of repartition() to reduce the number of partitions

# Process:- 1 reduce no. of partition

In [None]:
#to check how many partation in current AirlineDF dataframe
AirlineDF.rdd.getNumPartitions()

In [None]:
#Do not use repartition() to reduce the number of partitions
#AirlineDF1 = AirlineDF.repartition(2)

In [None]:
#Reduce No. of partation from 22 to 4
AirlineDF1 = AirlineDF.coalesce(4)

In [None]:
AirlineDF1.rdd.getNumPartitions()

In [None]:
#Again Check processing time
# calculate total no. of flight by Unique Carriers
%time AirlineDF1.select("UniqueCarrier").groupby("UniqueCarrier").count().show()

In [None]:
# registerTempTable() has been deprecated since Spark 2.0;
# createOrReplaceTempView() is the supported equivalent and can be re-run safely.
AirlineDF1.createOrReplaceTempView("AirlineTable1")

In [None]:
# calculate total no. of flight by Unique Carriers
%time spark.sql("select UniqueCarrier,count(UniqueCarrier) from AirlineTable1 group by UniqueCarrier").show()

# Apache Parquet is a columnar file format that provides optimizations to speed up queries and is a far more efficient file format than CSV or JSON, supported by many data processing systems.

# Process:- 2 write parquet file and create new dataframe for parquet file

In [None]:
# first write dataframe into parquet file
# mode("overwrite") replaces the output of an earlier run; with the default save
# mode the write fails as soon as the "parquet/" directory already exists
AirlineDF1.write.mode("overwrite").parquet("parquet/")

In [None]:
# Now create new dataframe from parquet file
AirlineDF_Par = spark.read.parquet("parquet/*")

In [None]:
AirlineDF_Par.take(5)

In [None]:
#Again Check processing time created dataframe from parquet file
# calculate total no. of flight by Unique Carriers
%time AirlineDF_Par.select("UniqueCarrier").groupby("UniqueCarrier").count().show()

In [None]:
# registerTempTable() has been deprecated since Spark 2.0;
# createOrReplaceTempView() is the supported equivalent and can be re-run safely.
AirlineDF_Par.createOrReplaceTempView("AirlineTable2")

In [None]:
# calculate total no. of flight by Unique Carriers
%time spark.sql("select UniqueCarrier,count(UniqueCarrier) from AirlineTable2 group by UniqueCarrier").show()

In [None]:
%time spark.sql("select rtrim(DestCityName) from AirlineTable2").show()

In [None]:
%time spark.sql("select substring_index(DestCityName, ',',1) as DestCityName from AirlineTable2").show()

# Avoid PySpark UDFs and use Spark SQL built-in functions instead, as the built-in functions can be optimized by Spark

In [None]:
Flight_Destination = AirlineDF.select("DestAirportID", "DestAirportSeqID", "Dest", 
                                      "DestCityName","DestState","DestStateName")

In [None]:
Flight_Destination.show()

In [None]:
from pyspark.sql.functions import udf

In [None]:
from pyspark.sql.functions import col

In [None]:
def destination(str):
    """Return the part of a comma-separated string before the first comma.

    Args:
        str: Text such as ``"Dallas, TX"``, or ``None`` for a null value.
            The parameter shadows the builtin ``str``; the name is kept so
            that existing keyword callers keep working.

    Returns:
        The text before the first comma, the whole string when it contains
        no comma, or ``None`` when the input is ``None``.
    """
    # A null column value reaches a Python UDF as None; pass it through
    # instead of failing on None.split().
    if str is None:
        return None
    # partition() stops at the first comma, so the rest is never split.
    return str.partition(",")[0]

In [None]:
#Converting python function to UDF
# destination already takes a single argument, so it is passed to udf() directly
# rather than through a lambda that only forwards its argument.
destinationUDF = udf(destination)

In [None]:
#before udf apply
Flight_Destination.select("Dest","DestCityName").show()

In [None]:
#when udf apply
%time Flight_Destination.select(col("Dest"),destinationUDF(col("DestCityName")).alias("DestCityName") ).show(truncate=False)

# Avoid UDF use spark sql

In [None]:
# registerTempTable() has been deprecated since Spark 2.0;
# createOrReplaceTempView() is the supported equivalent and can be re-run safely.
Flight_Destination.createOrReplaceTempView("AirlineTable3")

In [None]:
%time spark.sql("select Dest,substring_index(DestCityName, ',',1) as DestCityName from AirlineTable3").show()

# Broadcast variables are used to keep a read-only copy of data cached on every node

In [None]:
from pyspark import SparkContext, broadcast
from pyspark.sql import SparkSession 
import pyspark.sql.functions as func

In [None]:
sc = spark.sparkContext

In [None]:
words_new = sc.broadcast(["AA"]) 

In [None]:
#read the broadcast's contents through .value
data = words_new.value # accessing the value stored in the broadcast on the driver

In [None]:
data

In [None]:
%time AirlineDF_Par.select("Year","UniqueCarrier").show()

In [None]:
# without Broadcast variable on filter
%time AirlineDF_Par.where((AirlineDF_Par['UniqueCarrier']).isin('AA')).count()

In [None]:
# same filter, passing the list that was read back from the broadcast variable
%time AirlineDF_Par.where((AirlineDF_Par['UniqueCarrier']).isin(data)).count()

#  Accumulator variables are used to aggregate information through associative and commutative operations

In [None]:
#get the sparkcontext
# Only one SparkContext may be active at a time, and the SparkSession created
# earlier already owns one; calling SparkContext() here would raise
# "ValueError: Cannot run multiple SparkContexts at once".
# getOrCreate() returns the running context, or starts one if none exists.
from pyspark.context import SparkContext
sc = SparkContext.getOrCreate()

In [None]:
#creating a accumulator variable
accum=sc.accumulator(0.5)

In [None]:
#current accumulator value (still the initial value; nothing has been added yet)
accum.value

In [None]:
#create RDD
RDD=sc.parallelize([10,20,30,40,50])

In [None]:
RDD.foreach(lambda x:accum.add(x))

In [None]:
accum.value

In [None]:
RDD.foreach(lambda x:accum.add(x))

In [None]:
accum.value

# serialization in pyspark

# Serialization is used for performance tuning on Apache Spark

PySpark supports custom serializers for transferring data; this can improve
performance.

By default, PySpark uses `PickleSerializer` to serialize objects using Python's
`cPickle` serializer, which can serialize nearly any Python object.
Other serializers, like `MarshalSerializer`, support fewer datatypes but can be
faster.

The serializer is chosen when creating the `SparkContext`:

In [None]:
spark.stop()

In [None]:
from pyspark.context import SparkContext
from pyspark.serializers import MarshalSerializer
sc = SparkContext("local", "serialization app", batchSize=2, serializer = MarshalSerializer())

In [None]:
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.sparkContext is sc

In [None]:
#To create dataframe form External datasets
AirlineDF = spark.read.parquet("parquet/*")

In [None]:
# calculate total no. of flight by Unique Carriers
%time AirlineDF.select("UniqueCarrier").groupby("UniqueCarrier").count().show()

In [None]:
#check no. of partition
AirlineDF.rdd.getNumPartitions()

In [None]:
spark.stop()

In [None]:
from pyspark.context import SparkContext
from pyspark.serializers import PickleSerializer
sc = SparkContext("local", "serialization app", batchSize=2, serializer=PickleSerializer())

In [None]:
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.sparkContext is sc

In [None]:
#To create dataframe form External datasets
AirlineDF = spark.read.parquet("parquet/*")

In [None]:
# calculate total no. of flight by Unique Carriers
%time AirlineDF.select("UniqueCarrier").groupby("UniqueCarrier").count().show()

In [None]:
#check no. of partition
AirlineDF.rdd.getNumPartitions()

# spark Parallelism

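`spark.default.parallelism` is the default number of partitions in RDDs returned by transformations such as `join`, `reduceByKey` and `parallelize` when the user does not set one. Its default depends on the cluster manager (in local mode, the number of cores on the local machine). The value 200 is the default of `spark.sql.shuffle.partitions`, which controls the number of partitions used when shuffling data for DataFrame joins and aggregations. To change the number of partitions, set the corresponding property.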


One important way to increase parallelism of spark processing is to increase the number of executors on the cluster. However, knowing how the data should be distributed, so that the cluster can process data efficiently is extremely important. The secret to achieve this is partitioning in Spark.

# Note:- http://spark-configuration.luminousmen.com/

# for RDD use spark.default.parallelism

In [None]:
#creating a sparksession object and providing appName 
spark0=SparkSession.builder.appName("optimization0").getOrCreate()

In [None]:
spark0.conf.set("spark.default.parallelism",1)

In [None]:
sc1 = spark0.sparkContext

In [None]:
rdd2=sc1.textFile("/Users/krishnapratap/Desktop/partation/data/*")

In [None]:
sc1.defaultParallelism

In [None]:
# Split words using flatMap
rdd_word1 = rdd2.flatMap(lambda x: x.split(","))

In [None]:
# Create a paired-rdd
rdd_pair1 = rdd_word1.map(lambda x: (x, 1))

In [None]:
# Count occurence per word using reducebykey()
rdd_reduce1 = rdd_pair1.reduceByKey(lambda x,y: x+y)
%time rdd_reduce1.collect()

# for dataframe use spark.sql.shuffle.partitions

In [None]:
#creating a sparksession object and providing appName 
spark1=SparkSession.builder.appName("optimization1").getOrCreate()

In [None]:
spark1.conf.set("spark.sql.shuffle.partitions",50)

In [None]:
#To create dataframe form External datasets
AirlineDF5 = spark1.read.option("header", "true").csv("/Users/krishnapratap/Desktop/partation/data/*")

In [None]:
# calculate total no. of flight by Unique Carriers
%time AirlineDF5.select("UniqueCarrier").groupby("UniqueCarrier").count().show()