In [1]:
sc

In [2]:
spark

In [3]:
hr_employee = spark.read.csv("file:///home/hadoop/Downloads/HR_Employee.csv")

### Big DataFile Types
    Parquet FileFormat - records are column wise, this file format compress dataset of .csv or structured format into parquet format () parquet format will reduce file from original to compress by reducing filesize
    similar to this there are otheer file formats - orc, avro (stored records as keys and values, schema format is stored as JSON format)

In [6]:
#write csv file to local directory as parquet file format 
#hr_employee.write.parquet("file:///home/hadoop/Downloads/HR/")

In [7]:
#write csv file to local directory as parquet file format 
#hr_employee.write.orc("/HR_Data/")

In [8]:
!hdfs dfs -rm -r /HR_Data/

Deleted /HR_Data


In [9]:
hr_employee_parquet = spark.read.parquet("file:///home/hadoop/Downloads/HR/").show()

+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+--------------------+---------------+--------------------+---------------+------------------+
|       _c0|                 _c1|                 _c2|      _c3|   _c4|_c5|          _c6|          _c7|           _c8|              _c9|          _c10|    _c11|           _c12|      _c13|  _c14|      _c15|    _c16|  _c17|                _c18|           _c19|                _c20|           _c21|              _c22|
+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+--------------------+---------------+--------------------+---------------+------------------+
|EmployeeID|          Department|             JobRole|A

#### Optimization techniques 
    optimizing queries significantly improve performance of spark running quries 
    spark jobs

#### 2. Partitioning 
    Divides datas into smaller chunks which can divide process datas into parallel 

In [10]:
partitioned_df = hr_employee.repartition(3)

In [12]:
#partitioned_df.write.parquet("/HR_Partition/")

#### 3. Caching & Persistance
    - Managing different level of storage

In [13]:
hr_employee.cache()

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string]

In [14]:
# persistance of dataframe with specific storage level 

from pyspark import StorageLevel
hr_employee.persist(StorageLevel.MEMORY_AND_DISK)

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string]

#### 4. Serialization
    - Efficient Serialization Reduces time to read write data and transfer it over the network
    Kyro Serialization is popular methid for better performance over default java serialization 

##### a) Java serialization : It is default method. easy to use, draw back is it will slow down read write process. It can produce large serialized sizes.

In [15]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Java Serialization Example").getOrCreate()

##### b) Kyro Serialization : Faster

In [None]:
"""
spark = SparkSession.builder \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryo.registrationRequired", "true") \
    .config("spark.kryo.classesToRegister", "org.apache.spark.example.anyClassname") \
    .appName("Kryo Serialization Example") \
    .getOrCreate()

"""

In [16]:
spark = SparkSession.builder.appName('PySparkSession')\
.config("spark.serializer","org.apache.spark.serializer.JavaSerializer").getOrCreate()

In [17]:
spark

#### 5. Broadcasting Variable
    * broadcasting small datasets improves join performance.

In [18]:
small_df = spark.read.csv("file:///home/hadoop/Downloads/airports(1).csv",inferSchema=True, header=True)
df = spark.read.csv("file:///home/hadoop/Downloads/raw_flight_data.csv", inferSchema=True, header=True)

In [19]:
from pyspark.sql.functions import broadcast

broadcast_df = broadcast(small_df)

In [24]:
broadcast_df.head()

Row(airport_id=10165, city='Adak Island', state='AK', name='Adak')

In [32]:
df.join(broadcast_df, df.OriginAirportID == broadcast_df.airport_id)

DataFrame[DayofMonth: int, DayOfWeek: int, Carrier: string, OriginAirportID: int, DestAirportID: int, DepDelay: int, ArrDelay: int, airport_id: int, city: string, state: string, name: string]

In [33]:
df.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)



#### 6. Level of Parallelism

In [None]:
#Adjust level based on cluster 
spark.conf.set("spark.default.parallelism",100)

#### 7. Avoid Groupby Key()

In [None]:
# it increases number of shuffles

In [23]:
rdd = spark.sparkContext.parallelize([('dosa',2),('salad',3),('Idili',2),('appam',10)])

rdd.reduceByKey(lambda x,y:x+y).collect()

[('salad', 3), ('appam', 10), ('dosa', 2), ('Idili', 2)]

#### 8. Reduce Shuffle
    Reduce the number of shuffle by optimizing transformations
    use map() andd   reduce() over groupby
    use reduceByKe() over groupby
    

#### 9. Repartition with coalesce

#### 10.Accumulators
    -Aggregate info like count(),sum(),max(),std(),corr() etc.

In [26]:
acc = spark.sparkContext.accumulator(0)

In [28]:
rdd = spark.sparkContext.parallelize([1,2,3,4,5,6,7,8,9])

In [29]:
def add(x):
    acc.add(x)

In [30]:
rdd.foreach(add)

In [31]:
print(acc.add)

45


#### 11.Bucketing
    - use to create Buckets of large datasets for effiecient query and joins

In [40]:
df.select("DayOfWeek").distinct().count()

7

In [42]:
df.write.bucketBy(7, "DayOfWeek").saveAsTable("bucketTable")

In [43]:
!hdfs dfs -cat /user/hive/warehouse/buckettable/part-00000-be8a4513-072b-4d1d-b90b-bd863740bc69_00001.c000.snappy.parquet

PAR1hlL  4�                        	                ����,��       (         ��D   ��� �i2v � !!#1  1002  ""2"" !0"%2#2!0	2!3*21"K$2101(2"20""!d*"10 "),2"0 2!"��2!	�!	�<0!# "22#�""2"2!���!!�001 ��2"0��2!!N   0""")21�! m#=22 �1)#:X 2� 5  !� "#�)S0� ##$102"%!�!X10;!+2""1!2	�7!}!2q JH2#3 1 !3!")`%�1!Q03"	6 #!p !�3� !3%�0!� � i!gAV21-� "GA !7�1!� �2"0�!P� "DEN 2!� [%|Al8  022 6 !�!)2;!!#2%QE�!031)  x!L!"*12!A�� "!)A�0� !2AV:""2  33E�A�"0"9l  !w !,!"02 A�"�1A�#!A�1 ) �a+A� AZ #2A| !AX"#b)�I�A��aD "!�1�af "~ !e�2 #A�6�	�0#"S 0!�	��q0!�I� 	� alAaTa\ �I� "�j

  �!��$ ��"@%q�� �8�,Pl @	`�p "@ @�  3�e �F� � " ��R $�` @�@`W @��*� "��a80L* 㣊B��e � �40�R�@"0T�He �h�@�`�   @ (C � p>�g0x ���1� ��i`x(    xB�#`(���� 
E���tx<�N���  �P� �P �  ���0�^� x" ƀ�P� �E�@  ��O  ]�@ ��A� �   �� �ʆB <�P�0��@� ('Ed�B��� Z d� . Cd ��� 0��2 AZ��� .��e 	  �'1 .@ x�P @���4$ @b!�d�"�� b��B  �0  	PQ�� � 6 \����|��`L``� �@(8��� Ќ MȀ	 ��   `pH>�¡ �̠�  ��� 8 .��!g � �'� �`8 8	�@, �4�$�  �R $K$@RL8 �	 C`RL�M � ��t�����D �� 	 �`@    	�S%� @F� �% @�( �b2 l  E �0`FS 0�$ ����n �S :��FCP<��	D  ��n���d2�@ :��I$�D���~?�H$B�B� �.$h� EDap��J� �h� j� `9Z ��� ` �  ! �1� x� ���-�3h �$ � +�z �h @1�   �T �Y 	Rl �j
�%� �  � 0� h � "@ �Qx� �X@ � S� ��0��  aP	 d  �Q�j�1����   �2T X��P(�@L� ��@�� �@

`��4�a�)�J!> P�P�As�D�J�@�V�,BJ  �z=��	 ;��	��	��j0�p ��0�h4�#>�?B�Z-
���R PA�:�n|V�8q4
BF�,����	����)� �� �"�B^�'�+��
�:��m� 
�����$�L��� 
�@ <H�,�A �7�!N�f>&�i� �& �� �&�"�
�H�9	�!�J�:�
4W=�({ �! `>
4H�T&+�J��
�Q2W�z�	�D"��! �R	7[�V<��r�\-	5a���M��l �^-�!  ��Y���n )�a���)/�@�oQzKj�Ta=ш�K+�����Y�K�
�"�?�T��p�F0zt�Wk4�P( 	�c	��|bv(G�c	�
7����$��=T�r�o "�
��A@ !�+# �R�� ���!3��(D�r�\�t:�Rl ,!��R�����6�B��� ���l4�`�%F��
0j7 V7�4���a0��oZ"wI���D��D ��(8R)�X)�RQ��!�� �P(|$@ �P��Z��$,0,���L$�J�x<���R�@^�m6���!��#�A�(!�j5Z.�+�T*)o ��JG�Ah��`���b X	�0�a�	�X���x4��K"V �v[���R��`	I	� �������

�F� (<�� ���c�/D�����`�����@h
������6�����>4�Qh�>
��`4���I(>ڃ��%4�BCCyPh�����h�M�� l(����%�u�6�G���>�V�=>� ���Q>�O��(ڀAi�(��B�
�rx<����}��f9����h�(� �BA1>� %Hq$,�c�h8(G��h<(���� (<��I���0�>���Qhʈ��<����=��F#5H�,C	0��A�T�GyT>���CA�>���i�,����Q���à9��E������ 4�G �}(�Ɔ��X��SyX�G��`|,�ǇҠ5h��Ơ�x#�GCK}2���WH=B�W����()�A�5^퓢}><Z�� =h��֠U>���y��C�X0,
%Ey0(���A��x��CF�Q>���yd����<���A�<R��x�(��A	}��Gs��(.��#<h���Plh@C��<���R�)ңe#��|"�L����4�Fc4�mA�9B�ƣ�{d>���ָP4�GsXh4�� 0>)cC�Z�	P���B�}���xph�O��a4>�G E!�(�@a������!94���q�0x�M�Fq�,���h���P<d�GC#�(<����hނwx|4҇B�4Z��X<x�����&��P�ף�h|~)ڍ�=6��Cy<nだpZ��`<h�1��>��BA��F��i�H�B�A� (�O�qj4(�OÁx0��=h���4�G���P�FCf!5��i h����0�A�(��c�P M�5�h�y|(Z�A|F�A{$����^��A{(> ��C+�
��AS�44����n���<h��{0>��Ǡ0>M�C�0��à���Ƣ1>��AS,x������#���"�h<#�)���� <��|�T�g� h^��h�6�

�A!���h�!E@ 	�AA#��l@�	p0B��  p�  Cp�  �@&� ��p��; @ E@ ��PF��(<0(8��,0Bd�4� � J $�Ap� �@�8Q �,�@:<@�L@�@@؀  lq�(� 8�@ S<��ܰ��� �AT���l @�( �,������x��d�l��� 
P�,�@jxA!�@	�CL1A'1�@�,F� <��<�4<����
K�p���Up �l1 �Np@�P0�8*8po8�C@Ѐ<�B �@ 
P"�@@ �P<�
3���0	+4 ���P�H��) �B!t�A@AA�L�@ 
 ��,@@  �
�2�P�t�P<� #pp  �lp�	�B@  H`Dd� 8 ��P D��	;p0�/+t@(����@AF�C D427ԁ@�P@!�S�(@
��H8 � (B�B(@�2p�A0$
�@�p�  p��<��K @ � 0A p@�t"0Bj8` ��0��� ��@�6��@ 4h��k��,��I<� 4d�l� 
�@C 6,�	�p0A�d@ ��
(<��	А�- ����!KԱ�"� D�
8���B<@�� 6�0���6� �@���`� �P�/�(��� ��iP��	@����A$�B�-,p�- @A<@��p����  @ Q�@ 'd��@'<@A  T��A 6��A^�0�� �

C����P A �l1(P@+�#�@ `CCP�� Cp�A/�\5��AA @��� ��p 8��8��<@ �� 
 , SP0(��0�xp�� �� �<��8� ( �����3�p�������<�B 4L��Ђk �BP@ p�
���P��h��q@;d�@8�8���� 4p �+Ѐ�� @
d����@ 8�  <����4p`� ��E��AY��@ >4�A6�� +@�
l�
���,�� � ��P$�@ #p��2���#, +� B
�sb� xp��q�ep��l�HCt�� 6�0CPp@ 80���@i���
p0�)�@  @ @�,� H@�<�@X!B
)dp@�C;l�@ @���8@  ����@�$w�
"��� A�P�0�p��P� @�� � �аC� �h��P@0 H�<@ P 8 -HP� $1�9��	- @@ �@ �@@ #�����<@ ��A$�@A��pC
q��@ #�P�6,�#��@�@
��   �(P�@-P�A �a�<��4 � ��tp (C�%@Dp����(8@@ K�B��@6  R�	DaC*Ԁ
��@( @*4`C< @�+x��"8@�p@Ct�i �6�`�����(@��Lr��d�
�Ap@@ppLp���B���PG(<0�  P�0�� 
*����p@ Tt�
؀
/  ��� ���28 �<�D$���4�AV

C4R�@x@4�B0D`�3>��H4� E'L@ p@��ЉP��P� �Г� . �0@�t��$��(@ �A \A{Lp�L@A ��1Dp04��	4BCP��P�� �(��C��fP <@� 1� �C�1
@A XA��� ?48pB  1�C� Zp0�C ��9� ���A'P0���j#JEp�  @D�@ �}(�� ���(0  �@  � �s   2H"CLq��� 8� +eh� �8�p��C<������4A�4X��8�LaI p�B ,�W$AF � Ch0Ah� ���D	X @ ��l�
-8 "��[�Ќ4 d8@48�@�� D8@ � 
d� H�	p@8P@�H�l��"H"�,Y�� H�@P��� (��
 @H���D���P� D� f p@0�@
 �A'!8��Cl0%�t���P��3P0   �Dl�@;�@4P� V�@�="P0�"l <�P�  l`_W`CP��;� Ix� d@�h@

@����p���s ��4�=H0�tpEdP  -p��ep�K�Є� �2O C �M<�B l��Hp�<@ p@ ����pBL�� �
�<p 8�P��"���K���h@ d �p�@Ѐ [,�p�@H� d� �$�Rj0P��H�*�Pp!����@��< �(���Ԡ�Pp�%�@0� @ p�I#�d�F��@9I�P@ B

/0`�B-���t� ��� %�`�	�`�(�@�AA4��,�0 O�`�#� L� ��%L0B%�B&$a 7��@�H�
�`D4��\\ x#��@@D�Ѓ&�D#4@C&�`� &�@A&� ��!R{ ��4�P�;DX ?�AB1�@ 0Q���7 ����Ѐ&䠇-,�e��B/<@	5`��(.L�@=�B L�F?�Xq�x�	=�a@* �S 0���C,İ	'x �%@�!� ��A Xp�7�p�&��  �a�\P܌2�941E-�@D��AK@Bf�&��� ,`��(�0�I��N,� B6�0�\T� @C9�p����!� F� ��:�p�=�"��qDRt� k`�
� A$� @1X��?��� \ � �Ђ	(�B!� D,4sK �
���X��<	<��"4��R9 0�8Ȱ	g��1x��
0��*J�ЃK,�A%|��D��$�P@��1�(���2���/��E�`����	�@A-T@A�h�� ��u`�$(�I1L���*Q��2�ڠ�  a�41V�B H�� BH��q=d"�K  �
P � b��A98d�	��`�D�@	-,!E��k�!��0��� -�?���- �K4��4���X��<V���/,5B4<�D� ,"� 0� �dl�Ig4��*I��b�CD4U���� A��Q�4�@�8� #- � �@7\�E�
l� <�-��@ ,��	,<PB � �,���!�9�p�5�,��4< �/ @� �� ��8���-�@��i��E O@�Bx� JpaD*t �Sp��S��B�	-�c %�Q 4, �!��� ��� 
� �B)~А

P@	0�q|PЁ9LЂ	��B��fL@	e�`�/pS�8�`�,0��J���8��	kt`�(/��!@� k|�1�bDR��?���%��
44P�%�����@ pC�<l� �\3
&X��,` �P`�0A;2,��E���4�0�<�D,�@6L�B-�1�7��x@�-L@��
� `�E�Q�$�t�� ��Hr���(w� AMda�0� IL@�9�d"�$L�B8� r�0� �4���@�-�� ����� ��P���8���Je�/@&�Ee���KL���0�I�BC0@^4�B,P�K��/,�B&�Ё��@�Ё(@���� @/` D�\P A �`���@�8��C �PA/S��I�R�0 C�b�	?X DA�5��J, G�P�4�A `�?��G) �P�'=���2�`�v������}����x`�ʹ��\X���D&�� �cMЁ/- �P L��t0 
aF��Յ��#H�!8���lܲ�
=<@������h�� &0J �B6x�/��D2��@Q��C�@,�A�\`�`B1��Y�0M>1��	f�0@ F| �z@� �Pd� �3�1, CQ @� 9<��%  &��:����,�B
9��� r���B�`�;��Xt�7� �<�B  | ��	4<����@;4h�� =,1�@*E�a@��%� �D  D"w @�.�(�AD4#��p�@�%@@�@�/&  �v ��KQ�/PP�	%P�EU���O�%X����� K�  0�9 0�D̰,�pG4@� f�

�3 &,C1,PR��A4 ����%$��N�b%� B?�т$Q@K�D��B�`�% @�A�4��%��p�� �� ��/tP��	�p�=h@{�@�74 C R�	V�@ -`@4AC��= 2B ��{���@aB	1��K P�� ������@�uHC	E�-��EPpF�@� @,Q���-�@DD, /��,����	�pI-$a�	4�0 &�A*�`�(AQ�@C ࠉ	�0�g��tPA	Dp�E� -�  � 0���V p �@0	�@�t0�� ./	�4A`�(V��w�0� ,�0�I$���	#԰[��B�� j6$�1h#�A|$K  ��%,�C {��/ � 7�! N,,P�(�\P@��:/�A\0�� �k �:U��/h��)
$1E�|@�x��
3�Q/&x��1{�L,Vؔ�8�p�4�Hdr� &\P9 �
Ctb�r�P@64�`8q�- d��#�@K�@F���;� �@-&�0D SH��:0 C�@D��DXѐ|3J. Å�B�b�O@��!O�P6�`�P���p�� ��Bl�@�S�p�$�  x4R�cF9#��@� �0@P��PM (�����P�@M�>( @� 0A~�0�N@Ԃ
&X0�#�@0,� (`�KD�2< 2L��  `�5ܔ�	v�1EJ�@�s�!�l0�B�����A�$��B �p�K�a�/`�,|0��a���E$�B-�(P����jrCS,�����[�@F_X�)���<�0�p	O���IX@/���
(Ȱ�������x �06<1!B|1�)2! 