In [1]:
# Bootstrap step: run findspark.init() before any pyspark import below.
# NOTE(review): presumably this makes the local Spark installation importable
# from this kernel — confirm against the findspark documentation.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a session named "MyApp" that runs against
# the local master and requests the spark-avro package through
# spark.jars.packages.
# NOTE(review): the spark-avro coordinates (Scala 2.12 / 3.4.0) presumably
# need to match the installed Spark build — confirm.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.4.0")
    .getOrCreate()
)

# create dataFrame

In [6]:
# Sample employee rows used by every null-handling example below.
# Each tuple is (id, name, sal, bonus). The data deliberately mixes None
# values with an empty string so the examples can contrast the two.
employees = [
    (1, 'kiran', 150.0, 0),
    (2, 'chinta', 124.90, ''),
    (3, None, 80.0, 10),
    (4, 'gowthu', 90.0, None),
    (5, 'manchi', None, 15),
]

# bonus is declared STRING although some rows supply integers; the table
# printed by show() displays those values as 0, 10 and 15.
employee_schema = "id INT, name STRING, sal FLOAT, bonus STRING"
e_df = spark.createDataFrame(employees, schema=employee_schema)
e_df.show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9|     |
|  3|  NULL| 80.0|   10|
|  4|gowthu| 90.0| NULL|
|  5|manchi| NULL|   15|
+---+------+-----+-----+



# dropna/na.drop()

In [7]:
# Drop every row that has a NULL in any column ('any' is the default mode,
# spelled out here). The empty-string bonus is not NULL, so row 2 survives.
e_df.dropna(how='any').show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9|     |
+---+------+-----+-----+



In [8]:
# Same operation through the DataFrame.na accessor: drop rows containing
# a NULL in any column ('any' is the default mode, spelled out here).
e_df.na.drop(how='any').show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9|     |
+---+------+-----+-----+



In [9]:
# Only the listed columns are checked for NULLs: rows with a NULL name are
# dropped, while NULLs in sal or bonus are kept.
e_df.dropna(how='any', subset=['name']).show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9|     |
|  4|gowthu| 90.0| NULL|
|  5|manchi| NULL|   15|
+---+------+-----+-----+



# fillna/na.fill()

In [10]:
# A fill value only replaces NULLs in columns of a matching type: the string
# 'na' fills the STRING columns (name, bonus) and leaves the NULL in sal alone.
e_df.fillna(value='na').show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9|     |
|  3|    na| 80.0|   10|
|  4|gowthu| 90.0|   na|
|  5|manchi| NULL|   15|
+---+------+-----+-----+



In [11]:
# A fill value only replaces NULLs in columns of a matching type: the numeric
# 0.0 fills sal, while the NULLs in the STRING columns (name, bonus) remain.
e_df.fillna(value=0.0).show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9|     |
|  3|  NULL| 80.0|   10|
|  4|gowthu| 90.0| NULL|
|  5|manchi|  0.0|   15|
+---+------+-----+-----+



In [12]:
# Same numeric fill through the DataFrame.na accessor: only the NULL in sal
# is replaced; the NULLs in the STRING columns (name, bonus) remain.
e_df.na.fill(value=0.0).show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9|     |
|  3|  NULL| 80.0|   10|
|  4|gowthu| 90.0| NULL|
|  5|manchi|  0.0|   15|
+---+------+-----+-----+



In [14]:
# Per-column replacements in a single call: a dict maps each column name to
# the value that should replace its NULLs.
null_defaults = {'sal': 0.0, 'name': 'na', 'bonus': 'na'}
e_df.fillna(null_defaults).show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9|     |
|  3|    na| 80.0|   10|
|  4|gowthu| 90.0|   na|
|  5|manchi|  0.0|   15|
+---+------+-----+-----+



# replace/na.replace
- `replace` / `na.replace` swap one value for another; here they are used to turn placeholder non-null values into nulls
- e.g. an empty string `''` becomes null

In [26]:
# Turn empty strings into NULL; row 2's blank bonus is shown as NULL afterwards.
e_df.replace(to_replace='', value=None).show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9| NULL|
|  3|  NULL| 80.0|   10|
|  4|gowthu| 90.0| NULL|
|  5|manchi| NULL|   15|
+---+------+-----+-----+



In [27]:
# Same replacement through the DataFrame.na accessor: empty strings become NULL.
e_df.na.replace(to_replace='', value=None).show()

+---+------+-----+-----+
| id|  name|  sal|bonus|
+---+------+-----+-----+
|  1| kiran|150.0|    0|
|  2|chinta|124.9| NULL|
|  3|  NULL| 80.0|   10|
|  4|gowthu| 90.0| NULL|
|  5|manchi| NULL|   15|
+---+------+-----+-----+



# coalesce

In [31]:
from pyspark.sql.functions import coalesce,lit,col

# bonus is a STRING column, so cast it to int before coalescing with the
# integer literal 0. coalesce picks the first non-NULL argument, so any
# bonus that is NULL after the cast is shown as 0 in bonus_1.
bonus_as_int = col('bonus').cast('int')

(
    e_df
    .replace('', None)  # empty strings become NULL first
    .withColumn('bonus_1', coalesce(bonus_as_int, lit(0)))
    .show()
)


+---+------+-----+-----+-------+
| id|  name|  sal|bonus|bonus_1|
+---+------+-----+-----+-------+
|  1| kiran|150.0|    0|      0|
|  2|chinta|124.9| NULL|      0|
|  3|  NULL| 80.0|   10|     10|
|  4|gowthu| 90.0| NULL|      0|
|  5|manchi| NULL|   15|     15|
+---+------+-----+-----+-------+



# case & when

In [38]:
# SQL CASE expression evaluated through expr(): a NULL or empty bonus is
# shown as 0, any other bonus is passed through unchanged.
from pyspark.sql.functions import expr

# NOTE(review): the THEN branch is an integer literal while the ELSE branch is
# the STRING bonus column; the resulting column type depends on Spark's
# coercion rules — confirm if the type of bonus_1 matters downstream.
bonus_case_sql = """CASE
                        WHEN bonus is NULL or bonus = '' then 0
                        ELSE bonus
                        END"""

(
    e_df
    .withColumn('bonus_1', expr(bonus_case_sql))
    .show()
)

+---+------+-----+-----+-------+
| id|  name|  sal|bonus|bonus_1|
+---+------+-----+-----+-------+
|  1| kiran|150.0|    0|      0|
|  2|chinta|124.9|     |      0|
|  3|  NULL| 80.0|   10|     10|
|  4|gowthu| 90.0| NULL|      0|
|  5|manchi| NULL|   15|     15|
+---+------+-----+-----+-------+



In [44]:
# Column-API version of the same rule, using when(...).otherwise(...):
# a NULL or empty bonus is shown as 0, any other bonus is passed through.
from pyspark.sql.functions import when,col,lit

# True for rows whose bonus is NULL or the empty string.
bonus_missing = col('bonus').isNull() | (col('bonus') == lit(''))

(
    e_df
    .withColumn('bonus_1', when(bonus_missing, 0).otherwise(col('bonus')))
    .show()
)

+---+------+-----+-----+-------+
| id|  name|  sal|bonus|bonus_1|
+---+------+-----+-----+-------+
|  1| kiran|150.0|    0|      0|
|  2|chinta|124.9|     |      0|
|  3|  NULL| 80.0|   10|     10|
|  4|gowthu| 90.0| NULL|      0|
|  5|manchi| NULL|   15|     15|
+---+------+-----+-----+-------+



# understanding case & when

In [46]:
# (sno, age) pairs covering every age bucket used in the categorisation
# examples, including the boundary ages 0, 2, 12 and 13.
persons = [
    (1, 2),
    (2, 13),
    (3, 18),
    (4, 60),
    (5, 120),
    (6, 0),
    (7, 12),
    (8, 160),
]
p_df = spark.createDataFrame(persons, schema="sno INT, age INT")
p_df.show()

+---+---+
|sno|age|
+---+---+
|  1|  2|
|  2| 13|
|  3| 18|
|  4| 60|
|  5|120|
|  6|  0|
|  7| 12|
|  8|160|
+---+---+



In [47]:
# Age bucketing with a SQL CASE expression evaluated through expr().
# Buckets: 0-2 'New Born', 3-12 'Kid', 13-19 'Teen', 20-75 'Adult';
# every age that matches none of the ranges falls into ELSE -> 'Senior'.
from pyspark.sql.functions import expr

category_sql = """
                   CASE
                   WHEN age BETWEEN 0 and 2 THEN 'New Born'
                   WHEN age >2 and age<=12 THEN 'Kid'
                   WHEN age>12 AND age<= 19 THEN 'Teen'
                   when age>19 AND age<=75 THEN 'Adult'
                   ELSE 'Senior'
                   END
                   """

(
    p_df
    .withColumn('category', expr(category_sql))
    .show()
)

+---+---+--------+
|sno|age|category|
+---+---+--------+
|  1|  2|New Born|
|  2| 13|    Teen|
|  3| 18|    Teen|
|  4| 60|   Adult|
|  5|120|  Senior|
|  6|  0|New Born|
|  7| 12|     Kid|
|  8|160|  Senior|
+---+---+--------+



In [48]:
# Age bucketing with the Column API: a chain of when(...) calls closed by
# otherwise(...). The ranges match the SQL CASE version of this example:
# 0-2 'New Born', 3-12 'Kid', 13-19 'Teen', 20-75 'Adult', anything else 'Senior'.
from pyspark.sql.functions import when,col,lit

age = col('age')

# Each comparison is wrapped in parentheses because `&` binds tighter than
# the comparison operators in Python.
category = (
    when(age.between(0, 2), 'New Born')
    .when((age > 2) & (age <= 12), 'Kid')
    .when((age > 12) & (age <= 19), 'Teen')
    .when((age > 19) & (age <= 75), 'Adult')
    .otherwise('Senior')
)

(
    p_df
    .withColumn('category', category)
    .show()
)