In [1]:
from pyspark.sql import SparkSession
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Missing Data")
    .getOrCreate()
)

In [2]:
# Read the sample CSV: the first line supplies the column names and the
# column types are inferred from the data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = spark.read.csv(
    'C:/Users/User/Desktop/SparkFolder/Data/ContainsNull.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Show the raw rows; "null" marks the missing Name and Sales values.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



# Dropping Missing Data

In [4]:
# With no arguments, every row that contains at least one null is dropped.
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
df.na.drop(thresh=2).show() # Keep rows with at least 2 non-null values; rows with fewer are dropped

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [7]:
# how='any' drops a row if any of its columns is null (same result as the no-argument drop()).
df.na.drop(how='any').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [8]:
# how='all' drops a row only when every column is null; Id is always set here, so no row is removed.
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [9]:
# Only the Sales column is checked: rows with a null Sales are dropped, a null Name is kept.
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



# Filling Missing Values

In [10]:
# Check the column types first: which columns fill() touches depends on the type of the fill value.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [11]:
df.na.fill("Fill").show()  # A string fill value only fills string columns; the numeric Sales column keeps its nulls

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| Fill| null|
|emp3| Fill|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [12]:
df.na.fill(0).show() # A numeric fill value only fills numeric columns; the string Name column keeps its nulls

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [14]:
# Restrict the fill explicitly to the Sales column.
df.na.fill(0,subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [17]:
# Replace missing names with a placeholder; only the Name column is targeted.
df.na.fill("No Name",subset=["Name"]).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



# Fill Value with Mean Value

In [23]:
from pyspark.sql.functions import mean

In [24]:
# Average of Sales; the nulls are ignored, so this is (345.0 + 456.0) / 2.
df.select(mean(df["Sales"])).show()

+----------+
|avg(Sales)|
+----------+
|     400.5|
+----------+



In [28]:
# collect() returns the result as a list of Row objects instead of printing a table.
df.select(mean(df["Sales"])).collect()

[Row(avg(Sales)=400.5)]

In [31]:
# Take the first Row of the collected list, then its first field, to get the mean as a plain number.
mean_value=df.select(mean(df["Sales"])).collect()[0][0]

In [33]:
# Display the extracted mean.
mean_value

400.5

In [34]:
# Fill the missing Sales values with the mean of the column.
df.na.fill(mean_value,subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [36]:
# Chain two fills: the mean for the numeric Sales column, then a placeholder
# for the string Name column. The subset= keyword is optional; the column
# list may also be passed positionally.
(
    df.na.fill(mean_value, subset=["Sales"])
    .na.fill("No Name", subset=["Name"])
    .show()
)

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John|400.5|
|emp2|No Name|400.5|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [38]:
df.na.fill(mean_value,["Sales"]).show()  # subset can also be passed positionally, without the subset= keyword

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

