# Missing data with Spark DataFrames

In [1]:
from pyspark.sql import SparkSession
from pathlib import Path
# Directory the example CSV is read from. The path is relative, so it is
# resolved against the current working directory when the notebook runs.
data_dir = Path('../pyspark/Python-and-Spark-for-Big-Data-master/Spark_DataFrames')

In [2]:
# Start a Spark session named 'miss', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('miss')
    .getOrCreate()
)

21/07/19 08:15:41 WARN Utils: Your hostname, GBLON1WLZ13699 resolves to a loopback address: 127.0.1.1; using 10.164.85.96 instead (on interface eth2)
21/07/19 08:15:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/07/19 08:15:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Load the CSV: column names come from the first row, and Spark infers each
# column's type from the data.
df = spark.read.csv(
    path=str(data_dir / 'ContainsNull.csv'),
    header=True,
    inferSchema=True,
)

                                                                                

In [5]:
# Display the loaded rows; both Name and Sales contain nulls.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Dropping missing values

In [7]:
# thresh=2: keep only the rows that hold at least two non-null values.
df.dropna(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [10]:
# Drop a row only when every column listed in subset (Name and Sales) is null.
df.dropna(how='all', subset=['Name', 'Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Filling missing values

In [11]:
# Check the column types before filling: the fills below only touch columns
# whose type matches the fill value.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [12]:
# A string fill value is applied to string columns only; Sales (double) keeps
# its nulls.
df.fillna('FILL VALUE').show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL VALUE| null|
|emp3|FILL VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [13]:
# A numeric fill value is applied to numeric columns only; Name (string) keeps
# its nulls.
df.fillna(0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [31]:
from pyspark.sql.functions import avg, mean

In [29]:
avg_sales_name = 'avg_sales'
# Reduce the frame to a single row holding the mean of Sales (null entries are
# left out of the average), then read the value back by its alias.
avg_row = df.select(avg('Sales').alias(avg_sales_name)).first()
avg_sales = avg_row[avg_sales_name]
avg_sales

400.5

In [30]:
# Replace nulls in the Sales column only, using the previously computed mean.
df.fillna(avg_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



Or, in one line:

In [32]:
# Compute the mean of Sales and use it to fill that column's nulls in one expression.
df.fillna(df.select(avg('Sales')).first()[0], subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

