In [1]:
# Initialise findspark before the pyspark import in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation
# importable in this kernel — confirm it is still needed in your environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook (application name 'test1').
spark = (
    SparkSession.builder
    .appName('test1')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [3]:
# Sample rows mixing a real null (None), an empty string and the literal
# text 'NULL', so the later cells can tell those cases apart.
data = [
    (None, 'CA'),
    ('Julia', ''),
    ('Ram', None),
    ('Ramya', 'NULL'),
]
df = spark.createDataFrame(data, schema=['name', 'state'])
df.show()

+-----+-----+
| name|state|
+-----+-----+
| NULL|   CA|
|Julia|     |
|  Ram| NULL|
|Ramya| NULL|
+-----+-----+



In [5]:
# Count the rows whose 'name' column is not null.
# Import only the functions this notebook uses: a wildcard import of
# pyspark.sql.functions shadows Python builtins such as sum, min, max and round
# for every cell that runs afterwards.
from pyspark.sql.functions import col, count, isnan, when

df.filter(col('name').isNotNull()).count()

3

In [6]:
# Non-null count for every column, computed in a single pass.
# when(...) yields a value only for non-null cells and count() skips nulls,
# so each aggregated column holds the number of non-null entries.
df.select(
    *(
        count(when(col(column_name).isNotNull(), column_name)).alias(column_name)
        for column_name in df.columns
    )
).show()

+----+-----+
|name|state|
+----+-----+
|   3|    3|
+----+-----+



In [7]:
# Count of non-null values per column, also ignoring textual null placeholders.
# A cell is counted only when it is not SQL NULL and is not exactly one of the
# placeholder strings 'None', 'NULL' or ''. The comparison is an exact match
# (isin), not a substring test, so legitimate values that merely contain
# "NULL" or "None" (e.g. 'NULLIFY', 'Nonesuch') are still counted.
df2 = df.select([
    count(
        when(
            col(c).isNotNull() & ~col(c).isin('None', 'NULL', ''),
            c,
        )
    ).alias(c)
    for c in df.columns
])
df2.show()

+----+-----+
|name|state|
+----+-----+
|   3|    1|
+----+-----+



In [8]:
# Find the count of non-NaN values of a DataFrame column.
# Sample rows containing both a SQL null (None) and a floating-point NaN.
import numpy as np

# np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0
# and raises AttributeError there.
data = [(1, 340.0), (1, None), (3, 200.0), (4, np.nan)]
df = spark.createDataFrame(data, ['id', 'number'])
df.show()

+---+------+
| id|number|
+---+------+
|  1| 340.0|
|  1|  NULL|
|  3| 200.0|
|  4|   NaN|
+---+------+



In [9]:
# Count rows whose 'number' is a usable value: neither SQL NULL nor NaN.
# 'number' is a numeric column, so it can never hold the text 'None' or 'NULL';
# substring checks for those literals only forced a cast to string and are
# therefore omitted. NULL and NaN are distinct in Spark and need separate tests.
df.filter(df.number.isNotNull() & ~isnan(df.number)).count()

2

In [10]:
# Stop the Spark session created at the top of the notebook.
spark.stop()