In [20]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, DoubleType, StructType
from pyspark.sql.functions import mean

In [2]:
## Create Spark Session
# Reuse the active session if there is one; otherwise start a new session
# under the application name 'MissingData'.
spark = (
    SparkSession.builder
    .appName('MissingData')
    .getOrCreate()
)

In [5]:
## Setup Schema
# Explicit column names and types for the CSV; every column is nullable.
schema = StructType(fields=[
    StructField(name='id', dataType=StringType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
    StructField(name='sales', dataType=DoubleType(), nullable=True),
])

In [6]:
## Read in Data
# The explicit schema supplies the column types, so type inference stays off;
# the first line of the file is consumed as the header row.
df = spark.read.csv(
    path='gs://spark-training-data/datasets/ContainsNull.csv',
    schema=schema,
    header=True,
    inferSchema=False,
)

In [7]:
## Show Data
# show() prints the rows as a text table; printSchema() then prints each
# column's name, type and nullability as a tree.
df.show()
df.printSchema()

+----+-----+-----+
|  id| name|sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- sales: double (nullable = true)



In [8]:
## Show column names (attribute)
# `columns` is an attribute rather than a method, so it is read without
# parentheses; it evaluates to a list of the column names.
df.columns

['id', 'name', 'sales']

In [9]:
## Statistical Summary of df
# describe() reports count, mean, stddev, min and max for each column.
# Nulls are excluded from the count (name and sales each show 2 of 4 rows),
# and mean/stddev come back null for the string columns.
df.describe().show()

[Stage 4:>                                                          (0 + 1) / 1]

+-------+----+-----+-----------------+
|summary|  id| name|            sales|
+-------+----+-----+-----------------+
|  count|   4|    2|                2|
|   mean|null| null|            400.5|
| stddev|null| null|78.48885271170677|
|    min|emp1|Cindy|            345.0|
|    max|emp4| John|            456.0|
+-------+----+-----+-----------------+



                                                                                

In [11]:
## Dropping missing data
# With no arguments, every row that contains at least one null is removed.
# The result is a new DataFrame; df itself is left unchanged.
df.na.drop().show() # Not in place

+----+-----+-----+
|  id| name|sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [12]:
## Dropping missing data with threshold
# thresh=2 keeps only rows that have at least two non-null values, so emp2
# (where only `id` is populated) is the one row removed.
df.na.drop(thresh=2).show() # Must have at least two non-null values

+----+-----+-----+
|  id| name|sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [14]:
## Dropping missing data with how
# how='any' drops a row as soon as one of its values is null.
# how='all' drops a row only when every one of its values is null; each row
# here has an id, so nothing is dropped by the second call.
df.na.drop(how='any').show() # Drop rows containing at least one null
df.na.drop(how='all').show() # Drop rows in which all values are null

+----+-----+-----+
|  id| name|sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+

+----+-----+-----+
|  id| name|sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [15]:
## Dropping missing data with subset
# Only the column named in `subset` is checked for nulls: rows with a null
# `sales` are dropped, while nulls in other columns (e.g. emp3's name) are kept.
df.na.drop(subset='sales').show() # Drop rows where sales is missing

+----+-----+-----+
|  id| name|sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [18]:
## Fill in missing data
# fill() only touches columns whose type matches the fill value; columns of a
# non-matching type are ignored rather than raising an error.
df.na.fill(0).show() # Numeric fill: null sales become 0.0, null names are left as-is
df.na.fill('No Name', subset=['name']).show() # Restrict the fill to specific columns with subset

+----+-----+-----+
|  id| name|sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [29]:
## Fill in with mean value
# Aggregate the column mean (null sales are skipped by the aggregate), take
# the single value out of the one-row result, then use it as the fill value
# for the missing sales.
mean_sales = df.select(mean('sales')).first()[0]
df.na.fill(mean_sales, subset=['sales']).show()

+----+-----+-----+
|  id| name|sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

