In [2]:
import os

import findspark

# Locate the Spark installation. The SPARK_HOME environment variable takes
# precedence so the notebook is not tied to one machine; the original local
# install path is kept as the fallback when the variable is unset or empty.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/amit/spark-2.1.0-bin-hadoop2.7'
findspark.init(SPARK_HOME)

In [4]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise start one named 'Missing'.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)


In [5]:
# Load the sample CSV: the first line is treated as the header, and column
# types are inferred from the data instead of being read as plain strings.
df = spark.read.csv(
    './Python-and-Spark-for-Big-Data-master/Spark_DataFrames/ContainsNull.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# List the column names of the loaded DataFrame.
df.columns

['Id', 'Name', 'Sales']

In [7]:
# Peek at the first two rows, returned as Row objects.
df.head(2)

[Row(Id='emp1', Name='John', Sales=None),
 Row(Id='emp2', Name=None, Sales=None)]

In [8]:
# Print each column's name, inferred type, and nullability.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [9]:
# Total number of rows, including the rows that contain nulls.
df.count()

4

In [10]:
# Display the table; missing values are rendered as null.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



#### Dropping rows that contain nulls (optionally by threshold)

In [12]:
df.na.drop().show()
# With no arguments, drop every row that contains at least one null value.

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [19]:
df.na.drop(thresh=2).show()
# With thresh, we can specify to drop rows that have fewer than 'thresh' non-null values.


+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [20]:
df.na.drop(thresh=2, subset=['Name', 'Sales']).show()
# With thresh, we can specify to drop rows that have fewer than 'thresh' non-null values.
# The subset parameter specifies which columns to consider when counting non-null values.

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [21]:
# Drop rows whose Sales value is null; nulls in other columns are ignored.
df.na.drop(how='any', subset=['Sales']).show()


+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [22]:
# A string fill value is applied only to string columns, so the nulls in Sales stay untouched.
df.na.fill('FILL_VALUE').show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL_VALUE| null|
|emp3|FILL_VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [23]:
# A numeric fill value is applied only to numeric columns, so the nulls in Name stay untouched.
df.na.fill(12).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| 12.0|
|emp2| null| 12.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [24]:
# Pass a dict to give each column its own replacement value.
df.na.fill({'Name':'FILL_VALUE', 'Sales':0}).show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John|  0.0|
|emp2|FILL_VALUE|  0.0|
|emp3|FILL_VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [27]:
df.na.fill(0,['Name','Sales']).show()
# Smart enough not to fill a numeric value into string-typed columns: Name keeps its nulls.

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [28]:
from pyspark.sql.functions import mean

In [39]:
# Aggregate the Sales column to its mean; collect() brings the result back
# to the driver as a list of Row objects.
mean_val = df.select(mean(df.Sales)).collect()

In [40]:
# Take the first field of the first collected row to get the mean as a scalar.
mean_sales = mean_val[0][0]

In [42]:
# One-liner form of the mean computation, kept for reference:
# df.na.fill(df.select(mean(df['Sales'])).collect()[0][0]).show()
# Replace the missing Sales values with the mean of the observed ones.
df.na.fill(mean_sales,subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

