# Missing Data

In [45]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
spark=SparkSession.builder.appName("Miss").getOrCreate()

In [46]:
df=spark.read.csv("C:/Users/User/Desktop/Data/ContainsNull.csv", header=True)

In [47]:
df.show(5)

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [48]:
spark

# Drop nulls

In [49]:
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [50]:
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [51]:
df.na.drop(how="all").show()  # Drops onpy if all row are null

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [52]:
df.na.drop(how="any").show()  # drop rows based on any column been null

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [53]:
df.na.drop(subset=["Sales"]).show()  # Subset can take str, tuples or list

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [54]:
spark

# Fill in nulls

In [55]:
df.na.fill("missing").show()

+----+-------+-------+
|  Id|   Name|  Sales|
+----+-------+-------+
|emp1|   John|missing|
|emp2|missing|missing|
|emp3|missing|  345.0|
|emp4|  Cindy|  456.0|
+----+-------+-------+



In [56]:
df.na.fill("111",subset="Sales").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  111|
|emp2| null|  111|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [57]:
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: string (nullable = true)



In [66]:
# Change Schema
schemma=StructType([
StructField("Id",StringType(),True),
StructField("Name",StringType(),True),    
StructField("Sales",FloatType(),True)
]
)

In [67]:
df=spark.read.csv("C:/Users/User/Desktop/Data/ContainsNull.csv", schema=schemma)

In [68]:
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: float (nullable = true)



In [69]:
df.show(5)

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|  Id| Name| null|
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [72]:
mean_value=df.select(mean("Sales")).collect()

In [76]:
mean_value[0][0]

400.5

In [81]:
df.na.fill(mean_value[0][0],["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|  Id| Name|400.5|
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

