
---

# **Experiment Name:** PySpark DataFrame - Handling missing values
# **Experiment No:** 03
# **Experiment Date:** /08/2023

---


In [1]:
# Firstly installing all the tools once again like Lab 01

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar -xf spark-3.4.1-bin-hadoop3.tgz

!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

spark

In [2]:
# Upload dataset_lab3.csv from the local machine through Colab's file picker.
# files.upload() is deliberately the last expression so the cell echoes the
# returned {filename: raw bytes} mapping, giving a quick look at the raw CSV.
from google.colab import files
files.upload()

Saving dataset_lab3.csv to dataset_lab3.csv


{'dataset_lab3.csv': b'Name,Age ,Experience,Salary\r\nDipjol,50,10,30000\r\nNasrin ,40,8,20000\r\nRiaz,,2,10000\r\nShabnur,45,,\r\nPurnima,30,5,20050\r\nMousumi,,,\r\n,30,7,25000\r\nSalman,35,8,65000\r\n'}

In [3]:
# Load the uploaded CSV into a Spark DataFrame.
#   header      -> take the column names from the first line of the file
#   inferschema -> let Spark detect each column's type from the data
#   mode        -> failfast: raise an error on a malformed record
# Note: the file's header is "Age " (with a trailing space), so that is the
# exact name of the column in the resulting DataFrame.
csv_reader = (
    spark.read
    .format('csv')
    .option("header", "true")
    .option("inferschema", "true")
    .option("mode", "failfast")
)
df = csv_reader.load("dataset_lab3.csv")

In [4]:
# Print the column names together with the types Spark inferred from the CSV.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age : integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
# Display the DataFrame. show() prints at most 20 rows by default, which is
# enough to see all 8 rows of this dataset.

df.show()

+-------+----+----------+------+
|   Name|Age |Experience|Salary|
+-------+----+----------+------+
| Dipjol|  50|        10| 30000|
|Nasrin |  40|         8| 20000|
|   Riaz|null|         2| 10000|
|Shabnur|  45|      null|  null|
|Purnima|  30|         5| 20050|
|Mousumi|null|      null|  null|
|   null|  30|         7| 25000|
| Salman|  35|         8| 65000|
+-------+----+----------+------+



In [6]:
# Handle nulls by dropping rows: with no arguments, na.drop() removes every ROW
# that has a null in any column. Columns themselves are never removed.

df.na.drop().show()

+-------+----+----------+------+
|   Name|Age |Experience|Salary|
+-------+----+----------+------+
| Dipjol|  50|        10| 30000|
|Nasrin |  40|         8| 20000|
|Purnima|  30|         5| 20050|
| Salman|  35|         8| 65000|
+-------+----+----------+------+



In [7]:
# how="any" drops a row if at least one of its values is null. This is the
# default, so the result matches calling na.drop() with no arguments.

df.na.drop(how="any").show()

+-------+----+----------+------+
|   Name|Age |Experience|Salary|
+-------+----+----------+------+
| Dipjol|  50|        10| 30000|
|Nasrin |  40|         8| 20000|
|Purnima|  30|         5| 20050|
| Salman|  35|         8| 65000|
+-------+----+----------+------+



In [8]:
# how="all" drops a row only when every value in it is null. No row in this
# dataset is completely null, so nothing is removed here.

df.na.drop(how="all").show()

+-------+----+----------+------+
|   Name|Age |Experience|Salary|
+-------+----+----------+------+
| Dipjol|  50|        10| 30000|
|Nasrin |  40|         8| 20000|
|   Riaz|null|         2| 10000|
|Shabnur|  45|      null|  null|
|Purnima|  30|         5| 20050|
|Mousumi|null|      null|  null|
|   null|  30|         7| 25000|
| Salman|  35|         8| 65000|
+-------+----+----------+------+



In [9]:

# thresh=3 keeps only the rows that have at least 3 non-null values.
# thresh takes precedence over `how`, so `how` is not passed alongside it
# (any `how` value given together with thresh would simply be ignored).
df.na.drop(thresh=3).show()

+-------+----+----------+------+
|   Name|Age |Experience|Salary|
+-------+----+----------+------+
| Dipjol|  50|        10| 30000|
|Nasrin |  40|         8| 20000|
|   Riaz|null|         2| 10000|
|Purnima|  30|         5| 20050|
|   null|  30|         7| 25000|
| Salman|  35|         8| 65000|
+-------+----+----------+------+



In [10]:
# subset restricts the null check to the listed columns: only rows whose
# Salary is null are dropped; nulls in the other columns are left in place.
df.na.drop(how="any", subset=['Salary']).show()

+-------+----+----------+------+
|   Name|Age |Experience|Salary|
+-------+----+----------+------+
| Dipjol|  50|        10| 30000|
|Nasrin |  40|         8| 20000|
|   Riaz|null|         2| 10000|
|Purnima|  30|         5| 20050|
|   null|  30|         7| 25000|
| Salman|  35|         8| 65000|
+-------+----+----------+------+



In [11]:
# Instead of dropping rows, fill the nulls of specific columns with an Imputer.
# Only Experience and Salary are imputed here; Age is not in the list, so its
# nulls are left untouched. Each input column gets a new "<name>_imputed" column.
# The strategy used is the column mean; "median" and "mode" are also available.

from pyspark.ml.feature import Imputer

impute_cols = ['Experience', 'Salary']
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=[f"{name}_imputed" for name in impute_cols],
).setStrategy("mean")

In [12]:
# Fit the imputer on df (this computes the fill value for each input column),
# then add the *_imputed columns next to the originals and display the result.
# In the printed table the filled-in values are whole numbers (6 and 28341),
# i.e. the column mean with its fractional part dropped.

imputer_model = imputer.fit(df)
df_imputed = imputer_model.transform(df)
df_imputed.show()

+-------+----+----------+------+------------------+--------------+
|   Name|Age |Experience|Salary|Experience_imputed|Salary_imputed|
+-------+----+----------+------+------------------+--------------+
| Dipjol|  50|        10| 30000|                10|         30000|
|Nasrin |  40|         8| 20000|                 8|         20000|
|   Riaz|null|         2| 10000|                 2|         10000|
|Shabnur|  45|      null|  null|                 6|         28341|
|Purnima|  30|         5| 20050|                 5|         20050|
|Mousumi|null|      null|  null|                 6|         28341|
|   null|  30|         7| 25000|                 7|         25000|
| Salman|  35|         8| 65000|                 8|         65000|
+-------+----+----------+------+------------------+--------------+

