In [1]:
import pandas as pd
import numpy as np
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build the Spark session for this notebook under the application name
# 'practice' (getOrCreate hands back an existing session if there is one).
spark = (
    SparkSession.builder
    .appName('practice')
    .getOrCreate()
)

In [4]:
# Bare expression as the last line of the cell: the notebook renders the
# session object so it can be inspected.
spark

In [8]:
# Load test1.csv with its first line used as the column names and with
# type inference enabled (age and exp come back as integer columns).
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('test1.csv')
)

In [9]:
# Print the loaded rows as a text table: 8 rows, with nulls scattered across
# the name, age and exp columns.
df_pyspark.show()

+-------+----+----+
|   name| age| exp|
+-------+----+----+
| harsha|  25|   2|
| bhalla|  25|   3|
| naveen|  27|   5|
|pradeep|  24|null|
|harshit|  23|   5|
|   null|  25|   6|
|     kk|null|   4|
|    kll|null|   3|
+-------+----+----+



In [12]:
# Summary statistics per column. The count row reflects non-null values only
# (7 / 6 / 7 for the 8-row frame), and mean/stddev are null for the string
# column.
df_pyspark.describe().show()

+-------+-------+------------------+------------------+
|summary|   name|               age|               exp|
+-------+-------+------------------+------------------+
|  count|      7|                 6|                 7|
|   mean|   null|24.833333333333332|               4.0|
| stddev|   null| 1.329160135825126|1.4142135623730951|
|    min| bhalla|                23|                 2|
|    max|pradeep|                27|                 6|
+-------+-------+------------------+------------------+



In [13]:
# Drop the 'name' column for display only: drop() yields a new DataFrame and
# the result is not assigned, so df_pyspark keeps all three columns (the
# following show() still lists 'name').
df_pyspark.drop('name').show()

+----+----+
| age| exp|
+----+----+
|  25|   2|
|  25|   3|
|  27|   5|
|  24|null|
|  23|   5|
|  25|   6|
|null|   4|
|null|   3|
+----+----+



In [15]:
# Re-display the original frame: 'name' is still present, confirming that the
# earlier drop('name') did not modify df_pyspark.
df_pyspark.show()

+-------+----+----+
|   name| age| exp|
+-------+----+----+
| harsha|  25|   2|
| bhalla|  25|   3|
| naveen|  27|   5|
|pradeep|  24|null|
|harshit|  23|   5|
|   null|  25|   6|
|     kk|null|   4|
|    kll|null|   3|
+-------+----+----+



In [19]:
# Keep only fully populated rows: every row containing a null is removed,
# leaving 4 of the 8 rows.
df_pyspark.dropna().show()

+-------+---+---+
|   name|age|exp|
+-------+---+---+
| harsha| 25|  2|
| bhalla| 25|  3|
| naveen| 27|  5|
|harshit| 23|  5|
+-------+---+---+



In [21]:
## how='any' drops a row if any value in that row is null; the output here is
## identical to the argument-less na.drop() call.
## thresh can be used to set how many non-null values a row must contain in
## order not to be dropped.
df_pyspark.na.drop(how='any').show()

+-------+---+---+
|   name|age|exp|
+-------+---+---+
| harsha| 25|  2|
| bhalla| 25|  3|
| naveen| 27|  5|
|harshit| 23|  5|
+-------+---+---+



In [27]:
# Replace nulls in 'name' with a placeholder string.
# A string fill value is only applied to string columns, so the integer 'age'
# column is deliberately not listed: including it in the subset has no effect
# and its nulls stay null (to fill it, pass a numeric value for that column).
df_pyspark.na.fill('missing values', subset=['name']).show()

+--------------+----+----+
|          name| age| exp|
+--------------+----+----+
|        harsha|  25|   2|
|        bhalla|  25|   3|
|        naveen|  27|   5|
|       pradeep|  24|null|
|       harshit|  23|   5|
|missing values|  25|   6|
|            kk|null|   4|
|           kll|null|   3|
+--------------+----+----+



In [28]:
# Fill with a string and no subset, then inspect the schema: 'name' is
# reported as non-nullable, while the integer columns age and exp remain
# nullable.
df_pyspark.fillna('missing values').printSchema()

root
 |-- name: string (nullable = false)
 |-- age: integer (nullable = true)
 |-- exp: integer (nullable = true)



In [32]:
# Imputer that fills missing 'age' and 'exp' values using the column mean and
# writes the results to new '<column>_imputed' columns.
from pyspark.ml.feature import Imputer
imputer = Imputer(
    strategy='mean',
    inputCols=['age', 'exp'],
    outputCols=list(map("{}_imputed".format, ['age', 'exp'])),
)

In [33]:
# Fit the imputer on the frame, then apply the fitted model to that same
# frame and print the result with the added *_imputed columns.
# NOTE(review): missing ages are shown as 24 although describe() reports a
# mean of ~24.83 — presumably the fill value is cast to the integer column
# type; confirm if the fractional part matters.
fitted_mean_imputer = imputer.fit(df_pyspark)
fitted_mean_imputer.transform(df_pyspark).show()

+-------+----+----+-----------+-----------+
|   name| age| exp|age_imputed|exp_imputed|
+-------+----+----+-----------+-----------+
| harsha|  25|   2|         25|          2|
| bhalla|  25|   3|         25|          3|
| naveen|  27|   5|         27|          5|
|pradeep|  24|null|         24|          4|
|harshit|  23|   5|         23|          5|
|   null|  25|   6|         25|          6|
|     kk|null|   4|         24|          4|
|    kll|null|   3|         24|          3|
+-------+----+----+-----------+-----------+



In [34]:
# Rebuild the imputer with the 'median' strategy over the same input/output
# columns; this rebinds the `imputer` name used by the earlier cell.
imputer = Imputer(
    strategy='median',
    inputCols=['age', 'exp'],
    outputCols=list(map("{}_imputed".format, ['age', 'exp'])),
)

In [35]:
# Fit the current imputer on the frame and apply it to the same frame; in the
# recorded output the missing ages are filled with 25 and the missing exp
# with 4.
fitted_median_imputer = imputer.fit(df_pyspark)
fitted_median_imputer.transform(df_pyspark).show()

+-------+----+----+-----------+-----------+
|   name| age| exp|age_imputed|exp_imputed|
+-------+----+----+-----------+-----------+
| harsha|  25|   2|         25|          2|
| bhalla|  25|   3|         25|          3|
| naveen|  27|   5|         27|          5|
|pradeep|  24|null|         24|          4|
|harshit|  23|   5|         23|          5|
|   null|  25|   6|         25|          6|
|     kk|null|   4|         25|          4|
|    kll|null|   3|         25|          3|
+-------+----+----+-----------+-----------+

