# PySpark DataFrames
- Filter Operation
- `&`, `|`, `==`
- `~`

In [1]:
import findspark
# Make the local Spark installation importable: findspark.init() adds
# pyspark to sys.path, so it has to run before `pyspark` is imported.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# getOrCreate() returns the already-running session if there is one,
# otherwise it starts a new one with the application name 'dataframe'.
spark = SparkSession.builder.appName('dataframe').getOrCreate()

In [3]:
# Read the CSV using its first line as column names and letting Spark
# infer the column types, then print the rows.
df = spark.read.csv(
    'test3.csv',
    header=True,
    inferSchema=True,
)
df.show()


+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|shambhu| 33|        10| 20000|
| mahesh| 34|         8| 35000|
| natraj| 35|         5| 25000|
| kishor| 28|         3| 30000|
| vishal| 29|         5| 15000|
|vaibhav| 27|         2| 18000|
+-------+---+----------+------+



In [5]:
# Expose `df` to Spark SQL as a temporary view called 'new'
# (replacing any existing view with that name).
df.createOrReplaceTempView('new')

In [8]:
# Count how many different values the `age` column holds, using plain SQL
# against the 'new' temporary view.
spark.sql("""select count(distinct age) from new""").show()

+-------------------+
|count(DISTINCT age)|
+-------------------+
|                  6|
+-------------------+



# Filter Operation

In [6]:
## People whose Salary is less than or equal to 20000.
## The condition is passed to filter() as a SQL expression string.

df.filter('Salary <= 20000').show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|shambhu| 33|        10| 20000|
| vishal| 29|         5| 15000|
|vaibhav| 27|         2| 18000|
+-------+---+----------+------+



In [7]:
# Apply the salary filter first, then keep only the Name and age columns.
low_salary = df.filter('Salary <= 20000')
low_salary.select(['Name', 'age']).show()

+-------+---+
|   Name|age|
+-------+---+
|shambhu| 33|
| vishal| 29|
|vaibhav| 27|
+-------+---+



In [8]:
# Same condition expressed as a Column comparison instead of a SQL string.
df.filter(df['Salary'] <= 20000).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|shambhu| 33|        10| 20000|
| vishal| 29|         5| 15000|
|vaibhav| 27|         2| 18000|
+-------+---+----------+------+



In [9]:
# Salary between 15000 and 20000, both ends included.
# The two comparisons are built separately and then combined with `&`;
# written inline, each comparison must be wrapped in parentheses because
# `&` binds more tightly than `<=` / `>=` in Python.
salary = df['Salary']
at_most_20k = salary <= 20000
at_least_15k = salary >= 15000
df.filter(at_most_20k & at_least_15k).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|shambhu| 33|        10| 20000|
| vishal| 29|         5| 15000|
|vaibhav| 27|         2| 18000|
+-------+---+----------+------+



In [5]:
## NOT operation: `~` inverts the condition, so this keeps the rows
## whose Salary is NOT <= 20000.

df.filter(~(df['Salary'] <= 20000)).show()

+------+---+----------+------+
|  Name|age|Experience|Salary|
+------+---+----------+------+
|mahesh| 34|         8| 35000|
|natraj| 35|         5| 25000|
|kishor| 28|         3| 30000|
+------+---+----------+------+



In [6]:
# Small in-memory dataset with missing values (None) in every column,
# used by the isNull() examples that follow.
input_data = [("Shivansh", "Data Scientist", "Noida"),
              (None, "Software Developer", None),
              ("Swati", "Data Analyst", "Hyderabad"),
              (None, None, "Noida"),
              ("Arpit", "Android Developer", "Banglore"),
              (None, None, None)]

# Only column names are given here; no column types are specified.
schema = ["Name", "Job Profile", "City"]
# NOTE: this rebinds `df` -- the CSV-backed DataFrame loaded earlier is no
# longer reachable under that name from this cell on.
df = spark.createDataFrame(input_data, schema)
df.show()

+--------+------------------+---------+
|    Name|       Job Profile|     City|
+--------+------------------+---------+
|Shivansh|    Data Scientist|    Noida|
|    NULL|Software Developer|     NULL|
|   Swati|      Data Analyst|Hyderabad|
|    NULL|              NULL|    Noida|
|   Arpit| Android Developer| Banglore|
|    NULL|              NULL|     NULL|
+--------+------------------+---------+



In [7]:
# Count the rows where Name or City (or both) is missing.
name_missing = df.Name.isNull()
city_missing = df.City.isNull()
df.filter(name_missing | city_missing).count()

3

In [8]:
# Number of rows with no Name.
df.filter(df.Name.isNull()).count()

3

In [9]:
# Number of rows with no City.
df.filter(df.City.isNull()).count()

2

In [10]:
# Stop the Spark session now that the examples are done; running any cell
# above again afterwards requires creating a new session first.
spark.stop()