# Filtering a DataFrame based on a Condition

In [1]:
import pandas as pd

In [5]:
# Load the employee data, then convert columns to compact dtypes to reduce memory usage.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# Use the nullable "boolean" dtype: plain "bool" silently converts missing values (NaN)
# to True, which would mark employees with unknown status as senior management.
df["Senior Management"] = df["Senior Management"].astype("boolean")
# "category" stores each distinct label once instead of one string object per row.
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-04-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-04-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-04-06 11:17:00,130590,11.858,False,Finance


In [6]:
# Comparing a column against a value creates a boolean Series: one True/False per row
# e.g. find all rows where the "Gender" column == "Male"
df["Gender"] == "Male"

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [7]:
# Store the boolean Series in a variable so the filter expression stays short and readable.
# The inline form also works: df[df["Gender"] == "Male"]
mask = df["Gender"] == "Male"

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-04-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-04-06 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-04-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-04-06 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-04-06 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-04-06 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-04-06 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-04-06 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-04-06 16:45:00,60500,11.985,False,Business Development


In [8]:
# Select only the rows for which the boolean Series "mask" is True
df.loc[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-04-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-04-06 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-04-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-04-06 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-04-06 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-04-06 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-04-06 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-04-06 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-04-06 16:45:00,60500,11.985,False,Business Development


# Filtering Data with more than one condition
Create two boolean Series, one per condition, and assign each to its own variable.

In [11]:
# First condition: True for rows whose "Gender" is "Male"
mask1 = df["Gender"] == "Male"
mask1

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [13]:
# Second condition: True for rows whose "Team" is "Marketing"
mask2 = df["Team"] == "Marketing"
mask2

0       True
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: Team, Length: 1000, dtype: bool

In [15]:
# The ampersand "&" is "and": keep only the rows where mask1 and mask2 are both True
df.loc[mask1 & mask2].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-04-06 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2022-04-06 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2022-04-06 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2022-04-06 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2022-04-06 20:13:00,107391,1.26,True,Marketing


In [16]:
# The pipe "|" is "or": keep every row where at least one of mask1, mask2 is True
df.loc[mask1 | mask2]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-04-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-04-06 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-04-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-04-06 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-04-06 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-04-06 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-04-06 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-04-06 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-04-06 16:45:00,60500,11.985,False,Business Development


In [22]:
# With more than two conditions, use parentheses to make the intended grouping explicit
# ("&" binds more tightly than "|" in Python, so unparenthesized mixes are easy to misread).
# NOTE: mask1 and mask2 are rebound here, replacing the Gender/Team masks from earlier cells.
mask1 = df["First Name"] == "Robert"
mask2 = df["Team"] == "Client Services"
# "Start Date" was parsed as dates on load, so it can be compared against a date string
mask3 = df["Start Date"] > "2016-06-01"

In [23]:
# Rows that match both mask1 and mask2, plus any row that matches mask3
df.loc[(mask1 & mask2) | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2022-04-06 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2022-04-06 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2022-04-06 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2022-04-06 00:29:00,140002,19.49,True,Marketing


# Filtering using the .isin() Method

In [24]:
# Long-hand version: one equality mask per team, combined with "|".
# The .isin() method can shorten this to a single call.
mask1 = df["Team"] == "Legal"
mask2 = df["Team"] == "Sales"
mask3 = df["Team"] == "Product"
df.loc[mask1 | mask2 | mask3].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-04-06 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-04-06 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-04-06 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2022-04-06 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2022-04-06 06:09:00,59414,1.256,False,Product


In [26]:
# Instead of multiple boolean Series, pass the list of accepted values to .isin():
# a row is True when its "Team" equals any value in the list
mask4 = df["Team"].isin(["Legal", "Sales", "Product"])
mask4

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997     True
998    False
999     True
Name: Team, Length: 1000, dtype: bool

In [27]:
# Keep the rows whose "Team" is one of the values passed to .isin()
df.loc[mask4]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-04-06 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-04-06 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-04-06 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2022-04-06 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2022-04-06 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2022-04-06 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2022-04-06 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2022-04-06 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2022-04-06 12:39:00,96914,1.421,False,Product


# Filtering with .isnull() and .notnull()

In [36]:
# .isnull() creates a boolean Series that is True where the "Team" value is missing (null)
condition1 = df["Team"].isnull()
condition1

0      False
1       True
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: Team, Length: 1000, dtype: bool

In [39]:
# First three rows that have no "Team" value
df.loc[condition1].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-04-06 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2022-04-06 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2022-04-06 16:19:00,125792,5.042,True,


In [37]:
# .notnull() is the opposite of .isnull(): True where the "Team" value is present
condition2 = df["Team"].notnull()
condition2

0       True
1      False
2       True
3       True
4       True
       ...  
995     True
996     True
997     True
998     True
999     True
Name: Team, Length: 1000, dtype: bool

In [38]:
# First three rows that do have a "Team" value
df.loc[condition2].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-04-06 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2022-04-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-04-06 13:00:00,138705,9.34,True,Finance
