<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/dataframes_02_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas DataFrames Part 2 -- Filtering

## Use up less memory by changing data types
- pd.to_datetime(df['col])
  - not like df['col'].astype()

In [1]:
# libraries needed
import numpy as np
import pandas as pd

In [2]:
# get data
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv')

# examine
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [3]:
# data types and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [4]:
# number of rows and columns
df.shape

(1000, 8)

In [5]:
# convert Start Date column/Series to datetime
df['Start Date'] = pd.to_datetime(df['Start Date'])

In [6]:
# convert Last Login Time column/Series to datetime
df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])
  # today's date before timestamp
  # TODO: how to remove this part

In [7]:
# convert Senior Management column/Series to bool
df['Senior Management'] = df['Senior Management'].astype('bool')

In [8]:
# convert Gender column/Series to category
df['Gender'] = df['Gender'].astype('category') 

In [9]:
# convert Team column/Series to category
df['Team'] = df['Team'].astype('category')

In [10]:
# view memory usage and confirm data type conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


In [11]:
# in future sections, use this code to import data
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

## Filter a DataFrame based on a condition

In [12]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [13]:
# only employees who are male
df[df['Gender'] == 'Male']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-23 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-23 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-23 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-23 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-12-23 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-12-23 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-12-23 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-12-23 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-23 16:45:00,60500,11.985,False,Business Development


In [14]:
# only Finance employees
df[df['Team'] == 'Finance']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-12-23 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-23 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2022-12-23 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2022-12-23 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2022-12-23 22:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,2022-12-23 11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,2022-12-23 16:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,2022-12-23 05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,2022-12-23 08:35:00,112769,11.625,True,Finance


In [15]:
# alternatively, save boolean series as its own variable
# do this if it gets unwiedly, i.e., lots of conditions

In [16]:
# Senior Managers only; this is already a bool
df[df['Senior Management']]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-23 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-23 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-23 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-23 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2022-12-23 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2022-12-23 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-12-23 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-12-23 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-12-23 17:47:00,98874,4.479,True,Marketing


In [17]:
# above is equivalent to
df[df['Senior Management'] == True]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-23 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-23 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-23 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-23 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2022-12-23 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2022-12-23 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-12-23 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-12-23 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-12-23 17:47:00,98874,4.479,True,Marketing


In [18]:
# non-marketing employees
df[df['Team'] != 'Marketing']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-12-23 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2022-12-23 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-23 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-23 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-12-23 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,2022-12-23 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2022-12-23 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-12-23 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-23 16:45:00,60500,11.985,False,Business Development


In [19]:
# employees whose salary is greater than 110k
df[df['Salary'] > 110_000]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-12-23 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-23 13:00:00,138705,9.340,True,Finance
5,Dennis,Male,1987-04-18,2022-12-23 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2022-12-23 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2022-12-23 01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
987,Gloria,Female,2014-12-08,2022-12-23 05:08:00,136709,10.331,True,Finance
991,Rose,Female,2002-08-25,2022-12-23 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-12-23 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2022-12-23 06:09:00,132483,16.655,False,Distribution


In [20]:
# employees whose bonus percentage is less than 1.5
df[df['Bonus %'] < 1.5]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2022-12-23 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2022-12-23 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2022-12-23 07:18:00,72670,1.481,True,Engineering
77,Charles,Male,2004-09-14,2022-12-23 20:13:00,107391,1.26,True,Marketing
175,Willie,Male,1998-02-17,2022-12-23 20:20:00,146651,1.451,True,Engineering
189,Clarence,Male,1998-05-02,2022-12-23 03:16:00,85700,1.215,False,Sales
217,Douglas,Male,1999-09-03,2022-12-23 16:00:00,83341,1.015,True,Client Services
273,Nicholas,Male,1994-04-12,2022-12-23 20:21:00,74669,1.113,True,Product
279,Ruby,Female,2000-11-08,2022-12-23 19:35:00,105946,1.139,False,Business Development
365,Gloria,,1983-07-19,2022-12-23 01:57:00,140885,1.113,False,Human Resources


In [21]:
# employees who started on or before Jan 1, 1985
df[df['Start Date'] <= '1985-01-01']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2022-12-23 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2022-12-23 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2022-12-23 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2022-12-23 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2022-12-23 20:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,2022-12-23 10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,2022-12-23 22:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,2022-12-23 20:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,2022-12-23 07:04:00,82871,17.999,False,Marketing


## Filtering based on multiple conditions: &
- Make use of parentheses or store each condition as its own variable

In [22]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [28]:
# Male Marketing employees
(
    df[
        (df['Gender'] == 'Male') & (df['Team'] == 'Marketing')       # Gender is Male and Team is Marketing; need parentheses; can also save as own variables, each boolean mask
    ]
)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-23 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2022-12-23 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2022-12-23 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2022-12-23 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2022-12-23 20:13:00,107391,1.26,True,Marketing
101,Aaron,Male,2012-02-17,2022-12-23 10:20:00,61602,11.849,True,Marketing
104,John,Male,1989-12-23,2022-12-23 07:01:00,80740,19.305,False,Marketing
112,Willie,Male,2003-11-27,2022-12-23 06:21:00,64363,4.023,False,Marketing
119,Paul,Male,2008-06-03,2022-12-23 15:05:00,41054,12.299,False,Marketing
150,Sean,Male,1996-05-04,2022-12-23 20:59:00,135490,19.934,False,Marketing


## Filtering based on multiple conditions: |
- vertical pipe/line | means "or"