<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/dataframes_02_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas DataFrames Part 2 -- Filtering

## Use up less memory by changing data types
- pd.to_datetime(df['col])
  - not like df['col'].astype()

In [2]:
# libraries needed
import numpy as np
import pandas as pd

In [3]:
# get data
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv')

# examine
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [4]:
# data types and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [5]:
# number of rows and columns
df.shape

(1000, 8)

In [6]:
# convert Start Date column/Series to datetime
df['Start Date'] = pd.to_datetime(df['Start Date'])

In [7]:
# convert Last Login Time column/Series to datetime
df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])
  # today's date before timestamp
  # TODO: how to remove this part

In [8]:
# convert Senior Management column/Series to bool
df['Senior Management'] = df['Senior Management'].astype('bool')

In [9]:
# convert Gender column/Series to category
df['Gender'] = df['Gender'].astype('category') 

In [10]:
# convert Team column/Series to category
df['Team'] = df['Team'].astype('category')

In [11]:
# view memory usage and confirm data type conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


In [12]:
# in future sections, use this code to import data
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

## Filter a DataFrame based on a condition

In [13]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [14]:
# only employees who are male
df[df['Gender'] == 'Male']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-24 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-24 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-24 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-24 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-12-24 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-12-24 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-12-24 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-12-24 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-24 16:45:00,60500,11.985,False,Business Development


In [15]:
# only Finance employees
df[df['Team'] == 'Finance']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-12-24 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-24 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2022-12-24 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2022-12-24 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2022-12-24 22:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,2022-12-24 11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,2022-12-24 16:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,2022-12-24 05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,2022-12-24 08:35:00,112769,11.625,True,Finance


In [16]:
# alternatively, save boolean series as its own variable
# do this if it gets unwiedly, i.e., lots of conditions

In [17]:
# Senior Managers only; this is already a bool
df[df['Senior Management']]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-24 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-24 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-24 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-24 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2022-12-24 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2022-12-24 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-12-24 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-12-24 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-12-24 17:47:00,98874,4.479,True,Marketing


In [18]:
# above is equivalent to
df[df['Senior Management'] == True]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-24 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-24 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-24 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-24 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2022-12-24 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2022-12-24 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-12-24 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-12-24 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-12-24 17:47:00,98874,4.479,True,Marketing


In [19]:
# non-marketing employees
df[df['Team'] != 'Marketing']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-12-24 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2022-12-24 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-24 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-24 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-12-24 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,2022-12-24 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2022-12-24 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-12-24 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-24 16:45:00,60500,11.985,False,Business Development


In [20]:
# employees whose salary is greater than 110k
df[df['Salary'] > 110_000]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-12-24 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-24 13:00:00,138705,9.340,True,Finance
5,Dennis,Male,1987-04-18,2022-12-24 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2022-12-24 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2022-12-24 01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
987,Gloria,Female,2014-12-08,2022-12-24 05:08:00,136709,10.331,True,Finance
991,Rose,Female,2002-08-25,2022-12-24 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-12-24 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2022-12-24 06:09:00,132483,16.655,False,Distribution


In [21]:
# employees whose bonus percentage is less than 1.5
df[df['Bonus %'] < 1.5]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2022-12-24 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2022-12-24 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2022-12-24 07:18:00,72670,1.481,True,Engineering
77,Charles,Male,2004-09-14,2022-12-24 20:13:00,107391,1.26,True,Marketing
175,Willie,Male,1998-02-17,2022-12-24 20:20:00,146651,1.451,True,Engineering
189,Clarence,Male,1998-05-02,2022-12-24 03:16:00,85700,1.215,False,Sales
217,Douglas,Male,1999-09-03,2022-12-24 16:00:00,83341,1.015,True,Client Services
273,Nicholas,Male,1994-04-12,2022-12-24 20:21:00,74669,1.113,True,Product
279,Ruby,Female,2000-11-08,2022-12-24 19:35:00,105946,1.139,False,Business Development
365,Gloria,,1983-07-19,2022-12-24 01:57:00,140885,1.113,False,Human Resources


In [22]:
# employees who started on or before Jan 1, 1985
df[df['Start Date'] <= '1985-01-01']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2022-12-24 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2022-12-24 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2022-12-24 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2022-12-24 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2022-12-24 20:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,2022-12-24 10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,2022-12-24 22:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,2022-12-24 20:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,2022-12-24 07:04:00,82871,17.999,False,Marketing


## Filtering based on multiple conditions: &
- Make use of parentheses or store each condition as its own variable

In [23]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [24]:
# Male Marketing employees
(
    df[
        (df['Gender'] == 'Male') & (df['Team'] == 'Marketing')       # Gender is Male and Team is Marketing; need parentheses; can also save as own variables, each boolean mask
    ]
)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-24 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2022-12-24 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2022-12-24 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2022-12-24 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2022-12-24 20:13:00,107391,1.26,True,Marketing
101,Aaron,Male,2012-02-17,2022-12-24 10:20:00,61602,11.849,True,Marketing
104,John,Male,1989-12-23,2022-12-24 07:01:00,80740,19.305,False,Marketing
112,Willie,Male,2003-11-27,2022-12-24 06:21:00,64363,4.023,False,Marketing
119,Paul,Male,2008-06-03,2022-12-24 15:05:00,41054,12.299,False,Marketing
150,Sean,Male,1996-05-04,2022-12-24 20:59:00,135490,19.934,False,Marketing


## Filtering based on multiple conditions: |
- vertical pipe/line | means "or"

In [25]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [26]:
# Senior Managemment or started before Jan 1, 1990
(
    df[
        (df['Senior Management']) | (df['Start Date'] < '1990-01-01')
    ]
)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-24 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-24 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-24 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-24 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-12-24 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,2022-12-24 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-12-24 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-12-24 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-12-24 06:30:00,42392,19.675,False,Finance


In [27]:
# (name of robert and team of client services) OR (start date after june 1, 2016)

# bool masks
mask_robert = df['First Name'] == 'Robert'
mask_client_services = df['Team'] == 'Client Services'
mask_start_date = df['Start Date'] > '2016-06-01'

# filter
(
    df[
        (mask_robert & mask_client_services) | (mask_start_date)
    ]
)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2022-12-24 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2022-12-24 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2022-12-24 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2022-12-24 00:29:00,140002,19.49,True,Marketing


## Checking for inclusion with .isin() method
- Equivalent to SQL IN; don't have to write out multiple booleans and string them togethr with | (OR)
- Need to input a list [] into .isin() method, so pd.Series(...).isin([x1, x2, ...])

In [28]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [29]:
# Team is Legal, Sales, or Product
df['Team'].isin(['Legal', 'Sales', 'Product'])
  # equivalent to:
    # df['Team'] == 'Legal' | df['Team'] == 'Sales' | df['Team'] == 'Product'

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997     True
998    False
999     True
Name: Team, Length: 1000, dtype: bool

In [30]:
# filter to employees in Legal, Sales, or Product
df[df['Team'].isin(['Legal', 'Sales', 'Product'])]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-12-24 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-12-24 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-12-24 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2022-12-24 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2022-12-24 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2022-12-24 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2022-12-24 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2022-12-24 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2022-12-24 12:39:00,96914,1.421,False,Product


## .isnull() and .notnull() methods
- generates a boolean series; SQL IS NULL, SQL IS NOT NULL

In [31]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [32]:
# filter to employees whose team is missing
df[df['Team'].isnull()]
  # df['Team'].isnull() produces a boolean mask

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-12-24 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2022-12-24 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2022-12-24 16:19:00,125792,5.042,True,
32,,Male,1998-08-21,2022-12-24 14:27:00,122340,6.417,True,
91,James,,2005-01-26,2022-12-24 23:00:00,128771,8.309,False,
109,Christopher,Male,2000-04-22,2022-12-24 10:15:00,37919,11.449,False,
139,,Female,1990-10-03,2022-12-24 01:08:00,132373,10.527,True,
199,Jonathan,Male,2009-07-17,2022-12-24 08:15:00,130581,16.736,True,
258,Michael,Male,2002-01-24,2022-12-24 03:04:00,43586,12.659,False,
290,Jeremy,Male,1988-06-14,2022-12-24 18:20:00,129460,13.657,True,


In [33]:
# employees whose gender isn't missing
df[df['Gender'].notnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-24 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-24 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2022-12-24 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-24 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-24 16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-12-24 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-12-24 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-12-24 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-24 16:45:00,60500,11.985,False,Business Development


## .between() method
- SQL BETWEEN X AND Y, inclusive

In [34]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [35]:
# employees whose salaries are between 60k and 70k
(
    df[
        df['Salary'].between(60_000, 70_000)
    ]
)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-12-24 06:53:00,61933,4.170,True,
6,Ruby,Female,1987-08-17,2022-12-24 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2022-12-24 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2022-12-24 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2022-12-24 01:08:00,66582,11.308,True,Business Development
...,...,...,...,...,...,...,...,...
965,Catherine,Female,1989-09-25,2022-12-24 01:31:00,68164,18.393,False,Client Services
970,Alice,Female,1988-09-03,2022-12-24 20:54:00,63571,15.397,True,Product
974,Harry,Male,2011-08-30,2022-12-24 18:31:00,67656,16.455,True,Client Services
978,Sean,Male,1983-01-17,2022-12-24 14:23:00,66146,11.178,False,Human Resources


In [36]:
# employees whose bonus percentage is between 2% and 5%
(
    df[
        df['Bonus %'].between(2.0, 5.0)
    ]
    .sort_values(by = 'Bonus %', ascending = False)
)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
343,Ronald,Male,2009-02-24,2022-12-24 14:09:00,96633,4.990,True,Engineering
79,Bonnie,Female,1988-11-13,2022-12-24 15:30:00,115814,4.990,False,Product
204,Willie,Male,2006-06-06,2022-12-24 09:45:00,55281,4.935,True,Marketing
20,Lois,,1995-04-22,2022-12-24 19:18:00,64714,4.934,True,Legal
840,Lillian,Female,2002-08-26,2022-12-24 08:53:00,103854,4.924,True,Distribution
...,...,...,...,...,...,...,...,...
594,Louis,Male,2011-04-15,2022-12-24 05:02:00,95198,2.075,False,Business Development
834,Carl,,1982-02-11,2022-12-24 07:54:00,49325,2.071,True,Business Development
754,Lisa,Female,2007-04-11,2022-12-24 01:04:00,128042,2.030,True,Legal
486,Howard,Male,2012-04-09,2022-12-24 06:36:00,37984,2.021,False,Distribution


In [37]:
# employees who started in 1991
(
    df[
        df['Start Date'].between('1991-01-01', '1991-12-31')    # possible between Start Date is datetime
    ]
    .sort_values(by = 'Start Date', ascending = False)
)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
328,Samuel,Male,1991-12-19,2022-12-24 02:54:00,76076,5.319,True,Finance
88,Donna,Female,1991-11-27,2022-12-24 13:59:00,64088,6.155,True,Legal
339,Michael,Male,1991-11-22,2022-12-24 12:57:00,98753,16.443,True,Human Resources
895,Janice,,1991-11-19,2022-12-24 18:02:00,139791,16.968,False,Business Development
603,Carl,,1991-10-26,2022-12-24 08:11:00,100888,12.49,True,Business Development
172,Sara,Female,1991-09-23,2022-12-24 18:17:00,97058,9.402,False,Finance
537,Cynthia,Female,1991-09-14,2022-12-24 13:08:00,51633,13.472,True,Business Development
704,Thomas,Male,1991-09-07,2022-12-24 09:51:00,65251,11.211,False,Distribution
552,Barbara,Female,1991-09-02,2022-12-24 15:41:00,127297,11.905,True,Product
891,Timothy,Male,1991-08-25,2022-12-24 10:37:00,92587,8.475,False,Finance


In [38]:
df['First Name'].between('A', 'Z')   # can do this with strings

# this does work below; because can't do it with categories
# df['Team'].between('A', 'Z')

0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Name: First Name, Length: 1000, dtype: bool

## .duplicated() method
- Inverse of SQL DISTINCT
- tilde (~) to negate to get DISTINCT

In [39]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [40]:
# sort by first name ASC
df = df.sort_values(by = 'First Name', ascending = True)
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-12-24 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-12-24 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-12-24 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-12-24 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2022-12-24 01:45:00,95327,15.12,False,Distribution


In [41]:
df['First Name'].duplicated()   # default is to mark first occurence as not duplicated and subsequent occurences as duplicated

101    False
327     True
440     True
937     True
137    False
       ...  
902     True
925     True
946     True
947     True
951     True
Name: First Name, Length: 1000, dtype: bool

In [42]:
# first names that appear more than once; excludes first instance of each
df[df['First Name'].duplicated()]
  # default parameter and argument is keep = 'first'

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2022-12-24 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-12-24 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-12-24 19:39:00,63126,18.424,False,Client Services
141,Adam,Male,1990-12-24,2022-12-24 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2022-12-24 11:59:00,71276,5.027,True,Human Resources
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2022-12-24 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2022-12-24 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2022-12-24 01:50:00,133472,16.941,True,Distribution
947,,Male,2012-07-30,2022-12-24 15:07:00,107351,5.329,True,Marketing


In [43]:
# mark last occurence as non-duplicate, i.e., False; default is 'first'
df['First Name'].duplicated(keep = 'last')

101     True
327     True
440     True
937    False
137     True
       ...  
902     True
925     True
946     True
947     True
951    False
Name: First Name, Length: 1000, dtype: bool

In [44]:
# mark any duplicated value as duplicate, i.e., True
df['First Name'].duplicated(keep = False)

101    True
327    True
440    True
937    True
137    True
       ... 
902    True
925    True
946    True
947    True
951    True
Name: First Name, Length: 1000, dtype: bool

In [45]:
# return only employees with unique first names

df['First Name'].duplicated(keep = False)    # True means duplicate; False means means unique

# want to negate the above; use tilde ~

df[~(df['First Name'].duplicated(keep = False))]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2022-12-24 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2022-12-24 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2022-12-24 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2022-12-24 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2022-12-24 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2022-12-24 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2022-12-24 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2022-12-24 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2022-12-24 10:30:00,132839,17.463,True,Client Services


In [46]:
# only employees with unique first names
emp_first_name_once = (
    df['First Name']                  # First Name series
      .value_counts()                 # count how many times each first name appears
      .loc[lambda x: x == 1]          # filter to first names that appear just once; but can use this to change filter to names that appear n times
      .index                          # get those names
)

# filter
df[df['First Name'].isin(emp_first_name_once)]

# result is same as above, but above is cleaner

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2022-12-24 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2022-12-24 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2022-12-24 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2022-12-24 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2022-12-24 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2022-12-24 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2022-12-24 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2022-12-24 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2022-12-24 10:30:00,132839,17.463,True,Client Services


## .drop_duplicates() method
- SQL DISTINT, similar ish, but not really
- subset parameter; argument is list of columns/Series
- keep parameter; argument is 'first', 'last', or False

In [47]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [48]:
# number of rows in df
len(df)

1000

In [49]:
# still 1k rows
len(df.drop_duplicates())
  # because .drop_duplicates() only drops duplicated rows; across all columns/Series

1000

In [53]:
# drop rows where First Name repeats, but keep first occurence
(
    df
      .drop_duplicates(subset = ['First Name'], keep = 'first')  # keep could be 'first', 'last', or False (bool)
      .sort_values(by = 'First Name')
)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-12-24 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2022-12-24 01:45:00,95327,15.120,False,Distribution
53,Alan,,2014-03-03,2022-12-24 13:28:00,40341,17.578,True,Finance
372,Albert,Male,1997-02-01,2022-12-24 16:20:00,67827,19.717,True,Engineering
425,Alice,Female,1986-05-02,2022-12-24 01:50:00,51395,2.378,True,Finance
...,...,...,...,...,...,...,...,...
433,Wanda,Female,2008-07-20,2022-12-24 13:44:00,65362,7.132,True,Legal
177,Wayne,Male,2012-04-07,2022-12-24 08:00:00,102652,14.085,True,Distribution
127,William,Male,2002-09-29,2022-12-24 16:09:00,66521,5.830,False,Human Resources
112,Willie,Male,2003-11-27,2022-12-24 06:21:00,64363,4.023,False,Marketing


In [55]:
# .drop_duplicates() apply to missing values
df['Team'].value_counts(dropna = False)

# no unique teams; would be indicated by count of 1

Client Services         106
Finance                 102
Business Development    101
Marketing                98
Product                  95
Sales                    94
Engineering              92
Human Resources          91
Distribution             90
Legal                    88
NaN                      43
Name: Team, dtype: int64

In [56]:
df.drop_duplicates(subset = ['Team'], keep = False)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team


## .unique() and .nunique() methods

In [57]:
# load data and improve memory usage
df = pd.read_csv('https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/employees.csv', 
                 parse_dates = ['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category') 
df['Team'] = df['Team'].astype('category')

In [58]:
# unique values in Gender
df['Gender'].unique()

['Male', 'Female', NaN]
Categories (2, object): ['Female', 'Male']

In [61]:
# number of unique values in Gender; by default excludes missing
df['Gender'].nunique(dropna = True)

2

In [62]:
# unique teams
df['Team'].unique()

['Marketing', NaN, 'Finance', 'Client Services', 'Legal', ..., 'Engineering', 'Business Development', 'Human Resources', 'Sales', 'Distribution']
Length: 11
Categories (10, object): ['Business Development', 'Client Services', 'Distribution', 'Engineering', ...,
                          'Legal', 'Marketing', 'Product', 'Sales']

In [64]:
# number of unique teams, including missing
df['Team'].nunique(dropna = False)

11

In [65]:
# .nunique() is faster/cleaner way of:
len(df['Team'].unique()) 
  # note that this includes missing values, which .nunique() excludes by default; but can change with dropna parameter

11