## Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
import datetime as dt

import utils_05 as utils

%load_ext autoreload
%autoreload 2

## 01 Data Loading and Preprocessing

In [39]:
# Build the project helper object; the employee table is exposed as the
# `.employees` DataFrame (how it is loaded lives in utils_05 — not shown here).
emp = utils.Employees()
# Preview the first five rows.
emp.employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services


In [40]:
emp.employees.info()  # Column dtypes, non-null counts per column, and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   object        
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 49.1+ KB


## 02 Filter a DataFrame Based on a Condition

In [41]:
# Re-create the helper at the start of the section
# (presumably so this section can be run on its own — confirm).
emp = utils.Employees()

In [43]:
# Filter rows with gender 'Male' using loc, building the boolean condition
# inline; emp.GENDER supplies the column label.
male_employees = emp.employees.loc[emp.employees[emp.GENDER] == 'Male']
male_employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal


In [44]:
# Same filter as the inline version, but with the boolean condition stored in
# a named mask first, which keeps the .loc call short and lets the mask be reused.
male_mask = emp.employees[emp.GENDER] == 'Male'
male_employees = emp.employees.loc[male_mask]
male_employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal


In [47]:
# Keep only the employees whose start date is strictly earlier than 1985-01-01.
# The comparison value is a pd.Timestamp so it lines up with the datetime64 column.
# NOTE(review): other cells take column labels from emp.<CONSTANT>; a matching
# constant for "Start Date" may exist in utils_05 — confirm before switching.
employees = emp.employees.copy()
mask = employees["Start Date"].lt(pd.Timestamp("1985-01-01"))
filtered_employees = employees.loc[mask]
filtered_employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,20:49:00,57427,9.557,True,Client Services


In [51]:
# Filter last login times before 12:00 PM (noon, exclusive).
# The column is object dtype and is compared element-wise against a
# datetime.time — presumably it holds datetime.time values; verify in utils_05.
employees = emp.employees.copy()
mask = employees[emp.LAST_LOGIN_TIME] < dt.time(12, 0, 0)
filtered_employees = employees.loc[mask]
filtered_employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering


## 03 The `isin` Method

In [53]:
# Fresh helper plus a working copy, so filtering in this section never
# touches the DataFrame held by `emp`.
emp = utils.Employees()
employees = emp.employees.copy()

In [54]:
# Preview the working copy.
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services


In [57]:
# List the distinct team labels (missing values show up as nan) to see
# which values are available for filtering.
employees[emp.TEAM].unique()

array(['Marketing', nan, 'Finance', 'Client Services', 'Legal', 'Product',
       'Engineering', 'Business Development', 'Human Resources', 'Sales',
       'Distribution'], dtype=object)

In [58]:
# Find employees in Legal, Sales or Product teams.
# isin() checks each row against the whole list in one call; rows with a
# missing team are not in the list and are therefore excluded.
teams_of_interest = ["Legal", "Sales", "Product"]
mask = employees[emp.TEAM].isin(teams_of_interest)
filtered_employees = employees.loc[mask]
filtered_employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product


## 04 The `isnull` and `notnull` Methods

In [59]:
# Start the section from an unmodified copy of the employee table.
employees = emp.employees.copy()
employees.info()  # Non-null counts below 1000 reveal which columns have missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   object        
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 49.1+ KB


In [60]:
# Report how many values are missing in each column.
# isnull() flags every missing cell; summing the flags column-wise gives one
# count per column, which is then printed in column order.
missing_per_column = employees.isnull().sum()
for col, num_missing in missing_per_column.items():
    print(f"Column '{col}' has {num_missing} missing values.")

Column 'First Name' has 67 missing values.
Column 'Gender' has 145 missing values.
Column 'Start Date' has 0 missing values.
Column 'Last Login Time' has 0 missing values.
Column 'Salary' has 0 missing values.
Column 'Bonus %' has 0 missing values.
Column 'Senior Management' has 0 missing values.
Column 'Team' has 43 missing values.


## 05 The `duplicated` Method

In [55]:
# Fresh helper and working copy for the duplicates section.
emp = utils.Employees()
employees = emp.employees.copy()
# Row/column count before any rows are removed.
employees.shape

(1000, 8)

#### DEBUGGING

In [58]:
# Drop rows without a first name before looking for duplicates.
# value_counts() skips missing values by default, whereas duplicated() treats
# missing values as equal to one another, so leaving them in would make the
# two approaches below disagree (presumably the reason for this step — confirm).
# NOTE(review): this rebinds `employees`; re-run the section's first cell to
# get the unfiltered copy back.
employees = employees.dropna(subset=[emp.FIRST_NAME])
employees.shape

(933, 8)

In [59]:
# Confirm that no missing first names remain after the dropna.
employees[emp.FIRST_NAME].info()

<class 'pandas.core.series.Series'>
Index: 933 entries, 0 to 999
Series name: First Name
Non-Null Count  Dtype 
--------------  ----- 
933 non-null    object
dtypes: object(1)
memory usage: 14.6+ KB


In [60]:
# Approach v1, step 1: work out which first names occur more than once.

# Frequency of every first name (index = name, value = number of rows)
name_counts = employees[emp.FIRST_NAME].value_counts()

# True for names that are shared by at least two rows
mask_name_counts = name_counts.gt(1)

# The shared names themselves, as a plain Python list
duplicate_names_v1 = name_counts.loc[mask_name_counts].index.to_list()

# Show a small sample together with the total number of shared names
duplicate_names_v1[:5], len(duplicate_names_v1)

(['Marilyn', 'Barbara', 'Jeremy', 'Todd', 'Steven'], 191)

In [61]:
# Approach v1, step 2: select every row whose first name is in the list of
# shared names.

# Create a mask for employees with duplicate first names
mask_duplicates_v1 = employees[emp.FIRST_NAME].isin(duplicate_names_v1)

# Filter employees with duplicate first names
names_duplicated_v1 = employees.loc[mask_duplicates_v1]
# Displays the whole frame (pandas truncates the middle rows in the output).
names_duplicated_v1

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,16:45:00,60500,11.985,False,Business Development


In [62]:
# Sanity check: every first name in the filtered frame must be one of the
# names that occur more than once. Nothing is printed when the check passes.
# The names are read straight from the column (no per-row Series from
# iterrows) and looked up in a set, so each membership test is constant time
# instead of a scan through the list.
known_duplicates = set(duplicate_names_v1)
for first_name in names_duplicated_v1[emp.FIRST_NAME]:
    if first_name not in known_duplicates:
        print(f"First name {first_name} is not duplicated!")

In [63]:
# Filter non-duplicate first names: inverting the duplicate mask with ~
# selects exactly the rows whose first name occurs once.
mask_non_duplicates_v1 = ~mask_duplicates_v1
names_not_duplicated_v1 = employees.loc[mask_non_duplicates_v1]
names_not_duplicated_v1.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,10:30:00,132839,17.463,True,Client Services


In [64]:
# The two row counts should add up to the number of rows left after dropna.
names_duplicated_v1.shape, names_not_duplicated_v1.shape

((924, 8), (9, 8))

In [68]:
# Approach v2: let pandas flag the duplicates directly.
# keep=False marks every row of a repeated first name (not only the later
# occurrences). The mask is computed once and reused for both selections,
# mirroring the named-mask + .loc style of the v1 cells.
mask_duplicates_v2 = employees.duplicated(subset=emp.FIRST_NAME, keep=False)

# All rows where 'First Name' appears more than once (duplicated)
names_duplicated_v2 = employees.loc[mask_duplicates_v2]

# Rows where 'First Name' appears only once (not duplicated)
names_not_duplicated_v2 = employees.loc[~mask_duplicates_v2]

# Check shapes (should match the _v1 result: (924, 8) and (9, 8))
names_duplicated_v2.shape, names_not_duplicated_v2.shape

((924, 8), (9, 8))

In [69]:
# Show all rows whose first name is unique (small enough to display in full).
names_not_duplicated_v2

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,10:30:00,132839,17.463,True,Client Services
495,Eugene,Male,1984-05-24,10:54:00,81077,2.117,False,Sales
688,Brian,Male,2007-04-07,22:47:00,93901,17.821,True,Legal
832,Keith,Male,2003-02-12,15:02:00,120672,19.467,False,Legal
887,David,Male,2009-12-05,08:48:00,92242,15.407,False,Legal


In [70]:
# Approach v3: get the rows with a unique first name in a single call.
# drop_duplicates with keep=False removes every row of a repeated name, so
# only the names that occur exactly once remain.
names_not_duplicated_v3 = employees.drop_duplicates(subset=emp.FIRST_NAME, keep=False)
names_not_duplicated_v3

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,10:30:00,132839,17.463,True,Client Services
495,Eugene,Male,1984-05-24,10:54:00,81077,2.117,False,Sales
688,Brian,Male,2007-04-07,22:47:00,93901,17.821,True,Legal
832,Keith,Male,2003-02-12,15:02:00,120672,19.467,False,Legal
887,David,Male,2009-12-05,08:48:00,92242,15.407,False,Legal
