In [None]:
import pandas as pd

In [None]:
# Load the CSV with no dtype conversion. The cells that follow inspect the raw
# columns and then convert them one step at a time (dates, bool flag,
# category); converting here would turn every one of those steps into a no-op.
df = pd.read_csv("employees.csv")
df.head()

In [None]:
# Review dtypes and non-null counts: note which columns contain nulls, and
# that the date columns should be real datetimes rather than plain text.
df.info()

In [None]:
# Preview the leading values of the start-date column.
start_dates = df["Start Date"]
start_dates.head()

In [None]:
# Parse the start dates into datetime values, then preview the frame.
parsed_start = pd.to_datetime(df["Start Date"])
df["Start Date"] = parsed_start
df.head()

In [None]:
# Time-only values are parsed with the current date attached; that date part
# carries no meaning and should be ignored.
parsed_login = pd.to_datetime(df["Last Login Time"])
df["Last Login Time"] = parsed_login
df.head()

In [None]:
# Use the nullable "boolean" dtype (pandas >= 1.0). A plain "bool" cast cannot
# represent a missing value and silently turns every NaN into True.
df["Senior Management"] = df["Senior Management"].astype("boolean")

In [None]:
# Store the gender column as a categorical.
df = df.astype({"Gender": "category"})

# Filter A `DataFrame` Based On A Condition

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
df.head()

In [None]:
# Boolean mask: True where the gender is "Male". It is bound to a name
# because a cell only displays its last expression, so a bare comparison on
# its own line would be evaluated and thrown away.
is_male = df["Gender"] == "Male"
# filter to male rows
df[is_male]

In [None]:
# Keep only the rows whose team is Finance.
mask = df["Team"].eq("Finance")
df.loc[mask]

In [None]:
# Keep only the rows flagged as senior management.
mask = df["Senior Management"].eq(True)
df.loc[mask]

In [None]:
# Everyone outside Marketing. Rows with a missing Team also pass this test,
# because NaN compares as "not equal" to any value.
mask = df["Team"] != "Marketing"
df[mask]

In [None]:
# Employees earning more than 110,000. The mask is built once and reused for
# the selection instead of being computed a second time.
high_earner = df["Salary"] > 110000
df[high_earner]

In [None]:
# Employees whose bonus percentage is below 1.5.
low_bonus = df["Bonus %"].lt(1.5)
df.loc[low_bonus]

In [None]:
# Employees who started on or before 1 January 1985.
mask = df["Start Date"].le("1985-01-01")
df.loc[mask]

# Filter with More than One Condition (AND)

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
df.head()

In [None]:
# AND: a row is kept only when both conditions hold.
is_male = df["Gender"].eq("Male")
in_marketing = df["Team"].eq("Marketing")
both = is_male & in_marketing

df.loc[both].head()

# Filter with More than One Condition (OR)

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True, and this section filters
# directly on that column.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
df.head()

In [None]:
# OR: a row is kept when either condition is true, or both are.
senior = df["Senior Management"]  # the column already holds booleans
early_start = df["Start Date"].lt("1990-01-01")

df.loc[senior | early_start]

In [None]:
# (named Robert AND in Client Services) OR started after 1 June 2016.
is_robert = df["First Name"].eq("Robert")
in_client_services = df["Team"].eq("Client Services")
recent_start = df["Start Date"].gt("2016-06-01")

robert_in_cs = is_robert & in_client_services
df.loc[robert_in_cs | recent_start]

# The `.isin()` Method

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
df.head()

In [None]:
# Keep rows whose team is any one of Legal, Sales or Product.
wanted_teams = ["Legal", "Sales", "Product"]
mask = df["Team"].isin(wanted_teams)
df.loc[mask]

# The `.isnull()` and `.notnull()` Methods

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
df.head()

In [None]:
# Rows where no team is recorded.
team_missing = df["Team"].isnull()
df.loc[team_missing].head()

In [None]:
# Rows where a gender is recorded.
has_gender = df["Gender"].notnull()
df.loc[has_gender]

# The `.between()` Method

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
df.head()

In [None]:
# Salaries from 60,000 to 70,000 (both bounds included).
salary_in_band = df["Salary"].between(60000, 70000)
df.loc[salary_in_band]

In [None]:
# Bonus percentages from 2.0 to 5.0 (both bounds included).
bonus_in_band = df["Bonus %"].between(2.0, 5.0)
df.loc[bonus_in_band]

In [None]:
# Start dates from 1 January 1991 to 1 January 1992 (both bounds included).
started_in_window = df["Start Date"].between("1991-01-01", "1992-01-01")
df.loc[started_in_window].head()

In [None]:
# Compare on the time of day only. The parsed column carries a placeholder
# date, so matching it against bare time strings is correct only while both
# sides happen to be given the same date.
login_time = df["Last Login Time"].dt.time
window_start = pd.Timestamp("08:30AM").time()
window_end = pd.Timestamp("12:00PM").time()
# Logins from 08:30 to 12:00 (both bounds included).
df[login_time.between(window_start, window_end)]

# The `.duplicated()` Method

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
# Sort by first name so repeated names sit next to each other; the sorted
# frame is assigned back rather than mutated with inplace=True.
df = df.sort_values("First Name")
df.head()

In [None]:
# Flag each first name that has already appeared earlier in the column.
first_names = df["First Name"]
first_names.duplicated()

In [None]:
# Return the duplicated rows. With the default keep="first", the first
# occurrence of a name is left unmarked and only later repeats are flagged.
repeat_name = df["First Name"].duplicated(keep="first")
df.loc[repeat_name]

In [None]:
# duplicated(keep="last") flags every occurrence except the final one, while
# duplicated(keep=False) flags every name that occurs more than once.
# Negating the keep=False mask leaves only the rows whose first name occurs
# exactly once.
mask = ~df["First Name"].duplicated(keep=False)
df[mask]

# The `.drop_duplicates()` Method

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
# Sort by first name so repeated names sit next to each other; the sorted
# frame is assigned back rather than mutated with inplace=True.
df = df.sort_values("First Name")
df.head()

In [None]:
# Number of rows in the frame.
df.shape[0]

In [None]:
# drop_duplicates() with no arguments drops a row only when the full row is a
# duplicate; subset=["First Name"] would compare on the first name alone.
# NULL/NaN values are handled the same way as any other value.
# Keep the first row for each (First Name, Team) pair. The result is assigned
# back to df instead of using inplace=True.
df = df.drop_duplicates(subset=["First Name", "Team"], keep="first")
df.head()

# The `.unique()` and `.nunique()` Methods

In [None]:
# Reload the data so this section starts from a clean frame.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# "boolean" is the nullable dtype (pandas >= 1.0); casting to plain "bool"
# would silently turn any missing flag into True.
df = df.astype({"Senior Management": "boolean", "Gender": "category"})
# Sort by first name; the sorted frame is assigned back rather than mutated
# with inplace=True.
df = df.sort_values("First Name")
df.head()

In [None]:
# Distinct values found in the gender column.
genders = df["Gender"]
genders.unique()

In [None]:
# Distinct values found in the team column.
teams = df["Team"]
teams.unique()

In [None]:
# How many distinct team values unique() returns.
distinct_teams = df["Team"].unique()
len(distinct_teams)

In [None]:
# nunique() does not count NULL/NaN by default; dropna=False counts it as one
# more distinct value. Both counts are returned as a tuple because a cell
# only displays its last expression.
teams = df["Team"]
teams.nunique(), teams.nunique(dropna=False)