In [1]:
import pandas

In [2]:
# Build a small pandas DataFrame from a dictionary: each key becomes a column
# name and each list supplies that column's values, one entry per row.
family = pandas.DataFrame(
    data={
        "Name": ["Paul", "Phillip", "Lisa"],
        "Age": [52, 48, 46],
        "Sex": ["male", "male", "female"],
    }
)

In [3]:
# list the column labels of the DataFrame
# (this is not a numeric summary; family.describe() would give summary statistics for the numeric columns)
family.columns

Index(['Name', 'Age', 'Sex'], dtype='object')

In [4]:
# access just the Age column by indexing the DataFrame with the column name
family["Age"]

0    52
1    48
2    46
Name: Age, dtype: int64

In [5]:
# if we get just one column then the result is a Series, rather than a DataFrame.
# A Pandas Series has only one column, while a Pandas DataFrame can have many columns
type(family["Age"])

pandas.core.series.Series

In [6]:
# find the smallest value in the Age column
family["Age"].min()

46

In [7]:
# find the largest value in the Age column
family["Age"].max()

52

In [10]:
# min also works on string values, where it returns the value that comes first alphabetically
family["Sex"].min()

'female'

In [None]:
# but we can't perform arithmetic summary operations like mean on string columns:
# pandas raises a TypeError here. The error is the point of this cell, so it is caught
# and printed; that way the notebook still runs from top to bottom without stopping.
try:
    family["Sex"].mean()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

In [31]:
# select several columns at once by indexing the DataFrame with a list of column names
family[["Name", "Age"]]

Unnamed: 0,Name,Age
0,Paul,52
1,Phillip,48
2,Lisa,46


In [12]:
# the result is a DataFrame (not a Series)
type(family[["Name", "Age"]])

pandas.core.frame.DataFrame

In [15]:
# select columns using a list that contains only one column name
family[["Age"]]

Unnamed: 0,Age
0,52
1,48
2,46


In [14]:
# the result is a DataFrame even if the list of columns has only one entry
type(family[["Age"]])

pandas.core.frame.DataFrame

In [17]:
# Test each row to determine whether Age is less than 50
# The result is a Series of Boolean values, one per row
family["Age"] < 50

0    False
1     True
2     True
Name: Age, dtype: bool

In [32]:
# confirm that the result of the comparison is itself a Series
type(family["Age"] < 50)

pandas.core.series.Series

In [19]:
# store the result of these comparisons in a variable so we can use it to index into a DataFrame
young = family["Age"] < 50
# display the stored Series of Boolean values
young

0    False
1     True
2     True
Name: Age, dtype: bool

In [20]:
# retrieve only those rows from the family DataFrame where the young condition is True
family[young]

Unnamed: 0,Name,Age,Sex
1,Phillip,48,male
2,Lisa,46,female


In [22]:
# family.Age (attribute-style access) is equivalent to family["Age"]
family.Age < 50

0    False
1     True
2     True
Name: Age, dtype: bool

In [33]:
# We can try to combine our young condition with an "is male" condition.
# But we get an error: the `and` operator needs a single True/False value for its left operand,
# and a Series of Booleans has no unambiguous truth value.
# The error is the point of this cell, so it is caught and printed; that way the notebook
# still runs from top to bottom without stopping.
try:
    family.Age < 50 and family.Sex == "male"
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# We can combine two Series of Boolean values row-wise with the & operator, but this attempt
# doesn't work because & has higher precedence than the comparison operators < and ==.
# So Python parses the code below as the chained comparison: family.Age < (50 & family.Sex) == "male"
# Evaluating 50 & family.Sex on a string column is expected to raise a TypeError; ValueError is
# caught as well because a chained comparison of Series would raise it if the & step succeeded.
# The error is the point of this cell, so it is caught and printed; that way the notebook
# still runs from top to bottom without stopping.
try:
    family.Age < 50 & family.Sex == "male"
except (TypeError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

In [25]:
# So we need to insert parentheses so that the two comparison operations get done first,
# before we attempt to combine those Series of Boolean values row-wise using the & operator
(family.Age < 50) & (family.Sex == "male")

0    False
1     True
2    False
dtype: bool

In [26]:
# Let's store the combined condition in a variable so we can use it to index into the family DataFrame
young_male = (family.Age < 50) & (family.Sex == "male")

In [27]:
# only one family member is both young and male, so only one row is returned
family[young_male]

Unnamed: 0,Name,Age,Sex
1,Phillip,48,male


In [28]:
# We don't need to store the Series of Boolean values in a variable;
# we can use the expression directly when indexing into the family DataFrame
family[(family.Age < 50) & (family.Sex == "male")]

Unnamed: 0,Name,Age,Sex
1,Phillip,48,male


In [29]:
# Let's change the condition to young or male, using the | operator to combine the two Series row-wise
young_or_male = (family.Age < 50) | (family.Sex == "male")

In [30]:
# it turns out that every family member is young, male, or both, so all rows are returned
family[young_or_male]

Unnamed: 0,Name,Age,Sex
0,Paul,52,male
1,Phillip,48,male
2,Lisa,46,female
