# Introduction to Pandas

Pandas documentation: http://pandas.pydata.org/pandas-docs/stable/ (homepage: https://pandas.pydata.org/)

Notes from Chapter 3 of https://jakevdp.github.io/PythonDataScienceHandbook/

In [1]:
import numpy as np
import pandas as pd
np.__version__, pd.__version__

('1.26.4', '2.2.3')

In [3]:
# show: print a compact structural summary of a pandas object
def show(data, show_data=0):
    """Print the type, index, columns (DataFrame only), shape and, optionally, values.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Object to summarize; it must expose ``index``, ``shape`` and ``values``.
    show_data : bool or int, default 0
        When truthy, also print ``data.values``.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    print("  Type: {:}".format(type(data)))
    print(" Index: {:}".format(data.index))
    # isinstance (rather than an exact type comparison) also covers DataFrame subclasses.
    if isinstance(data, pd.DataFrame):
        print("Columns: {:}".format(data.columns))
    print(" Shape: {:}".format(data.shape))
    if show_data:
        print("  Data: {:}".format(data.values))

## Pandas Series Objects

In [4]:
# Create a Pandas Series object - uses the default (implicit) index
d1 = pd.Series([0.25, 0.5, 0.75, 1.0])
show(d1, 1)

  Type: <class 'pandas.core.series.Series'>
 Index: RangeIndex(start=0, stop=4, step=1)
 Shape: (4,)
  Data: [0.25 0.5  0.75 1.  ]


In [5]:
d1

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# So, what exactly is a Pandas Series?
# See the data types for the Series and its components
type(d1), type(d1.values), type(d1.index)

(pandas.core.series.Series,
 numpy.ndarray,
 pandas.core.indexes.range.RangeIndex)

In [7]:
# What is this pandas.core.series?
dir(pd.core.series)

['ABCDataFrame',
 'ABCSeries',
 'Any',
 'Appender',
 'CachedAccessor',
 'Callable',
 'CategoricalAccessor',
 'CategoricalDtype',
 'ChainedAssignmentError',
 'CombinedDatetimelikeProperties',
 'DatetimeIndex',
 'ExtensionArray',
 'ExtensionDtype',
 'Hashable',
 'INFO_DOCSTRING',
 'IO',
 'Index',
 'InvalidIndexError',
 'Iterable',
 'ListAccessor',
 'Literal',
 'LossySetitemError',
 'Mapping',
 'MultiIndex',
 'NDFrame',
 'PYPY',
 'PeriodIndex',
 'REF_COUNT',
 'Sequence',
 'Series',
 'SeriesApply',
 'SeriesInfo',
 'SingleArrayManager',
 'SingleBlockManager',
 'SparseAccessor',
 'SparseDtype',
 'StringDtype',
 'StringMethods',
 'StructAccessor',
 'Substitution',
 'TYPE_CHECKING',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_chained_assignment_method_msg',
 '_chained_assignment_msg',
 '_check_cacher',
 '_coerce_method',
 '_get_option',
 '_shared_doc_kwargs',
 '_shared_docs',
 'algorithms',
 'annotations',
 'a

In [8]:
# pandas.core indexes?
dir(pd.core.indexes)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'accessors',
 'api',
 'base',
 'category',
 'datetimelike',
 'datetimes',
 'extension',
 'frozen',
 'interval',
 'multi',
 'period',
 'range',
 'timedeltas']

In [9]:
# pandas.core.indexes.range
dir(pd.core.indexes.range)

['ABCTimedeltaIndex',
 'Any',
 'Callable',
 'Hashable',
 'Index',
 'Iterator',
 'Literal',
 'RangeIndex',
 'TYPE_CHECKING',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dtype_int64',
 '_empty_range',
 'annotations',
 'cache_readonly',
 'cast',
 'com',
 'deprecate_nonkeyword_arguments',
 'doc',
 'ensure_platform_int',
 'ensure_python_int',
 'extract_array',
 'getsizeof',
 'ibase',
 'is_float',
 'is_integer',
 'is_scalar',
 'is_signed_integer_dtype',
 'lib',
 'libindex',
 'maybe_extract_name',
 'no_default',
 'np',
 'nv',
 'operator',
 'ops',
 'overload',
 'timedelta',
 'unique_deltas',
 'unpack_zerodim_and_defer']

Note that we did not explicitly define an index -- the index was created implicitly when we created the Series object.

### Series as Generalized NumPy Arrays and Generalized Python Dictionaries

In [10]:
# Series as generalized NumPy arrays
# Create the Series with the specified (explicit) index values
# The "generalization" here is that we are defining a custom index.
d2 = pd.Series([0.25, 0.5, 0.75, 1.0],
               index=['a', 'b', 'c', 'd'])
show(d2, 1)

  Type: <class 'pandas.core.series.Series'>
 Index: Index(['a', 'b', 'c', 'd'], dtype='object')
 Shape: (4,)
  Data: [0.25 0.5  0.75 1.  ]


In [11]:
d2

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [12]:
d2[3]
# As expected, this is the 4th element, at (implicit) position 3.
# This may trigger a FutureWarning: the index labels here are strings, and using an
# integer key positionally with [] on such a Series is deprecated.
# All Python libraries need updates and maintenance; a FutureWarning gives users a
# heads-up about a change that will be made in a future release.
# Under the pandas version used in this notebook, the code still works.

  d2[3]


1.0

In [13]:
### this will avoid the FutureWarning message. 
d2.iloc[3]

1.0

In [14]:
# Slicing - Note that element 3 is not included (as we expect)
d2[1:3]

b    0.50
c    0.75
dtype: float64

In [15]:
### Although the above cell doesn't produce a FutureWarning message,
### it's still good practice to use the explicit positional indexer (iloc).
d2.iloc[1:3]

b    0.50
c    0.75
dtype: float64

In [16]:
# Using the explicit index, we directly access the values using the defined index (similar to a dictionary)
d2['d']

1.0

In [17]:
# Note that element 'c' is included here - More on this below (loc, iloc)
d2['b':'c']

b    0.50
c    0.75
dtype: float64

In [18]:
# Note that the indices don't have to be numerical or "alphabetic" in the sequence-sense
pets = pd.Series(['dog', 'cat', 'fish', 'hamster'],
               index=['best', 'worst', 'useless', 'why'])
show(pets, 1)
pets

  Type: <class 'pandas.core.series.Series'>
 Index: Index(['best', 'worst', 'useless', 'why'], dtype='object')
 Shape: (4,)
  Data: ['dog' 'cat' 'fish' 'hamster']


best           dog
worst          cat
useless       fish
why        hamster
dtype: object

In [19]:
# access single object
pets['useless'], type(pets['useless'])

('fish', str)

In [20]:
# slice
pets['useless':]

useless       fish
why        hamster
dtype: object

In [21]:
type(pets['useless':])

pandas.core.series.Series

In [26]:
# Try the code below, this can be confusing.
pets['best':'useless'], pets.iloc[0 : 3]

(best        dog
 worst       cat
 useless    fish
 dtype: object,
 best        dog
 worst       cat
 useless    fish
 dtype: object)

#### Creating a Series from a Dictionary

In [27]:
# Creating a Series from a dictionary - keys -> Index; values -> Values.  
# Note that Pandas used to sort the Series by the dictionary keys.
# As of version 0.23, it maintains the order specified in the
# dict definition (see the documentation for details).
population_dict = {'California': 38332521,
                   'Texas'     : 26448193,
                   'New York'  : 19651127,
                   'Florida'   : 19552860,
                   'Illinois'  : 12882135}
population = pd.Series(population_dict)
show(population, 1)

  Type: <class 'pandas.core.series.Series'>
 Index: Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
 Shape: (5,)
  Data: [38332521 26448193 19651127 19552860 12882135]


In [28]:
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [29]:
# Note again that the Pandas object no longer stores in sort order.
sorted(population_dict.keys())

['California', 'Florida', 'Illinois', 'New York', 'Texas']

In [30]:
# Looks like a dictionary ....
population['Illinois']

12882135

In [31]:
# but with list-like slicing
population['Florida':'Illinois']

Florida     19552860
Illinois    12882135
dtype: int64

In [32]:
# along with "regular" slicing
population[3:5]

Florida     19552860
Illinois    12882135
dtype: int64

In [33]:
# Create a second series with geographical area
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
show(area,1)
area

  Type: <class 'pandas.core.series.Series'>
 Index: Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
 Shape: (5,)
  Data: [423967 695662 141297 170312 149995]


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [34]:
# So, now we have two series that use the same index for different categories
# of values (population, area)
print("{:}\n\n{:}".format(population, area))
# Hopefully the direction is clear --> let's combine these using meaningful names.

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


## Pandas DataFrame Objects

A Pandas DataFrame is a two-dimensional, mutable, and heterogeneous tabular data structure. It's one of the most fundamental and widely used data structures in the Pandas library for Python, designed for efficient data manipulation and analysis.

Each dataframe column can hold a different data type (e.g., numbers, text, dates, booleans), making it "heterogeneous."

Unlike NumPy arrays, DataFrames have both row and column labels (also known as indices).

In [35]:
# DataFrame is the ticket
states = pd.DataFrame({'population': population,'area': area})
show(states,1)

  Type: <class 'pandas.core.frame.DataFrame'>
 Index: Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Columns: Index(['population', 'area'], dtype='object')
 Shape: (5, 2)
  Data: [[38332521   423967]
 [26448193   695662]
 [19651127   141297]
 [19552860   170312]
 [12882135   149995]]


In [36]:
# Or in a user-friendly display
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [38]:
# Closer look at the DataFrame components and respective data types (note that these are the things
# that are shown using the show() function.)
states.index, type(states.index[1])

(Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object'),
 str)

In [39]:
# All column names
states.columns

Index(['population', 'area'], dtype='object')

In [41]:
# All values
states.values, type(states.values)

(array([[38332521,   423967],
        [26448193,   695662],
        [19651127,   141297],
        [19552860,   170312],
        [12882135,   149995]], dtype=int64),
 numpy.ndarray)

In [42]:
type(states), type(states.index), type(states.columns), type(states.values)
# Note here that both the index and the column sequence are defined as index objects

(pandas.core.frame.DataFrame,
 pandas.core.indexes.base.Index,
 pandas.core.indexes.base.Index,
 numpy.ndarray)

In [43]:
# extract one of the two Series using its column name
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [44]:
# now the other Series ...
states['population']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64

In [45]:
# and the respective types
type(states), type(states['area']), type(states['population'])

(pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 pandas.core.series.Series)

In [46]:
# let's add another column -- this time of non-numeric data
color_dict = {'California': 'blue', 'Texas': 'red', 'New York': 'blue',
             'Florida': 'purple', 'Illinois': 'purple'}
color = pd.Series(color_dict)
show(color,1)
color

  Type: <class 'pandas.core.series.Series'>
 Index: Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
 Shape: (5,)
  Data: ['blue' 'red' 'blue' 'purple' 'purple']


California      blue
Texas            red
New York        blue
Florida       purple
Illinois      purple
dtype: object

In [47]:
# add our new column to the DataFrame; assigning the Series itself (not its bare
# .values array) lets pandas align the rows by index label rather than by position
states['color'] = color
show(states,1)

  Type: <class 'pandas.core.frame.DataFrame'>
 Index: Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Columns: Index(['population', 'area', 'color'], dtype='object')
 Shape: (5, 3)
  Data: [[38332521 423967 'blue']
 [26448193 695662 'red']
 [19651127 141297 'blue']
 [19552860 170312 'purple']
 [12882135 149995 'purple']]


In [48]:
# user-friendly form
states

Unnamed: 0,population,area,color
California,38332521,423967,blue
Texas,26448193,695662,red
New York,19651127,141297,blue
Florida,19552860,170312,purple
Illinois,12882135,149995,purple


In [49]:
# since NumPy arrays are homogeneous, the values array is now of type 'object'
states.values, type(states.values)

(array([[38332521, 423967, 'blue'],
        [26448193, 695662, 'red'],
        [19651127, 141297, 'blue'],
        [19552860, 170312, 'purple'],
        [12882135, 149995, 'purple']], dtype=object),
 numpy.ndarray)

In [50]:
# but each column's values array keeps that column's own dtype.
states['area'].values, states['population'].values, states['color'].values

(array([423967, 695662, 141297, 170312, 149995], dtype=int64),
 array([38332521, 26448193, 19651127, 19552860, 12882135], dtype=int64),
 array(['blue', 'red', 'blue', 'purple', 'purple'], dtype=object))

## Pandas Index Objects

From VanderPlas (VP) 03.01 - "The index object ... can be thought of as an immutable array or an ordered set (technically as a multi-set, as Index objects can have repeat values)."

In [51]:
ind1 = states.index
ind1, type(ind1)

(Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object'),
 pandas.core.indexes.base.Index)

In [52]:
# Array-type operations
ind1[2:]

Index(['New York', 'Florida', 'Illinois'], dtype='object')

In [53]:
ind1[-2:]

Index(['Florida', 'Illinois'], dtype='object')

In [54]:
ind1[::2]

Index(['California', 'New York', 'Illinois'], dtype='object')

In [55]:
# immutable: item assignment on an Index raises TypeError.
# The error is caught and displayed so that "Restart & Run All" can continue
# past this cell instead of stopping at the exception.
try:
    ind1[1] = 'Alabama'
except TypeError as err:
    print("TypeError: {:}".format(err))

TypeError: Index does not support mutable operations

In [56]:
# Set operations -- union, intersection, etc.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

In [58]:
indA & indB  # NOT a set intersection here -- compare with the output below
# Older pandas releases treated & on Index objects as a set intersection (with a
# deprecation warning); under the pandas 2.2.3 used in this notebook it is

# a bitwise AND operation: https://www.geeksforgeeks.org/python/python-bitwise-operators/
# Please read the above link and explain the results in the DF 5

Index([0, 3, 5, 7, 9], dtype='int64')

In [59]:
indA.intersection(indB)

Index([3, 5, 7], dtype='int64')

In [60]:
indA | indB  # NOT a set union here -- compare with the output below
# a bitwise OR operation: https://www.geeksforgeeks.org/python/python-bitwise-operators/
# Please read the above link and explain the results in the DF 5

Index([3, 3, 5, 7, 11], dtype='int64')

In [61]:
indA.union(indB)

Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [None]:
indA ^ indB  # element-wise bitwise XOR in pandas 2.x, not the set symmetric difference (use the method in the next cell)

In [None]:
indA.symmetric_difference(indB)

## Indexers - loc and iloc

<span style="color:red">This is important and confusing -- at first</span>

In [62]:
# Consider this example from VanderPlas
data = pd.Series(['a', 'b', 'c', 'd'], index=[1, 3, 5, 7])
show(data)
data

  Type: <class 'pandas.core.series.Series'>
 Index: Index([1, 3, 5, 7], dtype='int64')
 Shape: (4,)


1    a
3    b
5    c
7    d
dtype: object

In [65]:
# Explicit index - note that with a zero-based indexing, we 
# would expect the second element of the list.
data[1], data.iloc[0]

('a', 'a')

In [68]:
# Implicit indexing when slicing - now we get the second and third 
# elements.
data[1:3], data.iloc[1: 3]

(3    b
 5    c
 dtype: object,
 3    b
 5    c
 dtype: object)

This can cause confusion -- when is explicit indexing used and when is implicit indexing used ... Hence, loc and iloc.

In [69]:
# loc - always uses the explicit index 
data.loc[1]

'a'

In [70]:
data.loc[1:3]

1    a
3    b
dtype: object

In [71]:
# iloc - always uses the implicit Python-style index
data.iloc[1]

'b'

In [72]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [73]:
# Note that even with non-integer indexes the slicing behavior is still different
data1 = pd.Series(['a', 'b', 'c', 'd'], index=['h','i','j','k'])
data1

h    a
i    b
j    c
k    d
dtype: object

In [74]:
data1[1:3]

i    b
j    c
dtype: object

In [75]:
data1.iloc[1:3]

i    b
j    c
dtype: object

In [77]:
data1.loc['i':'j']

i    b
j    c
dtype: object

## Data selection in a DataFrame

In [78]:
# Recreate our states DataFrame ...
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
color = pd.Series({'California': 'blue', 'Texas': 'red',
                 'New York': 'blue', 'Florida': 'red',
                 'Illinois': 'purple'})
data = pd.DataFrame({'area':area, 'pop':pop, 'color':color})
show(data)
data

  Type: <class 'pandas.core.frame.DataFrame'>
 Index: Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Columns: Index(['area', 'pop', 'color'], dtype='object')
 Shape: (5, 3)


Unnamed: 0,area,pop,color
California,423967,38332521,blue
Texas,695662,26448193,red
New York,141297,19651127,blue
Florida,170312,19552860,red
Illinois,149995,12882135,purple


In [79]:
# dictionary style indexing -- picking columns
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [80]:
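# attribute-style access works when the column name is a valid Python identifier
# and is not a reserved word or the name of an existing DataFrame attribute/method
# (e.g. data.pop is the DataFrame.pop method, not the 'pop' column)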
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [82]:
# What is the column that is returned
type(data.area), type(data['area'])

(pandas.core.series.Series, pandas.core.series.Series)

In [83]:
# lists of columns -- note that you can specify the order and note
# the double brackets.
data[['pop', 'area']]

Unnamed: 0,pop,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [84]:
# and what is the type of the returned object?
type(data[['pop', 'area']])

pandas.core.frame.DataFrame

In [85]:
# Add a new series using existing series
data['density'] = data['pop'] / data['area']
data
# Note the ufunc (universal function) behavior

Unnamed: 0,area,pop,color,density
California,423967,38332521,blue,90.413926
Texas,695662,26448193,red,38.01874
New York,141297,19651127,blue,139.076746
Florida,170312,19552860,red,114.806121
Illinois,149995,12882135,purple,85.883763


In [86]:
# loc and iloc do element selection and slicing (rows)
data.iloc[1,2]

'red'

In [87]:
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [88]:
data.iloc[1:3]

Unnamed: 0,area,pop,color,density
Texas,695662,26448193,red,38.01874
New York,141297,19651127,blue,139.076746


In [89]:
# Oops -- not using numerical index! The row labels are strings, so integer
# bounds are not valid for a label-based .loc slice.
# The TypeError is caught and displayed so that "Restart & Run All" can
# continue past this cell instead of stopping at the exception.
try:
    data.loc[1:3]
except TypeError as err:
    print("TypeError: {:}".format(err))

TypeError: cannot do slice indexing on Index with these indexers [1] of type int

In [90]:
# Notice that this slice includes the "ending row" where the
# iloc-based slicing did not (it used standard Python slicing)
data.loc['Texas':'Florida']

Unnamed: 0,area,pop,color,density
Texas,695662,26448193,red,38.01874
New York,141297,19651127,blue,139.076746
Florida,170312,19552860,red,114.806121


In [91]:
# Masking
data[data.density > 100]
# what does 'data.density > 100' return (it's an expression that
# is evaluated)?

Unnamed: 0,area,pop,color,density
New York,141297,19651127,blue,139.076746
Florida,170312,19552860,red,114.806121


In [93]:
# Masking
data.density > 100, type(data.density > 100)

(California    False
 Texas         False
 New York       True
 Florida        True
 Illinois      False
 Name: density, dtype: bool,
 pandas.core.series.Series)

In [94]:
# when in doubt, check it out!
data.density > 100

California    False
Texas         False
New York       True
Florida        True
Illinois      False
Name: density, dtype: bool

In [95]:
data[(data.density > 100) & (data.area < 150000)]

Unnamed: 0,area,pop,color,density
New York,141297,19651127,blue,139.076746


In [96]:
# Masking + column selection
data.loc[data.density > 100, ['color', 'density']]

Unnamed: 0,color,density
New York,blue,139.076746
Florida,red,114.806121


In [97]:
# let's look at the object
type(data.loc[data.density > 100, ['color', 'density']])

pandas.core.frame.DataFrame

In [98]:
# let's create a named slice
high_density = data.loc[data.density > 100, ['color', 'density']]
high_density

Unnamed: 0,color,density
New York,blue,139.076746
Florida,red,114.806121


In [99]:
# and change something
high_density.iloc[0,0] = "purple"
high_density

Unnamed: 0,color,density
New York,purple,139.076746
Florida,red,114.806121


In [100]:
# what about the original?
data

Unnamed: 0,area,pop,color,density
California,423967,38332521,blue,90.413926
Texas,695662,26448193,red,38.01874
New York,141297,19651127,blue,139.076746
Florida,170312,19552860,red,114.806121
Illinois,149995,12882135,purple,85.883763


Hmmmm ... So the slice is a copy and not a view.  What are the rules here (for Pandas slices)?
https://stackoverflow.com/questions/23296282/what-rules-does-pandas-use-to-generate-a-view-vs-a-copy

Please read the above link and post your answer to DF 5 for this week. 

## See Data Indexing and Selection Examples notebook for more examples of indexing and data selection.

## Handling Missing Data

In [101]:
data = pd.Series([1, np.nan, 'hello', None])
show(data)
data

  Type: <class 'pandas.core.series.Series'>
 Index: RangeIndex(start=0, stop=4, step=1)
 Shape: (4,)


0        1
1      NaN
2    hello
3     None
dtype: object

In [102]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [103]:
# masking
data[data.notnull()]
# this will get rid of the missing/null values

0        1
2    hello
dtype: object

In [104]:
# remove the missing data elements
data.dropna()

0        1
2    hello
dtype: object

In [105]:
# dropna() created a new Series object -- the original is unchanged
data

0        1
1      NaN
2    hello
3     None
dtype: object