In [1]:
import numpy as np
import pandas as pd     


In [None]:
#Pandas Objects:

#Series: one-dimensional array of indexed data. Can be created from a list or array as follows:

data = pd.Series([0.25, 0.5 , 0.75 , 1.0])
data

#series combines a sequence of values with an explicit sequence of indices, which we can access with the values and index attributes. The values are simply a familiar NumPy array:

#left hand side: index. right hand side: values


RangeIndex(start=0, stop=4, step=1)

In [11]:
data.index
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
dataarray=np.array([0.25,0.5,0.5,1.0])
dataarray

array([0.25, 0.5 , 0.5 , 1.  ])

In [14]:
#just like in numpy array, data can be accessed by the associated index via the familiar python suare-braket notation:

data[0]
data[1:3]

1    0.50
2    0.75
dtype: float64

In [16]:
#difference: numpy array has implicitly defined integer index to access values, but pandas is explicityly defined index

#therefore, the explicit index definition gives series object additional capabilities like values consisting of any desired type

data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [18]:
data['b']

np.float64(0.5)

In [24]:
#we can even use noncontinuous or nonsequential indices:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                    index=[2,5,3,7])
data[5]

np.float64(0.5)

In [None]:
#creating series from dictionary:

population_dict = {'California': 395, 'Texas': 400,'Florida': 200,'New York': 2333,'Pennsylvania':39}
population = pd.Series(population_dict)
population

#CALLING PD.SERIES AS FUNCTION AND POPULATION_DICT AS OBJECT WHEN CREATING SERIES OFF OF THE DICTIONARY

California       395
Texas            400
Florida          200
New York        2333
Pennsylvania      39
dtype: int64

In [31]:
#from here, typical dictionary-style item access can be performed
population['California']

np.int64(395)

In [32]:
#unlike dictionary, the series supports array-style operations such as slicing:
population['California':'Florida']

California    395
Texas         400
Florida       200
dtype: int64

In [37]:
#CONSTRUCTING SERIES OBJECTS

#pd.Series(data, index=index)

#where index is optional argument, and data can be one of many entities
#ex: data can be a list, NumPy array, in which case index defaults to an integer sequence:

pd.Series([2,4,6])

#or data can be scalar, which is repeated to fill the specified index:

pd.Series(5, index=[100,200,300])

#or it an be a dictionary, in which case index will default to dictionary keys:
pd.Series({2:'a',1:'b',3:'c'})

#in each case, the index can be explicitly set to control the order or the subset of keys used:

pd.Series({2:'a',1:'b',3:'c'}, index = [1,2])

1    b
2    a
dtype: object

In [38]:
#dataframes:

area_dict = {'California': 423967, 'Texas': 695662, 'Florida': 170312,
             'New York': 141297, 'Pennsylvania': 119280}
area = pd.Series(area_dict)
area

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
dtype: int64

In [41]:
#now that we have this along with the population series from before, we can use the dictionary to constuct a single two-dimensional object containing this information:

states = pd.DataFrame({'population': population, 'area':area})
states

Unnamed: 0,population,area
California,395,423967
Texas,400,695662
Florida,200,170312
New York,2333,141297
Pennsylvania,39,119280


In [42]:
#like the series object, the dataframe has an index attribute that gives accss to the index labels:
states.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [43]:
#additionally, the dataframe has a columns attribute, which is an index object holding the column labels
states.columns

Index(['population', 'area'], dtype='object')

In [44]:
#instead of a series mapping key to a value (like a dictionary), dataframe maps a column name to a series of column data. for example: referring to the 'area' attribut returns the series object containing the areas we saw earlier
states['area']

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [None]:
#distinguish: in a two-dimension numpy array, data[0] will return the first row. For a dataframe, data['col0'] will return the first columns. Because of this it's probably better to think about dataframes as generalized dictionaries rather than generalized arrays.


In [None]:
#contstructing dataframe objects:
#from a single series object:

pd.DataFrame(population,columns=['population'])

#the columns specification here names the column

Unnamed: 0,population
California,395
Texas,400
Florida,200
New York,2333
Pennsylvania,39


In [51]:
#making a datarame from a list of dics:

#simple list comprehension to create some data:

data = [{'a':i,'b':2*i}
        for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [53]:
#if some keys in the dictionary are missing, pandas will fill them in with NaN values 
pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [None]:
#From a dictionary of series objects:
pd.DataFrame({'population':population,'area':area})

#will create columns

Unnamed: 0,population,area
California,395,423967
Texas,400,695662
Florida,200,170312
New York,2333,141297
Pennsylvania,39,119280


In [56]:
#From a two-dimensional NumPy Array:
#given a two-dimensional array of data, we can create a DataFrame with any specified column and index names. If omitted, an integer index will be used foreach:

pd.DataFrame(np.random.rand(3,2),
            columns=['foo','bar'],
            index=['a','b','c'])

Unnamed: 0,foo,bar
a,0.431027,0.001714
b,0.446944,0.427983
c,0.516583,0.81148


In [None]:
#From a NumPy structued array: reference notes

In [57]:
#Pandas Index Object:

ind = pd.Index([2,3,5,7,11])
ind

Index([2, 3, 5, 7, 11], dtype='int64')

In [58]:
#we can use standard python indexing notation to retrieve values or slices

ind[1]

np.int64(3)

In [59]:
ind[::2]

Index([2, 5, 11], dtype='int64')

In [60]:
#index objects also have many of the attributes similar from NumPy arrays:
print(ind.size,ind.shape,ind.ndim,ind.dtype)

5 (5,) 1 int64


In [61]:
#one difference between Index obects and NumPy arrays is that the indices are immutable - that is, they cannot be modified via the normal means:
ind[1]=0

TypeError: Index does not support mutable operations

In [66]:
#Index as Ordered Set

#pandas objects are deisgned to facilitate operations such as joins across datasets, which depend on many aspects of set arithmetic. The Index object follows many of the conventions used by python's built-in set data structure, so that unions, intersections, differences and other combinations can be computed in a familiar way:

indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [None]:
indA.intersection(indB)

#intersection: to filter only ones in both

Index([3, 5, 7], dtype='int64')

In [68]:
indA.union(indB)

Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [72]:
indA.symmetric_difference(indB)

Index([1, 2, 9, 11], dtype='int64')

In [69]:
dir(indA)


['T',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pandas_priority__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__sub__',
 '__subclass

In [71]:
help(indA.unique)


Help on method unique in module pandas.core.indexes.base:

unique(level: 'Hashable | None' = None) -> 'Self' method of pandas.core.indexes.base.Index instance
    Return unique values in the index.

    Unique values are returned in order of appearance, this does NOT sort.

    Parameters
    ----------
    level : int or hashable, optional
        Only return values from specified level (for MultiIndex).
        If int, gets the level by integer position, else by level name.

    Returns
    -------
    Index

    See Also
    --------
    unique : Numpy array of unique values in that column.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> idx = pd.Index([1, 1, 2, 3, 3])
    >>> idx.unique()
    Index([1, 2, 3], dtype='int64')



In [76]:
#Data Selection in Series:
#a series object acts in many ways like a one-dimensional NumPy array, and in many ways like a standard Python dictionary. 

#Series as Dictionary: like a dictionary, the Series object provides a mapping from a collection of keys to a collection of values:

import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [78]:
data['b']

np.float64(0.5)

In [80]:
#we can also use dictionary-like Python expressions and methods to examine the keys/indices and values:
'a' in data

True

In [81]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [83]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [84]:
#Series objects can also be modified with a dictionary-like syntax. Just as you can extend a dictionary by assigning to a new key, you can extend a Series by assigning to a new index value.

data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [86]:
#series as a one-dimensional array
#a series builds on this dictionary-like interface and provides array-style item selection via the same basic mechanisms as NumPy arrays - that is, slices, masking, and fancing indexing:


#slicing by explicit index:
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [87]:
#sliving by implicit integer index:
data[0:2]

a    0.25
b    0.50
dtype: float64

In [89]:
#masking:
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [None]:
#fancy indexing:
data[['a','e']]

#of these, slicing may be the source of most confusion. When slicing an index, the final index is included in the slice - but when slicing with an implicit index, the final index is excluded from the slice.

a    0.25
e    1.25
dtype: float64

In [91]:
#Indexers: loc & iloc

#If your series has an explicit integer index, and indexing operating such as data[1] will use the explicit indices, while a slicing operation like data[1:3] will use the implicit Python-style indices:

data = pd.Series(['a','b','c'],index=[1,2,5])
data

1    a
2    b
5    c
dtype: object

In [93]:
#explicit index when indexing:
data[1]

'a'

In [94]:
#implicit index when slicing:
data[1:3]

2    b
5    c
dtype: object

In [104]:
#because of this potential confusion in the case of integer indexes, Pandas provides some special indexer attributes that explicitly expose certain indexing schemes. These are not functional methods, but attributes that expose a particular slicing interface to the data in the Series.

#First, the loc attribute allows indexing and slicing that always references the explicit index:

data.loc[1]
data.loc[1:3]

1    a
2    b
dtype: object

In [103]:
#the iloc attribute allows indexing and slicing that always referencing the implicit Python-style index:
data.iloc[1]
data.iloc[1:3]

2    b
5    c
dtype: object

In [4]:
import pandas as pd 
#Data Selection in DataFrames

#DataFrame as a Dictionary:
#the first analogy we will consider is the DataFrame as a dictionary of related Series objects:

area = pd.Series({'California': 423967, 'Texas': 695662,
                  'Florida': 170312, 'New York': 141297,
                  'Pennsylvania': 119280})
pop = pd.Series({'California': 39538223, 'Texas': 29145505,
                 'Florida': 21538187, 'New York': 20201249,
                 'Pennsylvania': 13002700})

data=pd.DataFrame({'area':area,'pop':pop})
data

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187
New York,141297,20201249
Pennsylvania,119280,13002700


In [5]:
#the individual Series that make up the columns of the DataFrame can be accessed via dictionary-style indexing of the column name:
data['area']

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [6]:
#equivalently, we can use attribute-style access with column names that are strings:
data.area

#although this column shorthand is useful, it doesn't work for all cases - for example if column name clashes with a DataFrame method or if column names are not strings

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [8]:
#like with the series object discussed earlier, we can manipulate/modify the object using dictionary-style syntax, in this case adding a new column

data['density'] = data['pop']/data['area']
data

Unnamed: 0,area,pop,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [9]:
#to view the DataDrame as an enhanced two-dimensional array, we can examine raw & underling data using the #values attribute:

data.values

array([[4.23967000e+05, 3.95382230e+07, 9.32577842e+01],
       [6.95662000e+05, 2.91455050e+07, 4.18960717e+01],
       [1.70312000e+05, 2.15381870e+07, 1.26463121e+02],
       [1.41297000e+05, 2.02012490e+07, 1.42970120e+02],
       [1.19280000e+05, 1.30027000e+07, 1.09009893e+02]])

In [10]:
#with this picture in mind, many familiar array-like operations can be done on the DataFrame itself. For example, we can transpose the full DatrFrame to swap rows and columns:
data.T

Unnamed: 0,California,Texas,Florida,New York,Pennsylvania
area,423967.0,695662.0,170312.0,141297.0,119280.0
pop,39538220.0,29145500.0,21538190.0,20201250.0,13002700.0
density,93.25778,41.89607,126.4631,142.9701,109.0099


In [13]:
#When it comes to indexing of a DataFrame object however, it's clear the dictionary-style indexing of columns precludes our ability to simply treat is as a NumPy array. In particular, passing a single index to an array accesses a row:
data.values[0] #row
data['area'] #column

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [15]:
#Thus, for array-style indexing, we need another convention. Here, Pandas again uses the loc and iloc indexers mentioned earlier. Using the iloc indexer, we can index the underlying array as if it were a simple NumPy array (using the implicit Python-style index), but the DataFrame index and column labels are maintained in the result:

data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187


In [16]:
#similarly, using the loc indexer we can index the underlying data in an array-like style but using the explicit index and column names
data.loc[:'Florida',:'pop']

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187


In [17]:
#Any o fthese indexing conventions may be also used to set or modify values; this is done in the standard way that you. might be accustomed to from working with NumPy:
data.iloc[0,2]=90
data

Unnamed: 0,area,pop,density
California,423967,39538223,90.0
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [19]:
#Additional indexing conventions:

#while indexing refers to columns, slicing refers to rows:
data['Florida':'New York']

Unnamed: 0,area,pop,density
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


In [20]:
#Such slices can also refer to rows by number rather than by index:
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121


In [22]:
#Similarly, direct masking operatings are interpreted row-wise rather than column-wise:
data[data.density > 120]

Unnamed: 0,area,pop,density
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


In [25]:
#Operating on Data in Pandas:
import numpy as np
import pandas as pd
#UFuncs: index preservation

rng = np.random.default_rng(42)
ser = pd.Series(rng.integers(0,10,4))
ser

0    0
1    7
2    6
3    4
dtype: int64

In [27]:
df = pd.DataFrame(rng.integers(0, 10, (3, 4)),
                  columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,4,8,0,6
1,2,0,5,9
2,7,7,7,7


In [28]:
#if we apply a NumPy ufunc on either of these objects, the results will be another Pandas object with the indices preserved:
np.exp(ser)

0       1.000000
1    1096.633158
2     403.428793
3      54.598150
dtype: float64

In [29]:
#Index alignment in Series:
#Suppose we are combining two different data sources and wish to find only the top three US states by area and top 3 by population:

area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 39538223, 'Texas': 29145505,
                        'Florida': 21538187}, name='population')

In [30]:
#let's see what happens when we divide these to compute the population density:
population/area

Alaska              NaN
California    93.257784
Florida             NaN
Texas         41.896072
dtype: float64

In [31]:
#resulting array contains the union of indices of the two input arrays:
area.index.union(population.index)

Index(['Alaska', 'California', 'Florida', 'Texas'], dtype='object')

In [34]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [37]:
#if we want to replace the NaNs:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

The following table lists Python operators and their equivalent Pandas object methods:

| Python operator | Pandas method(s)                |
|-----------------|---------------------------------|
| `+`             | `add`                           |
| `-`             | `sub`, `subtract`               |
| `*`             | `mul`, `multiply`               |
| `/`             | `truediv`, `div`, `divide`      |
| `//`            | `floordiv`                      |
| `%`             | `mod`                           |
| `**`            | `pow`                           |


## Operating on Null Values

As we have seen, Pandas treats `None`, `NaN`, and `NA` as essentially interchangeable for indicating missing or null values.
To facilitate this convention, Pandas provides several methods for detecting, removing, and replacing null values in Pandas data structures.
They are:

- ``isnull``: Generates a Boolean mask indicating missing values
- ``notnull``: Opposite of ``isnull``
- ``dropna``: Returns a filtered version of the data
- ``fillna``: Returns a copy of the data with missing values filled or imputed

We will conclude this chapter with a brief exploration and demonstration of these routines.

In [41]:
#Detecting Null values:

data =pd.Series([1,np.nan,'hello',None])


In [44]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [48]:
#boolean mask that only returns the non-null data

data[data.notnull()]

0        1
2    hello
dtype: object

In [49]:
#dropping null values:
data.dropna()

0        1
2    hello
dtype: object

In [50]:
#for a dataframe, there are more options:

df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [52]:
#IMPORTANT: By default, dropna for pandas will drop all rows in which any null value is present:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [54]:
#alternitively, you can drop NA values along a different axis. Using axis=1 or axis='columns' drops all columns containing a null value
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [56]:
#but this drops some good data as well: you might rather be interested in dropping rows or columns with all NA values, or a majority of NA values: this can be specified trough the how or thresh parameters, which allow fine control of the number of nulls to allow through.

#The default is how='any', such that any row or column containing a null value will be dropped. You can also specify how='all', which will only drop rows/columns that contain all null values. 
df[3]=np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [57]:
df.dropna(axis='columns',how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [59]:
#for finer-grained control, the thresh parameter lets you specify a minimum. number of non-null values for the row/column to be kept:
df.dropna(axis='rows',thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [60]:
#Filling Null Values:
#sometimes rather than dropping NA values, you'd like to replace them with valid value. 

data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'),dtype='Int32')

In [61]:
#you an fill NA entires with a single value, such as 0:
data.fillna(0)

a    1
b    0
c    2
d    0
e    3
dtype: Int32

In [66]:
#or, we can specify a forward fill to propogate the previou value forward:
#forward fill:
data.fillna(method='ffill')
data.fillna(method='bfill')

  data.fillna(method='ffill')
  data.fillna(method='bfill')


a    1
b    2
c    2
d    3
e    3
dtype: Int32

In [78]:
# Multiple Index Series:
#how we might represent two-dimensional data within a one-dimensional Series:
#for concreteness, we will consider a series of data where each point has a chatacter and a numerical key:

#the bad way:
index = [('California', 2010), ('California', 2020),
         ('New York', 2010), ('New York', 2020),
         ('Texas', 2010), ('Texas', 2020)]
populations = [37253956, 39538223,
               19378102, 20201249,
               25145561, 29145505]
pop = pd.Series(populations,index=index)
pop

(California, 2010)    37253956
(California, 2020)    39538223
(New York, 2010)      19378102
(New York, 2020)      20201249
(Texas, 2010)         25145561
(Texas, 2020)         29145505
dtype: int64

In [68]:
#With this indexing scheme, you can straightforwardly index or slice the series based on this tuple index:
pop[('California', 2020):('Texas', 2010)]

(California, 2020)    39538223
(New York, 2010)      19378102
(New York, 2020)      20201249
(Texas, 2010)         25145561
dtype: int64

In [71]:
#but the convenience ends there. For example, if you need to select all values from 2010, you'll need to do some messy munging to make it happen:
pop[[1 for i in pop.index if i[1] == 2010]]

#run through row index items in pop and return if position 1 (aka, position 2 equals 2010)

  pop[[1 for i in pop.index if i[1] == 2010]]


(California, 2020)    39538223
(California, 2020)    39538223
(California, 2020)    39538223
dtype: int64

In [82]:
#the better way: the pandas multi-index:
#fortunately, Pandas provides a better way: Pandas MultiIndex type gives us the type of operations we wish to have.
index = pd.MultiIndex.from_tuples(index)

In [83]:
#if we re-index our series with multiindex, we see the hierarchical representation of the data:

pop = pop.reindex(index)
pop

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [84]:
#Now, access to all data for which the second index is 2020, we can use the Pandas slicing notation:
pop[:,2020]

California    39538223
New York      20201249
Texas         29145505
dtype: int64

In [85]:
#Unstack method will quickly convert a multiple indexed Series into a conventionally indexed DataFrame

pop_df = pop.unstack()
pop_df

Unnamed: 0,2010,2020
California,37253956,39538223
New York,19378102,20201249
Texas,25145561,29145505


In [86]:
#the stack method provides the opposite operation
pop_df.stack()

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [87]:
#Concretely, we might want to add another column of demographic data for each state at each year (say, population under 18); with a `MultiIndex` this is as easy as adding another column to the ``DataFrame``:

pop_df = pd.DataFrame({'total': pop,
                       'under18': [9284094, 8898092,
                                   4318033, 4181528,
                                   6879014, 7432474]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2010,37253956,9284094
California,2020,39538223,8898092
New York,2010,19378102,4318033
New York,2020,20201249,4181528
Texas,2010,25145561,6879014
Texas,2020,29145505,7432474


In [90]:
#ufuncs work with multi-hierarchial as well:

f_u18 = pop_df['under18']/pop_df['total']
f_u18.unstack()

Unnamed: 0,2010,2020
California,0.249211,0.22505
New York,0.222831,0.206994
Texas,0.273568,0.255013


In [94]:
#Methods of multi-index creation:

#the most straightforward way to construct a multiply indexed Series of DF is to simply pass a list of two or more index arrays into the constuctor.

df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.802494,0.198076
a,2,0.743904,0.10767
b,1,0.256644,0.256825
b,2,0.130592,0.552531


In [95]:
data = {('California', 2010): 37253956,
        ('California', 2020): 39538223,
        ('New York', 2010): 19378102,
        ('New York', 2020): 20201249,
        ('Texas', 2010): 25145561,
        ('Texas', 2020): 29145505}
pd.Series(data)

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [96]:
#above were implicit multiindex constructors. For more flexibility in how the index is constructed, use methods available in pd.MultiIndex class. 

#for arrays:
pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [97]:
#or, you can construct from a list of tuples giving the multiple index values of each point:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [100]:
#you can even construct from a Cartesian product of single indeces:
pd.MultiIndex.from_product([['a','b'],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [102]:
#Multiindex level names: sometimes convenient to name levels, can be accomplished passing the names argument to any.  othe previously discussed MultiIndex constructos

pop.index.names =  ['state','year']
pop

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [105]:
#MultiIndex for columns: In a dataframe, the rows and columns are symmetric but the columns can  have multiple levels as well. Consider this mockup of some (somwhat realistic) medical data:

index = pd.MultiIndex.from_product([[2013,2014],[1,2]],
                                names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob','Guido','Sue'],['HR','Temp']],
                                        names=['subject','type'])

#mock some data:
data = np.round(np.random.randn(4,6), 1)
data[:,::2] *= 10
data += 37

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,37.7,36.0,37.6,52.0,36.6
2013,2,51.0,38.0,37.0,37.7,43.0,37.3
2014,1,38.0,36.2,22.0,38.1,45.0,37.8
2014,2,35.0,37.9,49.0,36.9,44.0,37.4


In [106]:
#with this in plac we can, for example, index the top-level column by the person's name and get a full DataFrame containing just that person's information:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,36.0,37.6
2013,2,37.0,37.7
2014,1,22.0,38.1
2014,2,49.0,36.9


In [110]:
#Indexing & Slicing a Multiindex:

#We can access single elements by indexing with multiple terms:

pop['California',2010]

pop['California'] #also support partial indexing, indexing just a level in the index. The result is a series

year
2010    37253956
2020    39538223
dtype: int64

In [111]:
#partial slicing is available as well: as long as the MultiIndex is sorted:
pop.loc['California':'New York']

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
dtype: int64

In [112]:
#With sorted indices, partial indexing can be performed on lower levels by passing an empty slice in the first index:
pop[:, 2010]

state
California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [113]:
#Other types of indexing and selection work as well, for example, selection based on Boolean masks:
pop[pop>22000000]

state       year
California  2010    37253956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64

In [114]:
#selection based on fancy indexing also works:
pop[['California','Texas']]

state       year
California  2010    37253956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64

In [118]:
#Multiply Indexed DataFra
#remembr that columns are primary in a datarame, and that syntax used for multiply indexed series applies to columns. We can reover Guido's heart rate data with a simple operation:
health_data['Guido','HR']

year  visit
2013  1        36.0
      2        37.0
2014  1        22.0
      2        49.0
Name: (Guido, HR), dtype: float64

In [119]:
#Concatting:

#signature in pandas:

pd.concat(objs, axis=0, join='outer', ignore_index=False, keys=None,
          levels=None, names=None, verify_integrity=False,
          sort=False, copy=True)

NameError: name 'objs' is not defined

In [120]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [121]:
#and on df:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')

NameError: name 'make_df' is not defined

In [122]:
#doing ignore_index = True will just  add index placement one top when merged

NameError: name 'make_df' is not defined

Delivering figures confidently:
#classification model: you worked on text2bql
#intent classifier that I annoted gets inputted as training data for text2bql, you understand that there is a prompt and associated sub asset class & workflow that then gets fed in and classifier can determine whether it is or is not (classified appropriately because it has that training data) that's a bloomberg experience - here's what I learned in classes 

#span annotation to identify entities - we look at the spans that are picked up and resolved by NED (indices, equities, fields)

In [None]:
sampling: random sampling when stratified sampling : how do you determine the stratas in your sample 
*biases, loans or credit cards: whether they should or should not gice ppl credit card or loans (biases/stratas is age for example - ppl that are younger make more financially irresponsible decisions)

ML: confusion matrix, accuracy, recall, precision, f1 
email spam filter: chatgpt this example 


balance of precision and recall to not miss emails but not get garbage 

Tell me about the firms' strategy for data quality, tell.  mabout dimensions of quality that are important and how it impacted a rpoject
#i took on this project, I had to timeliness, accuracy, completion . Solved that problem by breaking up the project into multipl projects. Anything we needed for reporting really fast, we have in the first project. Building out Gold BQL queries. Accuracy is more important for that step. 

#the math part of this interview: 
#calculate precision & recall
#statistics: define different kinds of sampling
#how would you determine size of each strata? proportional to the population
#how do you determine size of your sample: valid answer: what is statistically significant & what do we have the time or resources to do?
#what is statistically significant (correct): in practice, what is time resources, can we create data synthetically?

#median mean of disco balls off of one production line & the other production line: and then it says the sampling method used to check quality produced this distribution. It was bimodal. For that reason, it had to have been a stratified sample. bc you sampled proportionally from each. *check this*
#similar to the credit card stratification one
#confidence intervals
n+/- 1% confident 

#classification case study 

#also look up query enrichment definition


In [132]:
#PANDAS MERGE EXAMPLE:

pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')

display(pop.head(),areas.head(), abbrevs.head())


Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [133]:
#Rank US states and terriroties by their 2010 population density. We have the data here to find this result, but we'll have ot combine the datasets to do so


In [138]:
#start with many-to-one merge that will give us full state names within the population DataFrame. We want to merge based on the state/region column of pop and the abbreviation column of abbrevs. We'll useg how='outer', ot make sure no data is thrown away due to mismatched labels

merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')
merged = merged.drop('abbreviation', axis=1)
merged.head()

Unnamed: 0,state/region,ages,year,population,state
0,AK,total,1990,553290.0,Alaska
1,AK,under18,1990,177502.0,Alaska
2,AK,total,1992,588736.0,Alaska
3,AK,under18,1991,182180.0,Alaska
4,AK,under18,1992,184878.0,Alaska


In [139]:
#lets double-check whether there were any mismatches here, which we can do by looking for rows with nulls:
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [140]:
#some of the population values are null; let's figure out which these are!
merged[merged['population'].isnull()].head()

Unnamed: 0,state/region,ages,year,population,state
1872,PR,under18,1990,,
1873,PR,total,1990,,
1874,PR,total,1991,,
1875,PR,under18,1991,,
1876,PR,total,1993,,


In [141]:
merged.loc[merged['state'].isnull(),'state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [143]:
merged.loc[merged['state/region'] == 'PR','state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA','state'] = "United States"
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

In [148]:
final = pd.merge(merged,areas,on='state',how='left')
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [150]:
final.isnull().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [153]:
final['state'][final['area (sq. mi)'].isnull()].unique()

array(['United States'], dtype=object)

In [161]:
final.dropna(inplace=True)
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [162]:
data2010=final.query("year == 2010 & ages == 'total'")
data2010.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
43,AK,total,2010,713868.0,Alaska,656425.0
51,AL,total,2010,4785570.0,Alabama,52423.0
141,AR,total,2010,2922280.0,Arkansas,53182.0
149,AZ,total,2010,6408790.0,Arizona,114006.0
197,CA,total,2010,37333601.0,California,163707.0


In [165]:
#Now lets compute the population density and display it in order: re-index the data on the state & then compute the results:
#data2010.set_index('state',inplace=True)
data2010
density=data2010['population']/ data2010['area (sq. mi)']

In [166]:
density.sort_values(ascending=False,inplace=True)
density.head()

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [167]:
#the result is a ranking of US states, plus Washington DC and Puerto Rico, in order of their 2010 population density, in residents per square mile. 


In [None]:
#Aggregation/Grouping:

#df.describe()
#df.dropna().describe()


The following table summarizes some other built-in Pandas aggregations:

| Aggregation              | Returns                         |
|--------------------------|---------------------------------|
| ``count``                | Total number of items           |
| ``first``, ``last``      | First and last item             |
| ``mean``, ``median``     | Mean and median                 |
| ``min``, ``max``         | Minimum and maximum             |
| ``std``, ``var``         | Standard deviation and variance |
| ``mad``                  | Mean absolute deviation         |
| ``prod``                 | Product of all items            |
| ``sum``                  | Sum of all items                |

These are all methods of `DataFrame` and `Series` objects.

#these simple aggregates are used for just column & row operations. If you want specific, we'll have to use groupby

In [168]:
#split, apply, combine
#split steps involves breaking.  uand grouping a DataFrame depending on the valu of the specified key
#the apply step involves computing some function, usually and aggregate, transformation or filtering, wihtin the individual groups
#the combine step merges the results of these operations into an output array

df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])
df


Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [170]:
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [171]:
#aggregate, filter, transform, apply

rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [173]:
#### Aggregation

#You're now familiar with `GroupBy` aggregations with `sum`, `median`, and the like, but the `aggregate` method allows for even more flexibility.
#It can take a string, a function, or a list thereof, and compute all the aggregates at once.
#Here is a quick example combining all of these:
df.groupby('key').aggregate(['min',np.median,max])

  df.groupby('key').aggregate(['min',np.median,max])
  df.groupby('key').aggregate(['min',np.median,max])


Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [174]:
#Filtering: allows you to drop data based on the group properties. For ex, we might want to keep all groups in which the SD is larger than some critical value:

def filter_func(x):
    return x['data2'].std() > 4

display('df', "df.groupby('key').std()",
        "df.groupby('key').filter(filter_func)")

'df'

"df.groupby('key').std()"

"df.groupby('key').filter(filter_func)"

In [175]:
L = [0, 1, 0, 1, 2, 0]
df.groupby(L).sum()

Unnamed: 0,key,data1,data2
0,ACC,7,17
1,BA,4,3
2,B,4,7


In [176]:
df2 = df.set_index('key')
mapping = {'A':'vowel','B':'consonant','C':'consonant'}
display('df2','df2.groupby(mapping).sum()')

'df2'

'df2.groupby(mapping).sum()'

In [None]:
#Grouping example:
decade = 10 *(planets['year'] // 10)

NameError: name 'planets' is not defined

In [178]:
#pivot tables

import numpy as np
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')


In [179]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [182]:
titanic.groupby('sex')[['survived']].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [183]:
titanic.groupby('sex')[['survived']].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [185]:
  titanic.groupby(['sex','class'])['survived'].aggregate('mean').unstack()


  titanic.groupby(['sex','class'])['survived'].aggregate('mean').unstack()


class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [187]:
titanic.pivot_table('survived', index='sex',columns='class',aggfunc='mean')

  titanic.pivot_table('survived', index='sex',columns='class',aggfunc='mean')


class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [189]:
age = pd.cut(titanic['age'], [0,18,80])
titanic.pivot_table('survived',['sex',age],'class').unstack()

  titanic.pivot_table('survived',['sex',age],'class').unstack()


class,First,First,Second,Second,Third,Third
age,"(0, 18]","(18, 80]","(0, 18]","(18, 80]","(0, 18]","(18, 80]"
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,0.909091,0.972973,1.0,0.9,0.511628,0.423729
male,0.8,0.375,0.6,0.071429,0.215686,0.133663


### Additional Pivot Table Options

The full call signature of the `DataFrame.pivot_table` method is as follows:

```python
# call signature as of Pandas 1.3.5
DataFrame.pivot_table(data, values=None, index=None, columns=None,
                      aggfunc='mean', fill_value=None, margins=False,
                      dropna=True, margins_name='All', observed=False,
                      sort=True)
```

We've already seen examples of the first three arguments; here we'll take a quick look at some of the remaining ones.
Two of the options, `fill_value` and `dropna`, have to do with missing data and are fairly straightforward; I will not show examples of them here.

The `aggfunc` keyword controls what type of aggregation is applied, which is a mean by default.
As with `groupby`, the aggregation specification can be a string representing one of several common choices (`'sum'`, `'mean'`, `'count'`, `'min'`, `'max'`, etc.) or a function that implements an aggregation (e.g., `np.sum()`, `min()`, `sum()`, etc.).
Additionally, it can be specified as a dictionary mapping a column to any of the desired options:

In [193]:
#Example: Birthrate data 
url = "https://raw.githubusercontent.com/jakevdp/data-CDCbirths/master/births.csv"
births = pd.read_csv(url)

births.head()

Unnamed: 0,year,month,day,gender,births
0,1969,1,1.0,F,4046
1,1969,1,1.0,M,4440
2,1969,1,2.0,F,4454
3,1969,1,2.0,M,4548
4,1969,1,3.0,F,4548


In [None]:
births['decade'] = 10 * (births['year'] // 10)
births.pivot_table('births',index='decade',columns='gender',aggfunc='sum')

#births as values, female/male as columns, and decade as index (key)

gender,F,M
decade,Unnamed: 1_level_1,Unnamed: 2_level_1
1960,1753634,1846572
1970,16263075,17121550
1980,18310351,19243452
1990,19479454,20420553
2000,18229309,19106428


In [None]:
import matplotlib.pyplot as plt

In [201]:
plt.style.use('seaborn-whitegrid')
births.pivot_table(
    'births', index='year', columns='gender', aggfunc='sum').plot()
plt.ylabel('total births per year');

OSError: 'seaborn-whitegrid' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [202]:
x = np.array([2, 3, 5, 7, 11, 13])
x * 2

array([ 4,  6, 10, 14, 22, 26])

In [203]:
data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [204]:
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
[s if s is None else s.capitalize() for s in data]

['Peter', 'Paul', None, 'Mary', 'Guido']

In [205]:
#pandas includes features to address both this need for vectorized string operations and the need for correctly handling missing data via the str attribute of Pandas Series and Index objects containing strings. So, for example, if we create a Pandas Series with this data we can irectly call the str.capitalize method, which has missing value handling build in
names=pd.Series(data)
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [206]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])

### Methods Similar to Python String Methods

Nearly all of Python's built-in string methods are mirrored by a Pandas vectorized string method. Here is a list of Pandas `str` methods that mirror Python string methods:

|           |                |                |                |
|-----------|----------------|----------------|----------------|
|`len()`    | `lower()`      | `translate()`  | `islower()`    |
|`ljust()`  | `upper()`      | `startswith()` | `isupper()`    |
|`rjust()`  | `find()`       | `endswith()`   | `isnumeric()`  |
|`center()` | `rfind()`      | `isalnum()`    | `isdecimal()`  |
|`zfill()`  | `index()`      | `isalpha()`    | `split()`      |
|`strip()`  | `rindex()`     | `isdigit()`    | `rsplit()`     |
|`rstrip()` | `capitalize()` | `isspace()`    | `partition()`  |
|`lstrip()` | `swapcase()`   | `istitle()`    | `rpartition()` |

Notice that these have various return values. Some, like `lower`, return a series of strings:

In [None]:
monte.str.startswith('T')
monte.str.split("")

0    [Gra, am C, apman]
1        [Jo, n Cleese]
2       [Terry Gilliam]
3           [Eric Idle]
4         [Terry Jones]
5      [Mic, ael Palin]
dtype: object

### Methods Using Regular Expressions

In addition, there are several methods that accept regular expressions (regexps) to examine the content of each string element, and follow some of the API conventions of Python's built-in `re` module:

| Method    | Description |
|-----------|-------------|
| `match`   | Calls `re.match` on each element, returning a Boolean. |
| `extract` | Calls `re.match` on each element, returning matched groups as strings.|
| `findall` | Calls `re.findall` on each element |
| `replace` | Replaces occurrences of pattern with some other string|
| `contains`| Calls `re.search` on each element, returning a boolean |
| `count`   | Counts occurrences of pattern|
| `split`   | Equivalent to `str.split`, but accepts regexps |
| `rsplit`  | Equivalent to `str.rsplit`, but accepts regexps |

In [None]:
monte.str.extract('([A-Za-z]+)',expand=False)

#extracting first name from each element by asking for a continuous group of characters at the beginning of each element

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael


In [216]:
#find all names that start and end with a consonant, making use of the start-of-string (^) and end of string($)

monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

### Miscellaneous Methods
Finally, there are some miscellaneous methods that enable other convenient operations:

| Method | Description |
|--------|-------------|
| `get` | Indexes each element |
| `slice` | Slices each element|
| `slice_replace` | Replaces slice in each element with the passed value|
| `cat`      | Concatenates strings|
| `repeat` | Repeats values |
| `normalize` | Returns Unicode form of strings |
| `pad` | Adds whitespace to left, right, or both sides of strings|
| `wrap` | Splits long strings into lines with length less than a given width|
| `join` | Joins strings in each element of the `Series` with the passed separator|
| `get_dummies` | Extracts dummy variables as a `DataFrame` |

In [218]:
#vectorized item access and slicing: get and slice operations, 

monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [None]:
#indexing via df.str.get(i) and df.str[i] are likewise wimilar. Indexing methods let you access elements.  oarrays returns by split. 

In [None]:
monte.str.split().str[1]

#takes first element of split (in this case, only first name)
#But we can specify here what to split by

SyntaxError: invalid syntax (796543497.py, line 1)

In [None]:
from datetime import datetime
datetime(year=2021, month=7,day=4)

In [None]:
date.strftime('%A')