In [1]:
import numpy as np
import pandas as pd
# Report the library versions that produced the outputs recorded in this notebook.
for label, module in (('NumPy version:  ', np), ('Pandas version: ', pd)):
    print(label, module.__version__)

NumPy version:   1.26.4
Pandas version:  2.2.1


## Data Indexing and Selection

### Data Selection in Series

In [2]:
# Build a Series whose explicit index is the string labels 'a'..'d'.
labels = ['a', 'b', 'c', 'd']
values = [0.25, 0.5, 0.75, 1.0]
data = pd.Series(values, index=labels)
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [3]:
data['b']

0.5

In [4]:
'c' in data

True

In [5]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [7]:
data.shape

(4,)

In [8]:
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [9]:
# slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [10]:
# slicing by implicit index
data[1:4]

b    0.50
c    0.75
d    1.00
dtype: float64

When slicing with an explicit index (e.g. `data['a':'c']`), the final index is included in the slice, whereas when slicing with an implicit index (e.g. `data[1:4]`), the final index is excluded from the slice.

In [11]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [12]:
# fancy indexing
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [13]:
# With an integer index, plain [] behaves differently for keys and for slices.
letters = ['a', 'b', 'c']
data = pd.Series(letters, index=[1, 3, 5])
print(data)

# A single integer key is looked up in the explicit index (label 1 -> 'a').
by_label = data[1]
print('data[1]: ', by_label)

# An integer slice uses the implicit, positional index (positions 1 and 2).
by_position = data[1:3]
print('data[1:3]:\n', by_position)

1    a
3    b
5    c
dtype: object
data[1]:  a
data[1:3]:
 3    b
5    c
dtype: object


In [14]:
# Indexer: loc always refers to the explicit (label) index
print(data.loc[1])
print(data.loc[1:5])  # label slice: the end label 5 is included

a
1    a
3    b
5    c
dtype: object


In [15]:
# Indexer: iloc always refers to the implicit (positional) index
print(data.iloc[1])
print(data.iloc[1:2])  # positional slice: the end position 2 is excluded

b
3    b
dtype: object


### Data Selection in DataFrame

In [16]:
# Two Series sharing the same state labels, combined column-wise into a DataFrame.
states = ['California', 'Texas', 'New York', 'Florida', 'Illinois']
area = pd.Series([423967, 695662, 141297, 170312, 149995], index=states)
pop = pd.Series([38332521, 26448193, 19651127, 19552860, 12882135], index=states)

data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [17]:
data.area is data['area']

True

In [18]:
data.pop is data['pop'] # Because DataFrame has a pop() method

False

In [19]:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [20]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [21]:
data.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [22]:
data.columns

Index(['area', 'pop', 'density'], dtype='object')

In [23]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [24]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [25]:
data.iloc[:3, :2] # Implicit index

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [26]:
data.loc[:'Illinois', :'pop']  # Explicit index

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [27]:
data.loc[data['density']>90,['area','pop']]

Unnamed: 0,area,pop
California,423967,38332521
New York,141297,19651127
Florida,170312,19552860


## Operations on Data in Pandas

In [2]:
# Seeded generator; every later random draw in this section comes from it.
rng = np.random.RandomState(42)

# Draw order matters: the Series values are drawn before the DataFrame values.
ser_values = rng.randint(0, 10, size=4)
df_values = rng.randint(0, 10, size=(3, 4))

ser = pd.Series(ser_values)
df = pd.DataFrame(df_values, columns=list('ABCD'))

In [3]:
ser

0    6
1    3
2    7
3    4
dtype: int32

In [4]:
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [7]:
# Ufunc: index preservation

np.sin(df*np.pi/4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [6]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [8]:
# Ufunc: index alignment
# The two Series cover different sets of states (only Texas and California overlap).

area_by_state = {'Alaska': 1723337, 'Texas': 695662, 'California': 423967}
population_by_state = {'California': 38332521, 'Texas': 26448193, 'New York': 19651127}

area = pd.Series(area_by_state, name='area')
population = pd.Series(population_by_state, name='population')

In [9]:
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [13]:
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [14]:
# Two Series whose integer labels only partially overlap (labels 1 and 2 are shared).
A = pd.Series({0: 2, 1: 4, 2: 6})
B = pd.Series({1: 1, 2: 3, 3: 5})

# Addition aligns on labels; a label present in only one operand yields NaN.
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [16]:
# Filling NaN values

A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [19]:
# Two frames with different shapes and differently ordered column labels.
# A's values are drawn from rng before B's, so the draw order is kept as-is.
a_values = rng.randint(0, 10, size=(3, 3))
b_values = rng.randint(0, 20, size=(4, 4))

A = pd.DataFrame(a_values, columns=['A', 'B', 'C'])
B = pd.DataFrame(b_values, columns=['B', 'A', 'C', 'D'])

In [20]:
A

Unnamed: 0,A,B,C
0,6,4,8
1,6,1,3
2,8,1,9


In [21]:
B

Unnamed: 0,B,A,C,D
0,8,1,19,14
1,6,11,7,14
2,2,13,16,3
3,17,7,3,1


In [22]:
A + B

Unnamed: 0,A,B,C,D
0,7.0,12.0,27.0,
1,17.0,7.0,10.0,
2,21.0,3.0,25.0,
3,,,,


In [23]:
fill = A.stack().mean()
fill

5.111111111111111

In [28]:
A.stack()

0  A    6
   B    4
   C    8
1  A    6
   B    1
   C    3
2  A    8
   B    1
   C    9
dtype: int32

In [29]:
A.add(B, fill_value = fill)

Unnamed: 0,A,B,C,D
0,7.0,12.0,27.0,19.111111
1,17.0,7.0,10.0,19.111111
2,21.0,3.0,25.0,8.111111
3,12.111111,22.111111,8.111111,6.111111


In [30]:
A = rng.randint(10, size=(3,4))
A

array([[5, 5, 9, 3],
       [5, 1, 9, 1],
       [9, 3, 7, 6]])

In [31]:
A - A[0]

array([[ 0,  0,  0,  0],
       [ 0, -4,  0, -2],
       [ 4, -2, -2,  3]])

In [32]:
df = pd.DataFrame(A, columns=list('QRST'))
df

Unnamed: 0,Q,R,S,T
0,5,5,9,3
1,5,1,9,1
2,9,3,7,6


In [33]:
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,-4,0,-2
2,4,-2,-2,3


In [34]:
df.subtract(df['R'], axis = 0)

Unnamed: 0,Q,R,S,T
0,0,0,4,-2
1,4,0,8,0
2,6,0,4,3


In [36]:
halfrow = df.iloc[0,::2]
halfrow

Q    5
S    9
Name: 0, dtype: int32

In [37]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,0.0,,0.0,
2,4.0,,-2.0,


## Handling Missing Data

Pandas represents missing data with sentinel values, reusing two null values that already exist in the Python ecosystem:
1. **None**: the Pythonic missing-data object
2. **NaN**: the special floating-point "not a number" value (`np.nan`)

In [38]:
# None is a Python object, so an array containing it gets dtype=object
# (see the repr displayed below).

values_with_none = [1, None, 2, 4]
vals1 = np.array(values_with_none)
vals1

array([1, None, 2, 4], dtype=object)

In [39]:
# Time the same sum over one million elements for an object-dtype array and an
# integer array (the recorded run shows the object version to be far slower).
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    # %timeit is an IPython line magic, so this cell only runs inside IPython/Jupyter.
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

dtype = object
155 ms ± 8.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
4.38 ms ± 186 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [40]:
# Summing an object array containing None fails: `+` is not defined between
# int and NoneType. The TypeError is the point of this cell, so catch it and
# print it; an uncaught exception would stop "Restart & Run All" here.
try:
    vals1.sum()
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [42]:
# Mixing np.nan (a float) with integers gives a float64 array, as shown below.
values_with_nan = [1, np.nan, 3, 4]
vals2 = np.array(values_with_nan)
vals2.dtype

dtype('float64')

In [43]:
1 + vals2

array([ 2., nan,  4.,  5.])

In [44]:
0 * vals2

array([ 0., nan,  0.,  0.])

In [45]:
np.sum(vals2), np.max(vals2), np.min(vals2)

(nan, nan, nan)

In [46]:
np.nansum(vals2), np.nanmax(vals2), np.nanmin(vals2)

(8.0, 4.0, 1.0)

In [47]:
pd.Series([1, np.nan, 3, None])

0    1.0
1    NaN
2    3.0
3    NaN
dtype: float64

### Operations on Null values

1. `isnull()`: generates a Boolean mask indicating missing values
2. `notnull()`: the opposite of `isnull()`
3. `dropna()`: returns a filtered version of the data
4. `fillna()`: returns a copy of the data with missing values filled or imputed

In [48]:
# Mixed-type Series containing both null markers (np.nan and None).
mixed_values = [1, np.nan, 4, None, 'hello']
data = pd.Series(mixed_values)

# Boolean mask: True wherever the entry is missing.
data.isnull()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [49]:
data[data.notnull()]

0        1
2        4
4    hello
dtype: object

In [50]:
data.dropna()

0        1
2        4
4    hello
dtype: object

In [51]:
# Small frame with one NaN in row 0 (column 1) and one in row 2 (column 0).
rows = [
    [1,      np.nan, 2],
    [2,      3,      5],
    [np.nan, 4,      6],
]
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [52]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [53]:
df.dropna(axis=1)

Unnamed: 0,2
0,2
1,5
2,6


In [54]:
# Numeric Series with two missing entries (np.nan and None), labelled 'a'..'e'.
labels = ['a', 'b', 'c', 'd', 'e']
raw_values = [1, np.nan, 2, None, 3]
data = pd.Series(raw_values, index=labels)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [55]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [56]:
# Impute the missing entries with the mean of the non-null values
# (Series.mean skips NaN by default).
data.fillna(data.mean())

a    1.0
b    2.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [58]:
# Forward fill
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [59]:
# Backward fill
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [60]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [61]:
df.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,1.0,2.0
1,2.0,3.0,5.0
2,,4.0,6.0


In [63]:
# Impute every missing cell with one scalar: the mean over all non-null cells of the frame.
overall_mean = np.nanmean(df.to_numpy())
df.fillna(overall_mean)

Unnamed: 0,0,1,2
0,1.0,3.285714,2
1,2.0,3.0,5
2,3.285714,4.0,6
