In [7]:
import pandas
pandas.__version__

'2.2.1'

In [49]:
import numpy as np
import pandas as pd


## The Three Fundamental Data Structures of Pandas
1. Series
2. DataFrame
3. Index

### Series Object
- Is a one dimensional array of indexed data. It can be created from a list or array
- Wraps a sequence of values and sequence of indices

In [33]:
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data
print(data.values) # Is simply a NumPy array
print(data.index)
print(data[1])
print(data[1:3])

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)
0.5
1    0.50
2    0.75
dtype: float64


In [34]:
# changing the index
data = pd.Series([0.25, 0.50, 0.75, 1.0],
                 index=['a','b','c','d'])
print(data)
print(data['c'])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.75


- Pandas Series are like a specialized dictionary
- unlike dictionaries we can use data indexing and slicing in a series

In [35]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [36]:
names_dict = {'Jdub' : 25,
              'Zach' : 23,
              'Colin' : 22,
              'Micheal' : 25}
name = pd.Series(names_dict)
print(name)
print(f"\n{name['Zach':'Micheal']}")

Jdub       25
Zach       23
Colin      22
Micheal    25
dtype: int64

Zach       23
Colin      22
Micheal    25
dtype: int64


In [37]:
pd.Series(7, index=[100, 200, 300])

100    7
200    7
300    7
dtype: int64

In [38]:
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

In [39]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3,2]) #Series is populated only with what is explicitly identified

3    c
2    a
dtype: object

### DataFrame

- An analog of a two-dimensional array with flexible row indices and flexible column names

In [40]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [41]:
states = pd.DataFrame({'population': population,
                       'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [42]:
states.columns

Index(['population', 'area'], dtype='object')

In [44]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [46]:
print(states['area'])
print(f"\n{states['population']}")

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64


In [47]:
pd.DataFrame(population, columns=['numbers'])

Unnamed: 0,numbers
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [48]:
data = [{'a':i, 'b':2*i}
        for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


### Indexing

## Data Indexing and Selection

## Operating on Data in Pandas

In [15]:
import pandas as pd
import numpy as np

rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
print(ser)

0    6
1    3
2    7
3    4
dtype: int32


In [16]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                  columns=['A','B','C','D'])
print(df)

   A  B  C  D
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4


In [20]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [23]:
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [33]:
area = pd.Series({'Alaska':172337, 'Texas': 695662, 
                  'California': 429967}, name='area')
population = pd.Series({'California':38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')

In [34]:
population / area

Alaska             NaN
California    89.15224
New York           NaN
Texas         38.01874
dtype: float64

In [36]:
area.index 

Index(['Alaska', 'Texas', 'California'], dtype='object')

In [37]:
population.index

Index(['California', 'Texas', 'New York'], dtype='object')

In [40]:
A = pd.Series([2,4,6], index=[0,1,2])
B = pd.Series([1,3,5], index=[1,2,3])
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [41]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [44]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

Unnamed: 0,A,B
0,1,11
1,5,1


In [45]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,4,0,9
1,5,8,0
2,9,2,6


In [46]:
A + B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [47]:
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


In [53]:
A = rng.randint(10, size=(3, 4))
A

array([[1, 9, 8, 9],
       [4, 1, 3, 6],
       [7, 2, 0, 3]])

In [54]:
A - A[0]

array([[ 0,  0,  0,  0],
       [ 3, -8, -5, -3],
       [ 6, -7, -8, -6]])

In [56]:
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,3,-8,-5,-3
2,6,-7,-8,-6


## Handling Missing Data

In [57]:
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [58]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [59]:
data.dropna()

0        1
2    hello
dtype: object

In [61]:
df = pd.DataFrame([[1,       np.nan, 2],
                   [2,       3,      5],
                   [np.nan,  4,      6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [62]:
# .dropna() will only drop for a full row or full column not single values for a Dataframe
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [63]:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [64]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [65]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [66]:
df.dropna(axis='rows', thresh=3)


Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [67]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [68]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [75]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [71]:
df.fillna(method='ffill', axis=1)

  df.fillna(method='ffill', axis=1)


Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [73]:
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [77]:
df.bfill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,2.0,
1,2.0,3.0,5.0,
2,4.0,4.0,6.0,


## Hierarchical Indexing