In [1]:
!pip install pandas



In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

def print_df(df: pd.DataFrame) -> None:
    """Render ``df`` as an HTML table in the notebook output area.

    The frame is converted to markup with ``DataFrame.to_html()``, wrapped in
    an ``HTML`` display object and handed to ``display_html``.

    Args:
        df: The DataFrame to display.

    Returns:
        None. The table is emitted as a display side effect.
    """
    # Import from the documented public namespace ``IPython.display`` rather
    # than the internal ``IPython.core.display`` implementation module.
    # Kept local so this helper remains the only place that needs IPython.
    from IPython.display import HTML, display_html

    display_html(HTML(df.to_html()))

# Show the NumPy and pandas versions this notebook is running against.
np.__version__, pd.__version__

('2.1.3', '2.2.3')

# 5.1 Introduction to pandas Data Structures

## Series

In [3]:
# Build a Series from a plain list. No index is given, so pandas assigns the
# default integer labels 0..3.
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
# .array exposes the underlying values; .index exposes the row labels.
obj.array, obj.index

(<NumpyExtensionArray>
 [np.int64(4), np.int64(7), np.int64(-5), np.int64(3)]
 Length: 4, dtype: int64,
 RangeIndex(start=0, stop=4, step=1))

In [None]:
# Supply explicit string labels through `index=` instead of relying on the
# default integer labels.
obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
obj2, obj2.index

(d    4
 b    7
 a   -5
 c    3
 dtype: int64,
 Index(['d', 'b', 'a', 'c'], dtype='object'))

In [7]:
# Select a single value by its index label.
obj2["a"]

np.int64(-5)

In [8]:
# Assign to one label in place, then select several labels with a list.
# The result follows the order of the list, not the order stored in obj2.
obj2["d"] = 6
obj2[["c", "a", "d"]]

c    3
a   -5
d    6
dtype: int64

In [9]:
# NumPy-style operations (boolean filtering, scalar multiplication, ufuncs
# such as np.exp) keep the index labels attached to the values.
obj2, obj2[obj2 > 0], obj2 * 2, np.exp(obj2)

(d    6
 b    7
 a   -5
 c    3
 dtype: int64,
 d    6
 b    7
 c    3
 dtype: int64,
 d    12
 b    14
 a   -10
 c     6
 dtype: int64,
 d     403.428793
 b    1096.633158
 a       0.006738
 c      20.085537
 dtype: float64)

In [None]:
# A Series behaves like a dict here: `in` tests membership against the index
# labels, not against the values.
obj2, "b" in obj2, "e" in obj2

(d    6
 b    7
 a   -5
 c    3
 dtype: int64,
 True,
 False)

In [11]:
# A dict converts directly to a Series (its keys become the index labels),
# and to_dict() converts the Series back to a dict.
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
obj3, obj3.to_dict()

(Ohio      35000
 Texas     71000
 Oregon    16000
 Utah       5000
 dtype: int64,
 {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000})

In [12]:
# Passing `index=` selects and orders the dict entries by label.
# "California" has no entry in sdata, so its value is NaN (and the dtype
# becomes float64); "Utah" is not listed in `states`, so it is left out.
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states) # pass index
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [13]:
# Detect missing (NA) values: pd.isna / pd.notna are the top-level functions,
# and Series.isna() is the method form of pd.isna.
pd.isna(obj4), pd.notna(obj4), obj4.isna()

(California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool,
 California    False
 Ohio           True
 Oregon         True
 Texas          True
 dtype: bool,
 California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool)

In [14]:
# Arithmetic aligns operands on their index labels. Labels present in only
# one operand ("Utah" in obj3, "California" in obj4) yield NaN in the result.
obj3, obj4, obj3 + obj4

(Ohio      35000
 Texas     71000
 Oregon    16000
 Utah       5000
 dtype: int64,
 California        NaN
 Ohio          35000.0
 Oregon        16000.0
 Texas         71000.0
 dtype: float64,
 California         NaN
 Ohio           70000.0
 Oregon         32000.0
 Texas         142000.0
 Utah               NaN
 dtype: float64)

In [15]:
# Both the Series and its index have a `name` attribute; once set, both
# names appear in the displayed output.
obj4.name = "population"
obj4.index.name = "state"
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [16]:
# Index assignment, step 1: show obj while it still has its default
# integer index.
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [17]:
# Replace the index in place by assigning a list of new labels
# (one label per element).
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

## DataFrame

In [18]:
# Build a DataFrame from a dict of equal-length Python lists or NumPy arrays;
# each dict key becomes a column.
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [None]:
# head() returns only the first five rows by default.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


: 

## Index Objects

# 5.2 Essential Functionality

# 5.3 Summarizing and Computing Descriptive Statistics