# Import pandas

In [5]:
# NumPy supplies the random sample data used further down; pandas is the
# library this notebook explores.
import numpy as np
import pandas as pd
# Display the installed pandas version (the recorded output is '2.2.3').
pd.__version__

'2.2.3'

In [4]:
# IPython-only help syntax: `name?` shows the object's docstring.
# Useful while exploring interactively; it is not plain Python and leaves no
# recorded output in the notebook.
pd?

# Pandas Series object

In [6]:
# Build a one-dimensional Series from a plain list. No index is supplied, so
# pandas attaches the default integer index 0..3.
data = pd.Series(data=[0.25, 0.50, 0.75, 1.00])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [7]:
# Extract the Series' data as a NumPy array. `to_numpy()` is the accessor the
# pandas documentation recommends over the older `.values` attribute; for this
# float64 Series the resulting array is the same.
data.to_numpy()

array([0.25, 0.5 , 0.75, 1.  ])

In [9]:
# Inspect the index object attached to the Series. Because no index was
# supplied at construction, the output is a RangeIndex from 0 up to (not
# including) 4.
data.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
# Fetch a single element with square brackets; the entry at index 1 is 0.5.
data[1]

0.5

In [11]:
# Slicing returns a Series rather than a scalar: the output keeps entries 1 and
# 2 together with their index labels (the stop bound 3 is excluded).
data[1:3]

1    0.50
2    0.75
dtype: float64

In [13]:
# Rebuild the Series with the same values, this time labelled with the strings
# 'a'..'d' instead of the default integer index.
data = pd.Series([0.25, 0.50, 0.75, 1.00], index=list('abcd'))

data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [14]:
# With string labels in place, items are fetched dictionary-style by label.
data['b']

0.5

In [18]:
# State populations keyed by state name (units are not stated in this notebook).
pop_dict = {'California': 383, 'Texas': 264, 'New York': 196}

# A dict maps directly onto a Series: keys become the index labels and values
# become the data, in insertion order.
pop = pd.Series(data=pop_dict)
pop

California    383
Texas         264
New York      196
dtype: int64

In [19]:
# Dictionary-style lookup of one value by its state-name label.
pop['California']

383

# Pandas DataFrame object

In [25]:
# State areas keyed by state name (units are not stated in this notebook).
area_dict = {'California': 423, 'Texas': 695, 'New York': 141}

# Keys become the index labels, values become the data.
area = pd.Series(data=area_dict)

# Combine the two Series into a DataFrame. Each keyword becomes a column name
# and the Series' shared state labels become the row index.
states = pd.DataFrame(dict(population=pop, area=area))

states

Unnamed: 0,population,area
California,383,423
Texas,264,695
New York,196,141


In [23]:
# Row labels of the DataFrame: the state names carried over from the Series.
states.index

Index(['California', 'Texas', 'New York'], dtype='object')

In [26]:
# Column labels of the DataFrame: the keys of the dict it was built from.
states.columns

Index(['population', 'area'], dtype='object')

In [27]:
# Indexing a DataFrame with a column name returns that column as a Series
# (note `Name: area` in the output).
states['area']

California    423
Texas         695
New York      141
Name: area, dtype: int64

In [29]:
# Build a DataFrame from a 3x2 NumPy array with explicit column and row labels.
# NOTE: np.random.rand is called without seeding the generator, so the values
# shown in the recorded output will differ on every run; seed NumPy first if
# reproducible output matters.
pd.DataFrame(
    np.random.rand(3, 2),
    columns=['foo', 'bar'],
    index=['a', 'b', 'c']
)

Unnamed: 0,foo,bar
a,0.470052,0.118108
b,0.032039,0.515715
c,0.209731,0.207394


# Data indexing and selection
## Series

In [32]:
# Read the current value at label 'b', overwrite it in place, then show the
# whole Series to confirm the change.
# NOTE: this cell mutates `data`, so it is not idempotent: running it a second
# time prints 1.25 on the first line instead of 0.5.
print(data['b'])
data['b'] = 1.25
print(data)

0.5
a    0.25
b    1.25
c    0.75
d    1.00
dtype: float64


In [33]:
# Slice by explicit labels: the end label 'c' is included in the result.
data['a':'c']

a    0.25
b    1.25
c    0.75
dtype: float64

In [34]:
# Slice by implicit integer position on the string-labelled Series: the stop
# position is excluded, so only 'a' and 'b' come back. `data.iloc[0:2]` states
# the positional intent explicitly.
data[0:2]

a    0.25
b    1.25
dtype: float64

In [35]:
# Boolean masking: keep only the entries strictly between 0.3 and 0.8.
data[(0.3 < data) & (data < 0.8)]

c    0.75
dtype: float64

In [37]:
# Fancy indexing: a list of labels selects exactly those entries.
data[['a', 'c']]

a    0.25
c    0.75
dtype: float64

In [40]:
# .loc interprets its keys as explicit index labels, so the slice includes the
# end label 'c'.
data.loc['a':'c']  # loc always uses the explicit (label) index

a    0.25
b    1.25
c    0.75
dtype: float64

In [45]:
# .iloc interprets its keys as implicit integer positions; 0:4 spans all four
# entries because the stop position is excluded.
data.iloc[0:4]  # iloc always uses the implicit (positional) index

a    0.25
b    1.25
c    0.75
d    1.00
dtype: float64

## DataFrames

In [50]:
# Dictionary-style column access: returns the 'area' column as a Series.
states['area']

California    423
Texas         695
New York      141
Name: area, dtype: int64

In [51]:
# Attribute-style access returns the same column here. It is fragile, though:
# it only works when the column name is a valid Python identifier that does not
# collide with an existing DataFrame attribute (for example `values` or `T`).
# Prefer states['area'].
states.area  # equivalent but possibly dangerous

California    423
Texas         695
New York      141
Name: area, dtype: int64

In [56]:
# Derive a new column: population divided by area, computed element-wise with
# rows matched on the state index.
states['density'] = states['population'].div(states['area'])
states

Unnamed: 0,population,area,density
California,383,423,0.905437
Texas,264,695,0.379856
New York,196,141,1.390071


In [58]:
# View the frame as a plain 2-D NumPy array. `to_numpy()` is the accessor the
# pandas documentation recommends over the older `.values` attribute. The
# integer and float columns are combined into one float array, as the recorded
# output shows.
states.to_numpy()

array([[3.83000000e+02, 4.23000000e+02, 9.05437352e-01],
       [2.64000000e+02, 6.95000000e+02, 3.79856115e-01],
       [1.96000000e+02, 1.41000000e+02, 1.39007092e+00]])

In [59]:
# Transpose the frame: states become the columns and the former columns
# (population, area, density) become the rows.
states.T

Unnamed: 0,California,Texas,New York
population,383.0,264.0,196.0
area,423.0,695.0,141.0
density,0.905437,0.379856,1.390071


In [60]:
# Positional selection: the first three rows and the first two columns.
states.iloc[:3, :2]

Unnamed: 0,population,area
California,383,423
Texas,264,695
New York,196,141


In [61]:
# Label-based selection: rows up to and including 'New York', columns up to and
# including 'area'. This yields the same sub-frame as the positional version.
states.loc[:'New York', :'area']  # equivalent with loc indexer

Unnamed: 0,population,area
California,383,423
Texas,264,695
New York,196,141


In [65]:
# Combine a boolean row mask with a list of column labels in one .loc call:
# keep the states whose density exceeds 0.5 and show only two columns.
# The mask uses states['density'] rather than states.density, because this
# notebook itself flags attribute-style column access as possibly dangerous.
states.loc[states['density'] > 0.5, ['population', 'density']]

Unnamed: 0,population,density
California,383,0.905437
New York,196,1.390071


In [69]:
# Assign through the positional indexer: row 0 / column 2 is California's
# density, so its previous value (0.905437) is replaced by 0.9.
states.iloc[0, 2] = 0.9
states

Unnamed: 0,population,area,density
California,383,423,0.9
Texas,264,695,0.379856
New York,196,141,1.390071
