# Getting started with pandas

In [1]:
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:
import numpy as np
# Seed NumPy's global random generator so any random output is repeatable across runs.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default size for every figure created in this notebook (figsize=(10, 6)).
plt.rc('figure', figsize=(10, 6))
# Remember the current display.max_rows value before it is overridden on the next line.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
# Set pandas' display.max_rows option to 20.
pd.options.display.max_rows = 20
# NumPy array printing: 4 digits of precision, scientific notation suppressed.
np.set_printoptions(precision=4, suppress=True)

## Introduction to pandas data structures

## Series

In [4]:
# Build a Series from a plain list. No index is supplied, so pandas assigns
# the default integer labels 0..3 (the left-hand column of the output).
obj = pd.Series(data=[4, 8, -5, 2])
obj

0    4
1    8
2   -5
3    2
dtype: int64

In [5]:
# The .values attribute exposes the Series' data as a NumPy array (see the array(...) output).
# NOTE(review): pandas docs recommend .to_numpy() or .array over .values in new code — confirm for the pandas version in use.
obj.values

array([ 4,  8, -5,  2])

In [6]:
# The .index attribute holds the row labels; with no explicit index this is a RangeIndex covering 0..3.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Same constructor, this time with explicit string labels: one label per
# value, paired up in order.
obj2 = pd.Series(
    data=[3, -2, 6, 88],
    index=['a', 'b', 'c', 'd'],
)
obj2

a     3
b    -2
c     6
d    88
dtype: int64

In [8]:
# The labels passed at construction come back as an Index of strings (dtype 'object').
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [19]:
# Custom labels do not change the data itself: .values is still a plain NumPy array.
obj2.values

array([ 3, -2,  6, 88])

In [9]:
# Show the pandas version that produced the outputs in this notebook.
pd.__version__

'1.0.3'

In [10]:
# Select a single value by its index label; the result is a scalar, not a Series.
obj2['b']

-2

In [11]:
# Select several values by passing a list of labels; the result is a Series
# whose rows follow the order of the requested labels.
obj2[['b', 'd', 'a', 'c']]

b    -2
d    88
a     3
c     6
dtype: int64

In [12]:
# Boolean filtering: `obj2 > 5` produces a boolean mask, and indexing with it
# keeps only the rows where the mask is True (labels are preserved).
obj2[obj2 > 5]

c     6
d    88
dtype: int64

In [13]:
# Scalar multiplication is applied element-wise; the index labels are preserved.
obj2 * 2

a      6
b     -4
c     12
d    176
dtype: int64

In [14]:
# NumPy functions accept a Series and return a Series with the same index (float64 here).
np.exp(obj2)

a    2.008554e+01
b    1.353353e-01
c    4.034288e+02
d    1.651636e+38
dtype: float64

In [15]:
# A Series can be used like a dict keyed by its index: `in` tests membership among the index labels.
'c' in obj2

True

In [16]:
# A label that is absent from the index gives False.
'k' in obj2

False

In [17]:
# A dict maps straight onto a Series: the keys become the index labels
# and the values become the data.
sdata = dict(Ohio=344, Texas=873, Utah=486)
obj3 = pd.Series(data=sdata)
obj3

Ohio     344
Texas    873
Utah     486
dtype: int64

In [18]:
# Supplying an explicit index alongside a dict picks out (and orders) the
# entries by label. Labels with no matching key in sdata come out as NaN,
# which is why the result is float64 rather than int64.
states = ['Cal', 'Ohio', 'NY']
obj4 = pd.Series(data=sdata, index=states)
obj4

Cal       NaN
Ohio    344.0
NY        NaN
dtype: float64

In [20]:
# pd.isnull flags missing values element-wise: True where the value is NaN.
pd.isnull(obj4)

Cal      True
Ohio    False
NY       True
dtype: bool

In [23]:
# Summing the boolean mask counts its True entries, i.e. the number of missing values.
pd.isnull(obj4).sum()

2

In [24]:
# pd.notnull is the complement of pd.isnull, so this counts the non-missing values.
pd.notnull(obj4).sum()

1

In [25]:
# Method form of pd.isnull(obj4); it produces the same boolean Series.
obj4.isnull()

Cal      True
Ohio    False
NY       True
dtype: bool

In [26]:
# obj (the first Series, built from a list of integers) has no missing values, so the count is 0.
obj.isnull().sum()

0

In [27]:
# Display obj again for reference.
obj

0    4
1    8
2   -5
3    2
dtype: int64

In [28]:
# Display obj4 again for comparison with the arithmetic result that follows.
obj4

Cal       NaN
Ohio    344.0
NY        NaN
dtype: float64

In [29]:
# Arithmetic aligns on index labels: the result covers the union of both
# indexes, and any label that is missing or NaN on either side yields NaN.
# Only 'Ohio' has a real value in both operands.
obj3 + obj4

Cal        NaN
NY         NaN
Ohio     688.0
Texas      NaN
Utah       NaN
dtype: float64

In [30]:
# Give the Series itself a name; it appears as "Name: population" when the Series is displayed.
obj4.name = 'population'

In [31]:
# Name the index as well; it is printed as a header line ("state") above the labels.
obj4.index.name = 'state'

In [32]:
# Display obj4 to see both the Series name and the index name in its output.
obj4

state
Cal       NaN
Ohio    344.0
NY        NaN
Name: population, dtype: float64

In [33]:
# Display obj with its default integer index.
obj

0    4
1    8
2   -5
3    2
dtype: int64

In [35]:
# Assigning to .index replaces the labels in place; the values keep their positions.
# NOTE: this mutates obj, so the outputs of earlier cells that displayed obj
# no longer reflect its current state.
obj.index = ['b', 's', 'j', 'r']
obj

b    4
s    8
j   -5
r    2
dtype: int64