# Pandas Objects

### A Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Series is a one-dimensional labeled array capable of holding any data type.
# It can be built from a Python dict, an ndarray, or a scalar value (like 5).

In [3]:
# Build a Series from a dict: the keys become the index labels
mydict = dict(one=1, two=2, three=3)
s_dict = pd.Series(data=mydict)
s_dict

one      1
three    3
two      2
dtype: int64

In [4]:
# Build a Series from a tuple, supplying letters as the index labels
mytupl = (1, 2, 3)
s_tupl = pd.Series(data=mytupl, index=list('abc'))
s_tupl

a    1
b    2
c    3
dtype: int64

In [5]:
# Build a Series from a list, again supplying letters as the index labels
mylist = [1, 2, 3]
s_mylist = pd.Series(data=mylist, index=list('abc'))
s_mylist

a    1
b    2
c    3
dtype: int64

In [6]:
# Build a Series from a NumPy ndarray with explicit letter labels
s = pd.Series(
    data=np.array([1, 2, 3, 4, 5, 6]),
    index=list('abcdef'),
)
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [7]:
# the .index attribute returns the Series' axis labels as an Index object
s.index

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [8]:
# With no index argument, pandas assigns a default integer index (0, 1, 2, ...).
# The values are random draws; no seed is set in the visible cells, so the
# numbers shown will differ from run to run.
t = pd.Series(data=np.random.randn(5))
t

0    0.957482
1    1.344401
2    1.514676
3    0.167827
4   -0.750427
dtype: float64

##### Using a dict

In [13]:
# Using a dict: the keys become the axis labels.
# NOTE: older pandas sorted the labels by key, but pandas >= 0.23 running on
# Python >= 3.6 keeps the dict's insertion order ('a', 'c', 'b').
# sort_index() is called so the labels come out in key order on every version.
d = {'a':0., 'c':1., 'b':2.}
s_fromdict = pd.Series(d).sort_index()
s_fromdict

a    0.0
b    2.0
c    1.0
dtype: float64

In [14]:
# Passing an explicit index (a list of labels) dictates the row order.
# A label with no matching dict key ('d' here) gets NaN (Not a Number).
s_fromdict_ordered = pd.Series(data=d, index=list('bcda'))
s_fromdict_ordered

b    2.0
c    1.0
d    NaN
a    0.0
dtype: float64

##### Series using a scalar

In [15]:
# A scalar is a single value (here the float 5.0).
# It is repeated once for every label in the index.
s_fromscalar = pd.Series(data=5., index=list('abcde'))
s_fromscalar

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

##### Series is ndarray-like

In [16]:
# redisplay `s`, the Series built earlier from an ndarray
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [17]:
# Access the first element by position.
# .iloc makes the positional lookup explicit: a plain integer key such as
# s[0] on a label-indexed Series is deprecated (FutureWarning since
# pandas 2.1) because integer keys are going to be treated as labels.
s.iloc[0]

1

In [18]:
# Positional slicing works as on an ndarray: take the first three elements
s.iloc[:3]

a    1
b    2
c    3
dtype: int64

In [19]:
# Boolean-mask selection: the comparison yields a True/False Series, and
# indexing with it keeps only the elements greater than the median.
# note, s.median() is 3.5
above_median = s > s.median()
s[above_median]

d    4
e    5
f    6
dtype: int64

In [20]:
# A list of integer positions retrieves those elements in the listed order.
# .iloc is used because integer keys passed to plain s[...] on a
# label-indexed Series are deprecated as positional lookups
# (FutureWarning since pandas 2.1).
s.iloc[[4, 3, 1]]

e    5
d    4
b    2
dtype: int64

In [21]:
# np.exp applies the exponential function e**x to every element
# (e is Euler's number, approximately 2.718); the labels are kept
np.exp(s)

a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
f    403.428793
dtype: float64

##### Series is dict-like

In [22]:
# dict-like lookup by label on `s`, the Series built from an ndarray
s['a']

1

In [23]:
# `in` tests membership among the index labels, like keys of a dict
'e' in s

True

In [24]:
# 'z' is not one of the index labels
'z' in s

False

In [25]:
# .get() on a missing label returns None, so the cell displays nothing
s.get('x')

In [26]:
# the second argument to .get() is returned when the label is missing
s.get('x', np.nan)

nan

##### vectorized operations and label alignment with Series

In [27]:
# redisplay `s`, the Series built earlier from an ndarray
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [28]:
# element-wise addition of two Series that share the same labels
s + s

a     2
b     4
c     6
d     8
e    10
f    12
dtype: int64

In [29]:
# multiplying by a scalar applies to every element
s * 2

a     2
b     4
c     6
d     8
e    10
f    12
dtype: int64

In [30]:
# NumPy functions such as np.exp also work element-wise and keep the labels
np.exp(s)

a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
f    403.428793
dtype: float64

In [31]:
# redisplay `s`, the original Series built from an ndarray
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [32]:
# Slice and add: the operands are aligned on their labels, not positions.
# 'a' and 'f' each appear in only one of the slices, so their result is NaN,
# and the result dtype becomes float64.
s[1:] + s[:-1]

a     NaN
b     4.0
c     6.0
d     8.0
e    10.0
f     NaN
dtype: float64

In [33]:
# dropna() removes the NaN entries produced by the label alignment
p = s[1:]+s[:-1]
p.dropna()

b     4.0
c     6.0
d     8.0
e    10.0
dtype: float64

In [34]:
# A Series can carry a name, supplied through the `name` argument
q = pd.Series(data=np.random.randn(5), name='something')
q

0    0.861288
1   -0.795209
2    1.517148
3    0.322677
4   -1.062977
Name: something, dtype: float64

In [35]:
# rename() with a scalar argument gives a Series carrying the new name
q2 = q.rename('different')
q2

0    0.861288
1   -0.795209
2    1.517148
3    0.322677
4   -1.062977
Name: different, dtype: float64

In [36]:
# the .name attribute holds the Series' name
q2.name

'different'

### DataFrame
DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects.
Accepts:
* Dict of 1D ndarrays, lists, dicts, or Series
* 2-D numpy.ndarray
* Structured or record ndarray
* A Series
* Another DataFrame

The `index` parameter sets the row labels and the `columns` parameter sets the column labels.

In [37]:
# Build a DataFrame from a dict of Series: each key becomes a column.
# The row labels are the union of the individual Series' indexes, so a
# label missing from one Series ('d' in column 'one') is filled with NaN.
d = dict(
    one=pd.Series([1., 2., 3.], index=list('abc')),
    two=pd.Series([1., 2., 3., 4.], index=list('abcd')),
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [38]:
# Passing index= keeps only the listed row labels, in the order given
d2 = pd.DataFrame(data=d, index=list('dba'))
d2

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [39]:
# Passing columns= as well keeps only the listed columns, in the order given
d3 = pd.DataFrame(data=d, index=list('dba'), columns=['two'])
d3

Unnamed: 0,two
d,4.0
b,2.0
a,1.0


In [40]:
# the .index attribute returns the DataFrame's row labels
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [41]:
# the .columns attribute returns the DataFrame's column labels
df.columns

Index(['one', 'two'], dtype='object')

In [42]:
# DataFrame from a dict of equal-length lists: each key becomes a column
# and the rows get a default integer index
d = dict(one=[1., 2., 3., 4.], two=[5., 6., 7., 8.])
pd.DataFrame(data=d)

Unnamed: 0,one,two
0,1.0,5.0
1,2.0,6.0
2,3.0,7.0
3,4.0,8.0


In [43]:
# Supply index= to replace the default integer row labels with letters
pd.DataFrame(data=d, index=list('abcd'))

Unnamed: 0,one,two
a,1.0,5.0
b,2.0,6.0
c,3.0,7.0
d,4.0,8.0


In [44]:
# a one-dimensional array of two zeros
np.zeros(2)

array([ 0.,  0.])