# Pandas Objects

### A Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Series is a one-dimensional labeled array capable of holding
# any data type(ex. a Python dict, an ndarray, a scalar value (like 5)

In [3]:
# Series using a dict
mydict = {'one':1, 'two':2, 'three':3}
s_dict = pd.Series(mydict)
s_dict

one      1
three    3
two      2
dtype: int64

In [4]:
# Series using a tuple, note gave letters as labels
mytupl = (1,2,3)
s_tupl = pd.Series(mytupl, index=['a','b','c'])
s_tupl

a    1
b    2
c    3
dtype: int64

In [5]:
# Series using a list, also gave letters as labels
mylist = [1,2,3]
s_mylist = pd.Series(mylist, index=['a','b','c'])
s_mylist

a    1
b    2
c    3
dtype: int64

In [6]:
# Series using an ndarray
s = pd.Series(np.array([1,2,3,4,5,6]), index=['a','b','c','d','e','f'])
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [7]:
# return index
s.index

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [8]:
# have index automatically assigned by not specifying it
t = pd.Series(np.random.randn(5))
t

0   -1.526269
1   -1.178559
2    0.718529
3    0.274425
4   -0.120063
dtype: float64

##### Using a dict

In [9]:
# using a dict, the keys become the axis labels
# NOTE, sorted according to keys
d = {'a':0., 'c':1., 'b':2.}
s_fromdict = pd.Series(d)
s_fromdict

a    0.0
b    2.0
c    1.0
dtype: float64

In [10]:
# adding an index manually can force an ordering
# note, index is a list
# note, NaN (Not a Number) added for missing value created by new key
s_fromdict_ordered = pd.Series(d, index=['b', 'c', 'd', 'a'])
s_fromdict_ordered

b    2.0
c    1.0
d    NaN
a    0.0
dtype: float64

##### Series using a scalar

In [11]:
# a scalar is just a fucking number
# scalar is just copied into the number of indexes
s_fromscalar = pd.Series(5., index=['a','b','c','d','e'])
s_fromscalar

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

##### Series is ndarray-like

In [12]:
# my original Series using an ndarray
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [13]:
# access the 0th
s[0]

1

In [14]:
# so I can slice
s[:3]

a    1
b    2
c    3
dtype: int64

In [15]:
# can perform math inside which computes to an index start
# note, s.median() is 3.5
s[s > s.median()]

d    4
e    5
f    6
dtype: int64

In [16]:
# a list denotes indexes to be retrieved
s[[4,3,1]]

e    5
d    4
b    2
dtype: int64

In [17]:
# the exponential function e^x (irrational number to the xth)
np.exp(s)

a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
f    403.428793
dtype: float64

#### series is dict-like

In [18]:
# again, from my original Series from an ndarray
s['a']

1

In [19]:
'e' in s

True

In [20]:
'z' in s

False

In [21]:
# nothing output here
s.get('x')

In [22]:
# nan output
s.get('x', np.nan)

nan

##### vectorized operations and label alignment with Series

In [23]:
# my original Series using an ndarray
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [24]:
s + s

a     2
b     4
c     6
d     8
e    10
f    12
dtype: int64

In [25]:
s * 2

a     2
b     4
c     6
d     8
e    10
f    12
dtype: int64

In [26]:
np.exp(s)

a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
f    403.428793
dtype: float64

In [27]:
# original Series with ndarray
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [28]:
# slice and add
s[1:] + s[:-1]

a     NaN
b     4.0
c     6.0
d     8.0
e    10.0
f     NaN
dtype: float64

In [29]:
# drop unlabeled
p = s[1:]+s[:-1]
p.dropna()

b     4.0
c     6.0
d     8.0
e    10.0
dtype: float64

In [30]:
# you can add a name
q = pd.Series(np.random.randn(5), name = 'something')
q

0    0.414875
1   -0.519416
2   -0.285779
3    0.049953
4   -0.113682
Name: something, dtype: float64

In [31]:
q2 = q.rename('different')
q2

0    0.414875
1   -0.519416
2   -0.285779
3    0.049953
4   -0.113682
Name: different, dtype: float64

##### Apply a function across a Series

In [32]:
def mult(num):
    return num * 100

In [33]:
q2.apply(mult)

0    41.487545
1   -51.941650
2   -28.577891
3     4.995330
4   -11.368205
Name: different, dtype: float64

##### Numpy modify in place  with += operator

In [34]:
# modify in place performs vectorized operation in place
import numpy as np
a = np.array([1,2,3,4])
b = a
a += np.array([1,1,1,1])
print(b)

[2 3 4 5]


In [35]:
# just using the + operator creates a new variable
# so b was never changed by the time it prints
a = np.array([1,2,3,4])
b = a
a = a + np.array([1,1,1,1])
print(b)

[1 2 3 4]


### DataFrame
DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects.
Accepts:
* Dict of 1D ndarrays, lists, dicts, or Series
* 2-D numpy.ndarray
* Structured or record ndarray
* A Series
* Another DataFrame

'index' parameter is row labels and 'column' is column labels

In [36]:
# from dict of Series
# labels are formed with the union
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [37]:
# only show certain indices and in certain order
d2 = pd.DataFrame(d, index=['d','b','a'])
d2

Unnamed: 0,one,two
d,,4
b,2.0,2
a,1.0,1


In [38]:
# only show certain columns and in certain order
d3 = pd.DataFrame(d, index=['d','b','a'], columns=['two'])
d3

Unnamed: 0,two
d,4
b,2
a,1


In [39]:
# return index 
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [40]:
# return columns
df.columns

Index(['one', 'two'], dtype='object')

In [42]:
# return data types
df.dtypes

one    float64
two      int64
dtype: object

In [42]:
#### DataFrame from dict of lists
d = {'one': [1.,2.,3.,4.], 'two':[5.,6.,7.,8.]}
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,5.0
1,2.0,6.0
2,3.0,7.0
3,4.0,8.0


In [43]:
# change index (row labels)
pd.DataFrame(d, index=['a','b','c','d'])

Unnamed: 0,one,two
a,1.0,5.0
b,2.0,6.0
c,3.0,7.0
d,4.0,8.0


In [44]:
np.zeros(2,)

array([ 0.,  0.])

### Pandas operations

!!! Find out difference between apply, applymap, and map

In [45]:
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [46]:
# vectorized operation with .sum()
df['sum'] = df[['one','two']].sum(axis=1)
df['stdev'] = df[['one','two','sum']].std(axis=1)
df

Unnamed: 0,one,two,sum,stdev
a,1.0,1,2.0,0.57735
b,2.0,2,4.0,1.154701
c,3.0,3,6.0,1.732051
d,,4,4.0,0.0


In [47]:
# fill NaN values
df['one'].fillna(0.0, inplace=True)
df

Unnamed: 0,one,two,sum,stdev
a,1.0,1,2.0,0.57735
b,2.0,2,4.0,1.154701
c,3.0,3,6.0,1.732051
d,0.0,4,4.0,0.0


In [48]:
# add a mixed row with some NaN
df['new_col'] = [1,np.nan,3,4]
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
b,2.0,2,4.0,1.154701,
c,3.0,3,6.0,1.732051,3.0
d,0.0,4,4.0,0.0,4.0


In [49]:
# now remove only the row with Nan
df.dropna(subset=['new_col'], inplace=True)
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
c,3.0,3,6.0,1.732051,3.0
d,0.0,4,4.0,0.0,4.0


In [50]:
# returns a single column as a Pandas Series
x = df['one']
print(x)
type(x)

a    1.0
c    3.0
d    0.0
Name: one, dtype: float64


pandas.core.series.Series

In [51]:
# retuerns a numpy 1D array
x = df['one'].values
print(x)
type(x)

[ 1.  3.  0.]


numpy.ndarray

### Remove or show only rows with certain conditions

In [52]:
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
c,3.0,3,6.0,1.732051,3.0
d,0.0,4,4.0,0.0,4.0


In [53]:
# keep anything with criteria met
criteria = [1.0,0.0]
df = df[df['one'].isin(criteria)]
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
d,0.0,4,4.0,0.0,4.0


In [54]:
# remove anything with criteria met
criteria = [1.0]
df = df[~df['one'].isin(criteria)]
df

Unnamed: 0,one,two,sum,stdev,new_col
d,0.0,4,4.0,0.0,4.0


In [55]:
df

Unnamed: 0,one,two,sum,stdev,new_col
d,0.0,4,4.0,0.0,4.0


### Use Regex and Pandas functions to manipulate string elements

##### replace

In [56]:
# from dict of Series
# labels are formed with the union
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
    'three' : pd.Series(['($1.22)','($3.44)','($6.44)','($9.33)'], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df

Unnamed: 0,one,three,two
a,1.0,($1.22),1
b,2.0,($3.44),2
c,3.0,($6.44),3
d,,($9.33),4


In [57]:
df.three = df.three.str.replace('[$,(,)]','')
df

Unnamed: 0,one,three,two
a,1.0,1.22,1
b,2.0,3.44,2
c,3.0,6.44,3
d,,9.33,4


### Vector operations:

#### Create dataframes

In [89]:
# dataframes from a dict
df1 = pd.DataFrame({'A': pd.Series([1,2,3,4],index=list('ABCD'),dtype='float32'),
                   'C': pd.Series([5,6,7,8],index=list('ABCD'),dtype='float32')})

df2 = pd.DataFrame({'X': pd.Series([100,200,300,400],index=list('ABCD'),dtype='float32')})

In [90]:
df1

Unnamed: 0,A,C
A,1.0,5.0
B,2.0,6.0
C,3.0,7.0
D,4.0,8.0


In [91]:
df2

Unnamed: 0,X
A,100.0
B,200.0
C,300.0
D,400.0


##### Perform vector operations

Multiplies values within a single dataframe to create new column: