# Pandas

In [2]:
# import declarations
import numpy as np
import pandas as pd

### A Pandas Series

A Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floats, Python objects, etc.). It can be created from, e.g., a Python dict, an ndarray, or a scalar value (a scalar is a single number like 5).

##### Creating a Series

In [3]:
# Build a Series from a plain dict: each key becomes an index label
mydict = dict(one=1, two=2, three=3)
s_dict = pd.Series(data=mydict)
s_dict

one      1
three    3
two      2
dtype: int64

In [4]:
# Build a Series from a tuple; the index argument supplies letter labels
mytupl = (1, 2, 3)
s_tupl = pd.Series(data=mytupl, index=list('abc'))
s_tupl

a    1
b    2
c    3
dtype: int64

In [5]:
# Same construction with a list as the data, again labelled with letters
mylist = [1, 2, 3]
s_mylist = pd.Series(data=mylist, index=list('abc'))
s_mylist

a    1
b    2
c    3
dtype: int64

In [6]:
# Build a Series from a NumPy ndarray, labelled 'a' through 'f'
s = pd.Series(data=np.array([1, 2, 3, 4, 5, 6]), index=list('abcdef'))
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [8]:
# return Series Index
s.index

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [9]:
# Make a Series of 5 random draws from the standard normal distribution.
# No index is passed, so pandas assigns a default integer index (0..4).
# NOTE: no random seed is set, so the values differ on every run.
t = pd.Series(np.random.randn(5))
t

0    1.096365
1   -1.728710
2   -0.647542
3   -0.433075
4   -0.504868
dtype: float64

##### Series from Python Dict

In [11]:
# create a Series using a dict - keys become the axis labels
# NOTE: the output recorded below is sorted by key (a, b, c), which is how
# older pandas behaved; pandas >= 0.23 on Python >= 3.6 keeps the dict's
# insertion order (a, c, b) instead
d = {'a':0., 'c':1., 'b':2.}
s_fromdict = pd.Series(d)
s_fromdict

a    0.0
b    2.0
c    1.0
dtype: float64

In [12]:
# Passing an explicit index (a list of labels) fixes the row order.
# Label 'd' is not a key of the dict, so its value is NaN (Not a Number).
s_fromdict_ordered = pd.Series(data=d, index=list('bcda'))
s_fromdict_ordered

b    2.0
c    1.0
d    NaN
a    0.0
dtype: float64

##### Series from a Scalar

In [13]:
# A scalar is a single value (here the float 5.0).
# pandas repeats it once for every label in the index.
s_fromscalar = pd.Series(data=5., index=list('abcde'))
s_fromscalar

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

### A Series is ndarray-like

In [14]:
# my original Series using an ndarray
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [15]:
# access the element at position 0
# .iloc makes the positional lookup explicit; plain s[0] on a Series with
# string labels is deprecated in pandas 2.x and will be treated as a label
s.iloc[0]

1

In [16]:
# Positional slicing works like on an ndarray: take the first three entries
s.iloc[:3]

a    1
b    2
c    3
dtype: int64

In [21]:
# return a boolean test across the Series
s > s.median()

a    False
b    False
c    False
d     True
e     True
f     True
dtype: bool

In [22]:
# Boolean-mask selection: keep only the entries greater than the median
s.loc[s > s.median()]

d    4
e    5
f    6
dtype: int64

In [23]:
# return only certain positions, in the order given by the list
# .iloc is required for positional lookup: s[[4,3,1]] on a Series with string
# labels is deprecated in pandas 2.x and will be treated as a label lookup
s.iloc[[4, 3, 1]]

e    5
d    4
b    2
dtype: int64

In [24]:
# apply an operation to each item in the Series
# np.exp computes the exponential function e^x element-wise
# (e is Euler's number, an irrational constant, about 2.718)
np.exp(s)

a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
f    403.428793
dtype: float64

### A Series is dict-like

In [26]:
# select a value with label-based selection
s['a']

1

In [27]:
# test if a value is in the Series
'e' in s

True

In [28]:
'z' in s

False

In [31]:
# select single value from Series using the label and .get
s.get('e')

5

### Vectorized Operations and Label Alignment with Series

In [37]:
# my original Series using an ndarray
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [38]:
# add each Series Value to itself
s + s

a     2
b     4
c     6
d     8
e    10
f    12
dtype: int64

In [40]:
# multiply by a value
s * 2

a     2
b     4
c     6
d     8
e    10
f    12
dtype: int64

In [42]:
# exponentiation of e to the nth
np.exp(s)

a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
f    403.428793
dtype: float64

In [43]:
# my original Series with ndarray again
s

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [44]:
# add two slices of s: s[1:] drops the first element, s[:-1] drops the last
# the addition aligns on index labels, not positions, so 'a' and 'f'
# (each present in only one of the slices) come out as NaN
s[1:] + s[:-1]

a     NaN
b     4.0
c     6.0
d     8.0
e    10.0
f     NaN
dtype: float64

In [46]:
# Repeat the label-aligned sum of the two slices, then discard the NaN entries
p = s[1:].add(s[:-1])
p.dropna()

b     4.0
c     6.0
d     8.0
e    10.0
dtype: float64

### Change Properties of a Series

In [49]:
# Attach a name to a Series at construction time
q = pd.Series(data=np.random.randn(5), name='something')
q

0    0.318144
1    0.581578
2    0.635075
3    0.746876
4   -0.603132
Name: something, dtype: float64

In [50]:
# rename a Series
q2 = q.rename('different')
q2

0    0.318144
1    0.581578
2    0.635075
3    0.746876
4   -0.603132
Name: different, dtype: float64

### Apply a function Across a Series

In [51]:
# helper that scales its argument by a factor of 100
def mult(num):
    """Return ``num`` multiplied by 100."""
    scaled = num * 100
    return scaled

In [53]:
# apply to each item in the series
q2.apply(mult)

0    31.814357
1    58.157850
2    63.507529
3    74.687569
4   -60.313197
Name: different, dtype: float64

### A Pandas DataFrame

A DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects.
Accepts:
* Dict of 1D ndarrays, lists, dicts, or Series
* 2-D numpy.ndarray
* Structured or record ndarray
* A Series
* Another DataFrame

The `index` parameter sets the row labels and the `columns` parameter sets the column labels.

In [58]:
# Build a DataFrame from a dict of Series: each dict key becomes a column and
# the rows are the union of the Series indexes ('d' has no value in 'one')
d = {
    'one': pd.Series(data=[1., 2., 3.], index=list('abc')),
    'two': pd.Series(data=[1, 2, 3, 4], index=list('abcd')),
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [61]:
# Build a second DataFrame from the same dict, keeping only the listed
# row labels, in the order given
d2 = pd.DataFrame(data=d, index=list('dba'))
d2

Unnamed: 0,one,two
d,,4
b,2.0,2
a,1.0,1


In [64]:
# Restrict both axes: keep only the listed row labels and the single column 'two'
d3 = pd.DataFrame(data=d, index=list('dba'), columns=['two'])
d3

Unnamed: 0,two
d,4
b,2
a,1


In [65]:
# return index 
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [66]:
# return columns
df.columns

Index(['one', 'two'], dtype='object')

In [67]:
# return data types
df.dtypes

one    float64
two      int64
dtype: object

In [68]:
# DataFrame from a dict of equal-length lists (rows get a default integer index)
d = {'one': [1., 2., 3., 4.], 'two': [5., 6., 7., 8.]}
pd.DataFrame(data=d)

Unnamed: 0,one,two
0,1.0,5.0
1,2.0,6.0
2,3.0,7.0
3,4.0,8.0


In [69]:
# change index (row labels)
pd.DataFrame(d, index=['a','b','c','d'])

Unnamed: 0,one,two
a,1.0,5.0
b,2.0,6.0
c,3.0,7.0
d,4.0,8.0


### Pandas Vectorized Operations and Other Operations

In [78]:
# a DataFrame
# NOTE(review): the recorded output below already has the 'sum', 'stdev' and
# 'new_col' columns that are only created in later cells, and row 'b' is
# missing - the cells were run out of order; re-run from a fresh kernel
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
c,3.0,3,6.0,1.732051,3.0
d,0.0,4,4.0,0.0,4.0


In [79]:
# vectorized operation with .sum()
# axis=1 reduces across the columns, producing one value per row
# NOTE(review): 'stdev' is computed over 'one', 'two' AND the derived 'sum'
# column - confirm that including 'sum' is intended
df['sum'] = df[['one','two']].sum(axis=1)
df['stdev'] = df[['one','two','sum']].std(axis=1)
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
c,3.0,3,6.0,1.732051,3.0
d,0.0,4,4.0,2.309401,4.0


In [84]:
# fill NaN in column 'one' with 0.0
# Assign the result back to the column: calling fillna(inplace=True) on
# df['one'] is chained assignment and may only modify a temporary copy,
# leaving df itself unchanged (always the case under pandas Copy-on-Write)
df['one'] = df['one'].fillna(0.0)
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
c,3.0,3,6.0,1.732051,3.0
d,0.0,4,4.0,2.309401,4.0


In [86]:
# add a new column to the DataFrame that contains a NaN
# (the list must supply one value per row; the output below shows 3 rows)
df['new_col'] = [1,np.nan,3]
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
c,3.0,3,6.0,1.732051,
d,0.0,4,4.0,2.309401,3.0


In [87]:
# now remove only the rows that have NaN in 'new_col'
# reassigning the result (instead of inplace=True) keeps the step explicit
# and avoids mutating an object that other names may still refer to
df = df.dropna(subset=['new_col'])
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
d,0.0,4,4.0,2.309401,3.0


In [88]:
# returns a single column as a Pandas Series
x = df['one']
print(x)
type(x)

a    1.0
d    0.0
Name: one, dtype: float64


pandas.core.series.Series

In [89]:
# returns a numpy 1D array
x = df['one'].values
print(x)
type(x)

[ 1.  0.]


numpy.ndarray

In [90]:
# show DataFrame again
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
d,0.0,4,4.0,2.309401,3.0


In [94]:
# Keep only the rows whose 'one' value appears in the criteria list
# (the list may hold several accepted values)
criteria = [1.0]
df = df.loc[df['one'].isin(criteria)]
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0


In [118]:
# add new rows back into the DataFrame using pd.concat

# first make a couple of independent copies of df; plain assignment
# (df2 = df) would only create another name for the same object
df2 = df.copy()
df3 = df.copy()
# stack the three frames vertically; the original row labels are kept,
# so the result repeats the same index label
df = pd.concat([df, df2, df3])
df

Unnamed: 0,one,two,sum,stdev,new_col
a,1.0,1,2.0,0.57735,1.0
a,1.0,1,2.0,0.57735,1.0
a,1.0,1,2.0,0.57735,1.0


In [119]:
# now change up some values
# .iloc selects rows by position, which is unambiguous even though every row
# carries the same index label; plain arithmetic is already vectorized, so no
# .apply/lambda is needed
df.iloc[:1] = df.iloc[:1] + 2      # first row: add 2 to every column
df.iloc[2:] = df.iloc[2:] * 100    # third row onward: multiply every column by 100

In [121]:
# now df is ready to play with
df

Unnamed: 0,one,two,sum,stdev,new_col
a,3.0,3,4.0,2.57735,3.0
a,1.0,1,2.0,0.57735,1.0
a,100.0,100,200.0,57.735027,100.0


In [124]:
# Drop the rows whose 'one' value appears in the criteria list.
# The tilde (~) negates the boolean mask, which is the only change
# compared with keeping the matching rows.
criteria = [1.0]
df = df.loc[~df['one'].isin(criteria)]
df

Unnamed: 0,one,two,sum,stdev,new_col
a,3.0,3,4.0,2.57735,3.0
a,100.0,100,200.0,57.735027,100.0


### Use Regex and Pandas functions to manipulate string elements

### replace

In [126]:
# Fresh DataFrame built from a dict of Series; column 'three' holds
# currency-formatted strings to clean up afterwards
d = {
    'one': pd.Series(data=[1., 2., 3.], index=list('abc')),
    'two': pd.Series(data=[1, 2, 3, 4], index=list('abcd')),
    'three': pd.Series(data=['($1.22)', '($3.44)', '($6.44)', '($9.33)'], index=list('abcd')),
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,three,two
a,1.0,($1.22),1
b,2.0,($3.44),2
c,3.0,($6.44),3
d,,($9.33),4


In [127]:
# use a regex to strip '$', '(', ')' and ',' from every value in column 'three'
# note, this is a vectorized operation
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the values unchanged
# the character class matches the same characters as the original '[$,(,)]'
df['three'] = df['three'].str.replace(r'[$(),]', '', regex=True)
df

Unnamed: 0,one,three,two
a,1.0,1.22,1
b,2.0,3.44,2
c,3.0,6.44,3
d,,9.33,4
