# Pandas
Pandas is a high level data structure built on Numpy, and make it easy to use in NumPy centric applications.

## Series
A Series is a one-dimensional array-like object with associated array of data labels. 

In [8]:
# Load array from list
from pandas import Series, DataFrame
import pandas as pd
series = Series([4, 7, -5, 3])
print("series = ", series)
print("series.index = ", list(series.index))
print("series.values = ", series.values)

series = Series([4, 7, -5, 3], index= ['a', 'b', 'c', 'd'])
print("series with new indexes = ", series)
print("series[[a , c]] = ", series[['a' , 'c']])

series =  0    4
1    7
2   -5
3    3
dtype: int64
series.index =  [0, 1, 2, 3]
series.values =  [ 4  7 -5  3]
series with new indexes =  a    4
b    7
c   -5
d    3
dtype: int64
series[[a , c]] =  a    4
c   -5
dtype: int64


#### Convert dictionary to series

In [14]:
from pandas import Series, DataFrame
import pandas as pd
data = {'Ohio':35000, 'Texas': 71000, 'Oregon': 16000, 'Utah' : 5000}
series = Series(data)
print("series = ", series)
states = ['California', 'Ohio', 'Oregon', 'Texas']
slices = Series(data, index = states)
print("slices = ", slices)
print("pd.isnull(slices):", pd.isnull(slices))
print("pd.notnull(slices):", pd.notnull(slices))
print("series + slices = ", series + slices)


series =  Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
slices =  California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
pd.isnull(slices): California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
pd.notnull(slices): California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
series + slices =  California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


In [2]:
from pandas import Series, DataFrame
import pandas as pd
data = {'Ohio':35000, 'Texas': 71000, 'Oregon': 16000, 'Utah' : 5000}
series = Series(data)
series.name = 'population'
series.index.name = 'state'
print("series = ", series)

series =  state
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
Name: population, dtype: int64


## DataFrame 
A DataFrame is a tabular, row-column data structure. It contains a collection of columns and a list of rows. The DataFrame support both row and column wise index.

#### DataFrame Constructor
| Type | Notes |
| --- | --- |
| 2D ndarray | A matrix of data, passing optional row and column labels |
| dict of arrays, list, or tuples | Each sequence becomes a column in the DataFrame. All Sequences must be the same length. |
| NumPy structured array | Treated as the 'dict of arrays' case |
| dict of Series | Each value becomes a column. Indexes from each Series are unioned together to from the result's row index if no explicit index is passed. |
| list of dicts or Series | Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame's column labels |
| List of lists or tuples | Treated as the '2D ndarray' case
| Another DataFrame | The DataFrame's indexes are used unless different ones are passed |
| NumPy MaskedArray | Like the '2D ndarray' case except masked values become NA in the DataFrame result | 

In [23]:
from pandas import Series, DataFrame
import pandas as pd
data = {'state' : ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year' : [2000, 2001, 2002, 2001, 2002],
        'pop' : [1.5, 1.7, 3.6, 2.4, 2.9]
        }
frame = DataFrame(data)
print("frame = ", frame)

# reorder columns
frame1 = DataFrame(data, columns = ['year', 'state', 'pop'])
frame1.index = ['one', 'two', 'three', 'four', 'five']
print("frame1 = ", frame1)
print("frame1.loc['three'] = ", frame1.loc['three'])

frame =      state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
frame1 =         year   state  pop
one    2000    Ohio  1.5
two    2001    Ohio  1.7
three  2002    Ohio  3.6
four   2001  Nevada  2.4
five   2002  Nevada  2.9
frame1.loc['three'] =  year     2002
state    Ohio
pop       3.6
Name: three, dtype: object


In [158]:
from pandas import Series, DataFrame
import pandas as pd
data = {
    'Nevada' : {2001 : 2.4, 2002 : 2.9},
    'Ohio' : {2000 : 1.5, 2001 : 1.7, 2002 : 3.6}
    }
frame = DataFrame(data)
print("frame = ", frame)
print("frame.T = ", frame.T)

frame1 = DataFrame(data, index=[2001, 2002, 2003])
frame1.index.name = 'year'
frame1.columns.name = 'state'
print("frame1 = ", frame1)

frame2 = DataFrame({'Ohio' : frame['Ohio'][:1],
                    'Nevada' : frame['Nevada'][:2]})
print("frame2 = ", frame2)


frame =        Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5
frame.T =          2001  2002  2000
Nevada   2.4   2.9   NaN
Ohio     1.7   3.6   1.5
frame1 =  state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2003      NaN   NaN
frame2 =        Ohio  Nevada
2001   1.7     2.4
2002   NaN     2.9


#### Index Objects in pandas
| Class | Description |
| --- | --- |
| Index | The most general index object, representing axis labels in a NumPy array of Python Objects. |
| Int64Index | Specialize Index for integer values |
| MultiIndex | 'Hierachical' index object representing multiple levels of indexing on a single axis. Can be thought of as similar to an array of tuples |
| DatetimeIndex | Stores nanosecond timestamps (NumPy's datetime64 dtype). |
| PeriodIndex | Specialized Index for Period data (timespans) |

#### Index methods and properties
| Method | Description |
| --- | --- |
| append | Concatenate with additional Index object, producing a new Index |
| diff | Compute set difference as an Index |
| intersection | Compute set intersection |
| union | Compute set union |
| isin | Compute boolean array indicating whether each value is contained in the passed collection |
| delete | Compute new Index with element at index i deleted |
| drop | Compute new Index by deleting passed values |
| insert | Compute new Index by inserting element at index i |
| is_monotonic | Return True if each element is greater than or equal to the previous element. |
| is_unique | Return True if the Index has no duplicate values |
| unique | Compute the array of unique values in the Index |
| reindex | Reindex series by expanding or truncating the index |

#### Reindexing

In [50]:
# reindex series
from pandas import Series, DataFrame
import pandas as pd
series = Series([4.5, 7.2, -5.3, 3.6], index=['d','b', 'a', 'c'])
print("Before reindex")
print("series.index = ", list(series.index))
print("series.values = ", series.values)
series = series.reindex(['a', 'b', 'c', 'd', 'e'])
print("After reindex")
print("series.index = ", list(series.index))
print("series.values = ", series.values)
print("series.drop('c')")
series = series.drop('e')
print("series.index = ", list(series.index))
print("series.values = ", series.values)

Before reindex
series.index =  ['d', 'b', 'a', 'c']
series.values =  [ 4.5  7.2 -5.3  3.6]
After reindex
series.index =  ['a', 'b', 'c', 'd', 'e']
series.values =  [-5.3  7.2  3.6  4.5  nan]
series.drop('c')
series.index =  ['a', 'b', 'c', 'd']
series.values =  [-5.3  7.2  3.6  4.5]


In [77]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.arange(16).reshape((4, 4)), index = ['Ohio', 'Colorado', 'Utah', 'New York'], columns = (['one', 'two', 'three', 'four']))
print("data = ", data)
#drop rows
print("data.drop(['Colorado','Ohio']) = ", data.drop(['Colorado','Ohio']))
#drop columns
print("data.drop(['two','four'], axis = 1) = ", data.drop(['two','four'], axis = 1))

data =            one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
data.drop(['Colorado','Ohio']) =            one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
data.drop(['two','four'], axis = 1) =            one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [98]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
df = DataFrame(np.arange(16).reshape((4, 4)), index = ['Ohio', 'Colorado', 'Utah', 'New York'], columns = (['one', 'two', 'three', 'four']))
print("df = ", df)

print("df.loc['Colorado']", df.loc['Colorado']) 
print("df.loc[['Ohio', 'Colorado']][['one', 'three']]", df.loc[['Ohio', 'Colorado']][['one', 'three']]) 
print("df['three']", df['three']) 

df =            one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
df.loc['Colorado'] one      4
two      5
three    6
four     7
Name: Colorado, dtype: int32
df.loc[['Ohio', 'Colorado']]           one  three
Ohio        0      2
Colorado    4      6
df['three'] Ohio         2
Colorado     6
Utah        10
New York    14
Name: three, dtype: int32


#### Observation 
1. select a list of rows we should use loc method with a list of row id.
2. if we only select one row it return a distionary with column : value


#### Arithmetic and data alignment

arithmetic methods
| Method | Description |
| --- | --- |
| add | Method for addition(+) |
| sub | Method for substraction(-) |
| div | Method for division(/) |
| mul | Method for multiplication(*) |

In [101]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# add two data frames
df1 = DataFrame(np.arange(9).reshape(3,3), columns=list("bcd"), index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape(4,3), columns=list("bde"), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print('df1 = ', df1)
print('df2 = ', df2)
print('df1 + df2 = ', df1 + df2)
print("df1.add(df2, fill_value=0)", df1.add(df2, fill_value=0))


df1 =            b  c  d
Ohio      0  1  2
Texas     3  4  5
Colorado  6  7  8
df2 =          b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
df1 + df2 =              b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN
df1.add(df2, fill_value=0)             b    c     d     e
Colorado  6.0  7.0   8.0   NaN
Ohio      3.0  1.0   6.0   5.0
Oregon    9.0  NaN  10.0  11.0
Texas     9.0  4.0  12.0   8.0
Utah      0.0  NaN   1.0   2.0


#### Observation 
1. If we want to do arithemtic operations on two data frame, we should use fill_value

#### Function application and mapping

In [115]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
def f(x) :
    return Series([x.min(), x.max()], index=['min', 'max'])
df = DataFrame(np.arange(12).reshape(4,3), columns=list("bde"), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print("Before function")
print(df)
df2 = df.apply(f)
print("")
print("After function")
print(df2)
print("f = lambda x : x.max() - x.min()")
print("df.apply(f)")
df3 = df.apply(f)
print(df3)

Before function
        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11

After function
     b   d   e
min  0   1   2
max  9  10  11
f = lambda x : x.max() - x.min()
df.apply(f)
     b   d   e
min  0   1   2
max  9  10  11


#### Sort and Ranking

In [136]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
frame = DataFrame({'b' : [4, 7, -3, -2], 'a' : [0, 1, 0, 1]}, index=[0,1,2,3])
print("frame =", frame)
frame = frame.sort_index(axis = 1)
print("frame.sort_values(by=['a', 'b'])", frame.sort_values(by=['a', 'b']))
frame['rank_a'] = frame['a'].rank()
frame['rank_b'] = frame['b'].rank(method='max')
print(frame)


frame =    b  a
0  4  0
1  7  1
2 -3  0
3 -2  1
frame.sort_values(by=['a', 'b'])    a  b
2  0 -3
0  0  4
3  1 -2
1  1  7
   a  b  rank_a  rank_b
0  0  4     1.5     3.0
1  1  7     3.5     4.0
2  0 -3     1.5     1.0
3  1 -2     3.5     2.0


#### handle NaN data
| Method | Description |
| --- | --- |
| dropna | Filter axis labels based on whether values for each label having missing data, with varying thresholds for how much missing data to tolerate. |
| fillna | Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'. |
| isnull | return boolean values indicating which values are missing N/A. |
| notnull | Negation of isnull |

In [159]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import NaN
data = DataFrame([[1.0, 6.5, 3.0], [1.0, NaN, NaN], [NaN, NaN, NaN], [NaN, 6.2, 3.0]])
print("data = ", data)
print("data.dropna():\n", data.dropna())
print("data.dropna(how='all'):\n", data.dropna(how='all'))
print("data.dropna(thresh=2):\n", data.dropna(thresh=2))
print("data.fillna(0):\n", data.fillna(0))

data =       0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.2  3.0
data.dropna():
      0    1    2
0  1.0  6.5  3.0
data.dropna(how='all'):
      0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.2  3.0
data.dropna(thresh=2):
      0    1    2
0  1.0  6.5  3.0
3  NaN  6.2  3.0
data.fillna(0):
      0    1    2
0  1.0  6.5  3.0
1  1.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  6.2  3.0


### DataFrame Merge
DataFrame Merge is a database style join operation. It can happen on some key columns or on index.

#### Merge Function

Merge Function Argument
| Argument | Description |
| --- | --- |
| left | DataFrame to be merged on the left side |
| right | DataFrame to be merged on the right side |
| how | One of 'inner', 'outer', 'left', or 'right'. 'inner' by default |
| on | Column names to join on. Must be found in both DataFrame objects. If not specified and no other join keys given, will use the intersection of the column names in left and right as the join keys |
| left_on | Columns in left DataFrame to use as join keys |
| right_on | Analogous to left_on for left DataFrame |
| left_index | Use row index in left as its join key (or keys, if a MultiIndex)
| right_index | Analogous to left_index
| Sort | Sort merged data lexicographically by join keys; True bu default. Disable to get better performance in some cases on large datasets. |
| suffixes | Tuple of string values to append to column names in case of overlap; default to ('_x', '_y'. For example if 'data' in both DataFrame objects, would appear as 'data_x' and 'data_y' in result. |
| copy | If False, avoid copying data into resulting data structure in some exceptional cases. By default always copies. |



In [12]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import NaN
df1 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data':range(7)})
df2 = DataFrame({'rkey': ['a', 'b', 'c', 'd'], 'data':range(4)})
df3 = pd.merge(df1, df2, left_on = 'lkey', right_on = 'rkey', how='outer');
print("df1 = ", df1)
print("df2 = ", df2)
print("df3 = ", df3)

df1 =    lkey  data
0    b     0
1    b     1
2    a     2
3    c     3
4    a     4
5    a     5
6    b     6
df2 =    rkey  data
0    a     0
1    b     1
2    c     2
3    d     3
df3 =    lkey  data_x rkey  data_y
0    b     0.0    b       1
1    b     1.0    b       1
2    b     6.0    b       1
3    a     2.0    a       0
4    a     4.0    a       0
5    a     5.0    a       0
6    c     3.0    c       2
7  NaN     NaN    d       3


#### Join on index

In [13]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import NaN
left = DataFrame([[1,2],[3,4],[5,6]], index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])
right = DataFrame([[7,8],[9,10],[11,12],[13,14]], index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print("left = \n", left)
print("right = \n", right)
left.join(right, how='outer')

left = 
    Ohio  Nevada
a     1       2
c     3       4
e     5       6
right = 
    Missouri  Alabama
b         7        8
c         9       10
d        11       12
e        13       14


Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


#### Concatenating Along and Axis

#### concat function argument
| Argument | Description |
| --- | --- |
| objs | List or dict of pandas objects to be concatenated. The only required argument. |
| axis | Axis to concatenate along; default to 0 |
| join | One of 'inner', 'outer', defaulting to 'outer'; whether to intersection (inner) or union (outer) together indexes along the other axes |
| join_axes | Specific indexes to use for the other n - 1 axes instead of performing union / intersection logic. |
| keys | Values to associate with objects being concatenated, forming a hierachical index along the concatenation axis. Can either be a list or arrat of arbitrary values, an array of tuples. or a list of arrays (if multiple level arrays passed in levels) |
| levels | Specific indexes to use as hierarchical index level or levels if keys passed. |
| names | Names for created hierarchical levels if keys and / or levels passed. |
| verify_integrity | Check new axis in concatenated object for duplicates and raise exception if so. By default (False) allows duplicates. |
| ignore_index | Do not preserve indexes along concatenation axis, instead producing a new range (total_length) index |

In [23]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import NaN
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['g', 'f'])
print("pd.concat([s1, s2, s3], ignore_index=True)")
s4 = pd.concat([s1, s2, s3], ignore_index=True)
print(s4)
print("pd.concat([s1, s2, s3], axis=1)")
s5 = pd.concat([s1, s2, s3], axis=1)
print(s5)
print("pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])")
s6 = pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
print(s6)

pd.concat([s1, s2, s3], ignore_index=True)
0    0
1    1
2    2
3    3
4    4
5    5
6    6
dtype: int64
pd.concat([s1, s2, s3], axis=1)
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
g  NaN  NaN  5.0
f  NaN  NaN  6.0
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
g  NaN  NaN    5.0
f  NaN  NaN    6.0


#### Combination Data with Overlap

In [24]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['a', 'b', 'c', 'd', 'e', 'f'])
b = Series(np.arange(len(a), dtype=np.float64))
c = np.where(pd.notnull(a), a, b)
print("np.where(pd.notnull(a), a, b)")
print(c)

np.where(pd.notnull(a), a, b)
[0.  2.5 2.  3.5 4.5 5. ]


### Reshaping and Pivoting

#### stack() and unstack()

In [30]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.arange(6).reshape((2,3)), 
                index=pd.Index(['Ohio', 'Colorado'], name='state'), 
                columns=pd.Index(['one', 'two', 'three'], name='number'))
print(data)
print("data.stack()")
df1 = data.stack()
print(df1)
# unstack inner level
print("df1.unstack()")
df2 = df1.unstack()
print(df2)
# unstack outer level
print("df1.unstack(0)")
df3 = df1.unstack(0)
print(df3)

number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
data.stack()
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
df1.unstack()
number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
df1.unstack(0)
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5


#### Pivoting "long" to "wide" format

In [35]:
from pandas import Series, DataFrame
import pandas as pd
data = {'sequence' : [1, 1, 1, 2, 2, 2, 3, 3, 3],
        'item' : ['gdp', 'infl', 'unemp', 'gdp', 'infl', 'unemp', 'gdp', 'infl', 'unemp'],
        'values' : [2710, 0.0, 5.8, 2778, 2.34, 5.10, 2775, 2.74, 5.3]
        }
df = DataFrame(data)
print(df)
pivoted = df.pivot(index='sequence', columns='item', values='values')
print(pivoted)

   sequence   item   values
0         1    gdp  2710.00
1         1   infl     0.00
2         1  unemp     5.80
3         2    gdp  2778.00
4         2   infl     2.34
5         2  unemp     5.10
6         3    gdp  2775.00
7         3   infl     2.74
8         3  unemp     5.30
item         gdp  infl  unemp
sequence                     
1         2710.0  0.00    5.8
2         2778.0  2.34    5.1
3         2775.0  2.74    5.3


### Data Transformation

#### Removing Duplicates

In [44]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame({'k1' : ['one'] * 3 + ['two'] * 4, 
                  'k2' : [1, 1, 2, 3, 3, 4, 4]})
df1 = data[data.duplicated()]
print("duplicated()")
print(df1)
print("drop_duplicates()")
df2 = data.drop_duplicates()
print(df2)
print("drop_duplicates(['k1'])")
df3 = data.drop_duplicates(['k1'])
print(df3)

duplicated()
    k1  k2
1  one   1
4  two   3
6  two   4
drop_duplicates()
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
drop_duplicates(['k1'])
    k1  k2
0  one   1
3  two   3


#### Mapping data

In [53]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame({'food' : ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 
                            'Bacon', 'pastrami', 'honey ham', 'noval lox'], 
                  'ounces' : [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

meat_to_animal = {
    'bacon': 'pig',
    'pulled pork' : 'pig',
    'pastrami' : 'cow',
    'corned beef' : 'cow',
    'honey ham' : 'pig',
    'noval lox' : 'salmon'
    }
data['animal'] = data['food'].map(lambda x : meat_to_animal[x.lower()])
data['animal1'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)

          food  ounces  animal animal1
0        bacon     4.0     pig     pig
1  pulled pork     3.0     pig     pig
2        bacon    12.0     pig     pig
3     Pastrami     6.0     cow     cow
4  corned beef     7.5     cow     cow
5        Bacon     8.0     pig     pig
6     pastrami     3.0     cow     cow
7    honey ham     5.0     pig     pig
8    noval lox     6.0  salmon  salmon


#### Replacing values

In [59]:
from pandas import Series
import numpy as np
data = Series([1., -999., 2, -999., -1000, 3])
print(data)
print("data.replace([-999, -1000], [0, 0])")
df1 = data.replace([-999, -1000], [0, 0])
print(df1)
print("np.where(data < 0, 0, data)")
df2 = Series(np.where(data < 0, 0, data))
print(df2)

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
data.replace([-999, -1000], [0, 0])
0    1.0
1    0.0
2    2.0
3    0.0
4    0.0
5    3.0
dtype: float64
np.where(data < 0, 0, data)
0    1.0
1    0.0
2    2.0
3    0.0
4    0.0
5    3.0
dtype: float64


#### Rename Axis Indexes

In [64]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.arange(12).reshape((3,4)), 
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data.index = data.index.map(str.upper)
data.columns = data.columns.map(str.title)
print(data) 

          One  Two  Three  Four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11


#### Discretization and Binning

In [66]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
groups = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
df1 = pd.cut(ages, bins, labels=groups, right=False) 
print(df1)

['Youth', 'Youth', 'YoungAdult', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']


#### Detect and Filter

In [96]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.arange(18).reshape((6,3)))
print(data[data[2]>10])

    0   1   2
3   9  10  11
4  12  13  14
5  15  16  17


#### Computing Indicator / Dummy Variables

In [98]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
df = DataFrame({'key' : ['b', 'b', 'a', 'c', 'a', 'b'],
                  'data1' :range(6)})
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False
