# Handling Missing Data

In [3]:
import pandas
import numpy as np
from numpy import nan as NA

string_data = pandas.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
# .isnull() will return a boolean array
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

## Filtering Out Missing Data & Filling in Missing Data

In [7]:
# Series
data = pandas.Series([1, NA, 3.5, NA, 7])
clean = [
    data.dropna(), '','data[data.notnull()]',
    data[data.notnull()], '',''
]
clean

[0    1.0
 2    3.5
 4    7.0
 dtype: float64,
 '',
 'data[data.notnull()]',
 0    1.0
 2    3.5
 4    7.0
 dtype: float64,
 '',
 '']

In [17]:
# DataFrame's
data = pandas.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
data2 = pandas.Series([1., NA, 5, NA, 10])
df = pandas.DataFrame(np.random.randn(8,3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
clean = [
    data, '','',
    data.dropna(), '','data.dropna(how="all")',
    data.dropna(how='all'), '','df',
    df, '','df.dropna(thresh="2")',
    df.dropna(thresh=2), '','df.fillna(0)',
    df.fillna(0), '','df.fillna({1:0.5, 2:0}',
    df.fillna({1:0.5, 2:0}), '','.fillna(data2.mean())',
    df.fillna(data2.mean())
]
clean

[     0    1    2
 0  1.0  6.5  3.0
 1  1.0  NaN  NaN
 2  NaN  NaN  NaN
 3  NaN  6.5  3.0,
 '',
 '',
      0    1    2
 0  1.0  6.5  3.0,
 '',
 'data.dropna(how="all")',
      0    1    2
 0  1.0  6.5  3.0
 1  1.0  NaN  NaN
 3  NaN  6.5  3.0,
 '',
 'df',
           0         1         2
 0 -0.791521       NaN       NaN
 1 -0.269018       NaN       NaN
 2 -0.469925       NaN  1.772218
 3  0.397856       NaN -0.042834
 4  0.918176  0.428277 -0.639774
 5 -1.191737  2.444555  0.108843
 6  0.166983  0.932011 -2.659369
 7  0.008653  1.319615 -0.197740,
 '',
 'df.dropna(thresh="2")',
           0         1         2
 2 -0.469925       NaN  1.772218
 3  0.397856       NaN -0.042834
 4  0.918176  0.428277 -0.639774
 5 -1.191737  2.444555  0.108843
 6  0.166983  0.932011 -2.659369
 7  0.008653  1.319615 -0.197740,
 '',
 'df.fillna(0)',
           0         1         2
 0 -0.791521  0.000000  0.000000
 1 -0.269018  0.000000  0.000000
 2 -0.469925  0.000000  1.772218
 3  0.397856  0.000000 -0.0428

# Data Transformations
## Removing Dupes

In [21]:
data = pandas.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 'k2': [1, 1, 2, 3, 3, 4, 4]})
data['v1'] = range(7)
v = [
    data, '','data.duplicated()',
    data.duplicated(), '','data.drop_duplicate()',
    data.drop_duplicates(), '','',
    data.drop_duplicates(['k1']), '','',
    data.drop_duplicates(['k1', 'k2'], keep='last')
]
v

[    k1  k2  v1
 0  one   1   0
 1  two   1   1
 2  one   2   2
 3  two   3   3
 4  one   3   4
 5  two   4   5
 6  two   4   6,
 '',
 'data.duplicated()',
 0    False
 1    False
 2    False
 3    False
 4    False
 5    False
 6    False
 dtype: bool,
 '',
 'data.drop_duplicate()',
     k1  k2  v1
 0  one   1   0
 1  two   1   1
 2  one   2   2
 3  two   3   3
 4  one   3   4
 5  two   4   5
 6  two   4   6,
 '',
 '',
     k1  k2  v1
 0  one   1   0
 1  two   1   1,
 '',
 '',
     k1  k2  v1
 0  one   1   0
 1  two   1   1
 2  one   2   2
 3  two   3   3
 4  one   3   4
 6  two   4   6]

## Transforming Data Using Functions

In [24]:
data = pandas.DataFrame(
    {'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]}
)
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
lowercase_lambda = data['food'].map(lambda x: meat_to_animal[x.lower()])
o = [
    data, '','',
    data['food'], '','lower_lambda',
    lowercase_lambda
]
o

[          food  ounces  animal
 0        bacon     4.0     pig
 1  pulled pork     3.0     pig
 2        bacon    12.0     pig
 3     Pastrami     6.0     cow
 4  corned beef     7.5     cow
 5        Bacon     8.0     pig
 6     pastrami     3.0     cow
 7    honey ham     5.0     pig
 8     nova lox     6.0  salmon,
 '',
 '',
 0          bacon
 1    pulled pork
 2          bacon
 3       Pastrami
 4    corned beef
 5          Bacon
 6       pastrami
 7      honey ham
 8       nova lox
 Name: food, dtype: object,
 '',
 'lower_lambda',
 0       pig
 1       pig
 2       pig
 3       cow
 4       cow
 5       pig
 6       cow
 7       pig
 8    salmon
 Name: food, dtype: object]

## Replacing Values

In [27]:
data = pandas.Series([1., -999., 2., -999., -1000., 3.])
o = [
    data.replace(-999, np.nan), '','',
    data.replace([-999, -1000], np.nan), '','',
    data.replace({-999: np.nan, -1000: 0})
]
o

[0       1.0
 1       NaN
 2       2.0
 3       NaN
 4   -1000.0
 5       3.0
 dtype: float64,
 '',
 '',
 0    1.0
 1    NaN
 2    2.0
 3    NaN
 4    NaN
 5    3.0
 dtype: float64,
 '',
 '',
 0    1.0
 1    NaN
 2    2.0
 3    NaN
 4    0.0
 5    3.0
 dtype: float64]

## Renaming Axis Indexes

In [29]:
data = pandas.DataFrame(np.arange(12).reshape((3, 4)),
                        index=['Ohio', 'Colorado', 'New York'],
                        columns=['one', 'two', 'three', 'four'])
transform = lambda x: x[:4].upper()
data.index = data.index.map(transform)
o = [
    data.index.map(transform), '','',
    data, '','data.rename(index=str.title, column=str.upper)',
    data.rename(index=str.title, columns=str.upper), '','data.rename(index={"NEW ": "TEXAS"}, columns: {"three":"foobar"})',
    data.rename(index={'NEW ': 'TEXAS'}, columns={'three': 'foobar'})
]
o

[Index(['OHIO', 'COLO', 'NEW '], dtype='object'),
 '',
 '',
       one  two  three  four
 OHIO    0    1      2     3
 COLO    4    5      6     7
 NEW     8    9     10    11,
 '',
 'data.rename(index=str.title, column=str.upper)',
       ONE  TWO  THREE  FOUR
 Ohio    0    1      2     3
 Colo    4    5      6     7
 New     8    9     10    11,
 '',
 'data.rename(index={"NEW ": "TEXAS"}, columns: {"three":"foobar"})',
        one  two  foobar  four
 OHIO     0    1       2     3
 COLO     4    5       6     7
 TEXAS    8    9      10    11]

## Discretization & Binning

In [47]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# .cut() actually does the discretization
cats = pandas.cut(ages, bins)
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
name_cats = pandas.cut(ages, bins, labels=group_names)
data = np.arange(16) + 1
discretize = [
    'categories',
    cats, '','.code',
    cats.codes, '','.categories',
    cats.categories, '','pandas.value_counts(cats)',
    pandas.value_counts(cats), '','pandas.cut(ages, bins, labels=group_names)',
    name_cats, '', 'data',
    data, '','pandas.cut(data, 4)',
    pandas.cut(data, 4, precision=1), '','',
    pandas.qcut(np.random.randn(1600), 4), '','',
    pandas.value_counts(pandas.qcut(np.random.randn(1000), 4)), '','',
    #pandas.qcut(data, [1,4,8,12,16])
]
discretize

['categories',
 [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
 Length: 12
 Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]],
 '',
 '.code',
 array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8),
 '',
 '.categories',
 IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
               closed='right',
               dtype='interval[int64]'),
 '',
 'pandas.value_counts(cats)',
 (18, 25]     5
 (25, 35]     3
 (35, 60]     3
 (60, 100]    1
 dtype: int64,
 '',
 'pandas.cut(ages, bins, labels=group_names)',
 ['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
 Length: 12
 Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior'],
 '',
 'data',
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]),
 '',
 'pandas.cut(data, 4)',
 [(1.0, 4.8], (1.0, 4.8], (1.0, 4.8], (1.0, 4.8], (4.8, 8.5], ...,

## Detecting & Filtering Outliers

In [51]:
data = pandas.DataFrame(np.random.randn(1000, 4))
col = data[2]
# cap data from -3 to 3
data[np.abs(data) > 3] = np.sign(data) * 3
o = [
   'data.describe()',
    data.describe(), '','col[np.abs(col) > 3]',
    col[np.abs(col) > 3], '','',
    data[(np.abs(data) > 3).any(1)], '','',

]
o

['data.describe()',
                  0            1            2            3
 count  1000.000000  1000.000000  1000.000000  1000.000000
 mean     -0.034751    -0.029958     0.006481     0.023851
 std       0.971793     0.976588     0.997318     0.990872
 min      -2.817455    -2.953818    -3.000000    -3.000000
 25%      -0.689653    -0.662738    -0.655261    -0.692755
 50%      -0.019004    -0.017435     0.055985     0.043285
 75%       0.643153     0.641272     0.685920     0.715960
 max       3.000000     2.986516     3.000000     2.706008,
 '',
 'col[np.abs(col) > 3]',
 Series([], Name: 2, dtype: float64),
 '',
 '',
 Empty DataFrame
 Columns: [0, 1, 2, 3]
 Index: [],
 '',
 '']

## Permutation & Random Sampling

In [None]:
df = pandas.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
o = [
        
]
o

## Computing Indicator/Dummy Variables

In [None]:
#

# String Manipulation
## String Object Methods

In [None]:
#

## Regular Expressions

In [None]:
#

## Vectorized String Functions in pandas

In [None]:
#

##