In [1]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns

# used for example for random
from numpy import *
# for matplot
%matplotlib inline

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one;
# the cells below rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
from IPython.display import display
from IPython.display import Image

Much of the programming work in data analysis and modeling is spent on data preparation:
loading, cleaning, transforming, and rearranging.

**=========================================================================================================================**

# Data Cleaning and Preparation

In [4]:
import numpy as np
import pandas as pd
# Remember the current row limit; the notebook's final code cell restores it.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
# Seed NumPy's global random generator so the random examples are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
# Print NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

## Handling Missing Data

In [5]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

In [7]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
# A 4x3 frame mixing a complete row, partially missing rows and an all-NA row.
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
# With no arguments, dropna() removes every row that contains at least one NA.
cleaned = data.dropna()
data
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [18]:
help(DataFrame.dropna)

Help on function dropna in module pandas.core.frame:

dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False)
    Return object with labels on given axis omitted where alternately any
    or all of the data are missing
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, or tuple/list thereof
        Pass tuple or list to drop on multiple axes
    how : {'any', 'all'}
        * any : if any NA values are present, drop that label
        * all : if all values are NA, drop that label
    thresh : int, default None
        int value : require that many non-NA values
    subset : array-like
        Labels along other axis to consider, e.g. if you are dropping rows
        these would be a list of columns to include
    inplace : boolean, default False
        If True, do operation inplace and return None.
    
    Returns
    -------
    dropped : DataFrame
    
    Examples
    --------
    >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.

In [19]:
# By default dropna() uses how='any' and drops every row containing an NA;
# how='all' drops only the rows in which every value is NA.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [20]:
data[4] = NA
data
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [25]:
df = pd.DataFrame(np.random.randn(7, 3))
df
df[1]

Unnamed: 0,0,1,2
0,0.476985,3.248944,-1.021228
1,-0.577087,0.124121,0.302614
2,0.523772,0.00094,1.34381
3,-0.713544,-0.831154,-2.370232
4,-1.860761,-0.860757,0.560145
5,-1.265934,0.119827,-1.063512
6,0.332883,-2.359419,-0.199543


0    3.248944
1    0.124121
2    0.000940
3   -0.831154
4   -0.860757
5    0.119827
6   -2.359419
Name: 1, dtype: float64

In [32]:
# Build a 7x3 frame of random values and knock out the top of columns 1 and 2.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
df.dropna()

# thresh=k keeps only the rows that contain at least k non-NA values.
# Row 1 is made entirely NA so that thresh=1 has something to drop.
# (NA is the numpy.nan alias already used above; the bare name `NaN` only
# resolved through `from numpy import *` and no longer exists in NumPy 2.0.)
df.iloc[1, 0] = NA
df
print('0=======================================================')
df.dropna(thresh=0)
print('1=======================================================')
df.dropna(thresh=1)
print('2=======================================================')
df.dropna(thresh=2)
print('3=======================================================')
df.dropna(thresh=3)

Unnamed: 0,0,1,2
0,1.920784,,
1,-0.6794,,
2,-1.213851,,-0.919242
3,-0.838827,,-0.557805
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701


Unnamed: 0,0,1,2
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701


Unnamed: 0,0,1,2
0,1.920784,,
1,,,
2,-1.213851,,-0.919242
3,-0.838827,,-0.557805
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701




Unnamed: 0,0,1,2
0,1.920784,,
1,,,
2,-1.213851,,-0.919242
3,-0.838827,,-0.557805
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701




Unnamed: 0,0,1,2
0,1.920784,,
2,-1.213851,,-0.919242
3,-0.838827,,-0.557805
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701




Unnamed: 0,0,1,2
2,-1.213851,,-0.919242
3,-0.838827,,-0.557805
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701




Unnamed: 0,0,1,2
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701


### Filling In Missing Data

In [33]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.920784,0.0,0.0
1,0.0,0.0,0.0
2,-1.213851,0.0,-0.919242
3,-0.838827,0.0,-0.557805
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701


In [34]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,1.920784,0.5,0.0
1,,0.5,0.0
2,-1.213851,0.5,-0.919242
3,-0.838827,0.5,-0.557805
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701


In [35]:
# inplace=True modifies df itself instead of producing a new frame;
# the call's result is bound to the throwaway name `_` and not used.
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,1.920784,0.0,0.0
1,0.0,0.0,0.0
2,-1.213851,0.0,-0.919242
3,-0.838827,0.0,-0.557805
4,-0.567455,-0.372642,-0.926557
5,1.755108,1.20981,1.270025
6,-0.974378,-0.634709,-0.395701


In [36]:
# A 6x3 frame whose columns 1 and 2 end in a run of missing values.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
# Forward fill: carry the last valid observation down each column.
# ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()
# limit=2 fills at most two consecutive missing values per gap.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.289436,-0.734297,-0.728505
1,0.838775,0.266893,0.721194
2,0.910983,,-1.413416
3,1.296608,,1.127481
4,-0.568363,,
5,-1.168634,,


Unnamed: 0,0,1,2
0,-0.289436,-0.734297,-0.728505
1,0.838775,0.266893,0.721194
2,0.910983,0.266893,-1.413416
3,1.296608,0.266893,1.127481
4,-0.568363,0.266893,1.127481
5,-1.168634,0.266893,1.127481


Unnamed: 0,0,1,2
0,-0.289436,-0.734297,-0.728505
1,0.838775,0.266893,0.721194
2,0.910983,0.266893,-1.413416
3,1.296608,0.266893,1.127481
4,-0.568363,,1.127481
5,-1.168634,,1.127481


In [38]:
data = pd.Series([1., NA, 3.5, NA, 7])
data
data.fillna(data.mean())

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## Data Transformation

So far in this chapter we’ve been concerned with rearranging data. Filtering, cleaning,
and other transformations are another class of important operations.

### Removing Duplicates

In [39]:
# Seven rows; the last two carry the same ('two', 4) combination.
data = pd.DataFrame(dict(
    k1=['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    k2=[1, 1, 2, 3, 3, 4, 4],
))
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


The DataFrame method 
```python
duplicated()
```
returns a boolean Series indicating whether each
row is a duplicate or not

Relatedly, 
```python
drop_duplicates()
```
returns a DataFrame containing only the rows for which the duplicated array is False:

In [40]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [41]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Both of these methods by default consider all of the columns; alternatively you can
specify any subset of them to detect duplicates. Suppose we had an additional column
of values and wanted to filter duplicates only based on the 'k1' column:

In [45]:
data['v1'] = range(7)
data
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


duplicated and drop_duplicates by default keep the first observed value combination.
Passing keep='last' will keep the last one instead (older pandas releases spelled this take_last=True):

In [48]:
data  # the rows labelled 5 and 6 hold the same ('k1', 'k2') pair
data.drop_duplicates(['k1', 'k2'])  # default: the first of the pair (row 5) is kept
data.drop_duplicates(['k1', 'k2'], keep='last')  # keep='last': row 6 is kept instead

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping
For many data sets, you may wish to perform some transformation based on the values
in an array, Series, or column in a DataFrame. Consider the following hypothetical data
collected about some kinds of meat:

In [49]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


Suppose you wanted to add a column indicating the type of animal that each food came
from. Let’s write down a mapping of each distinct meat type to the kind of animal.

The 
```python
map()
```
method on a Series accepts a function or dict-like object containing a mapping,
but here we have a small problem in that some of the meats above are capitalized and
others are not. Thus, we also need to convert each value to lower case:

In [51]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
meat_to_animal
data

{'bacon': 'pig',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon',
 'pastrami': 'cow',
 'pulled pork': 'pig'}

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [54]:
# The food names are capitalised inconsistently, so normalise them to lower
# case first; only then do they match the keys of meat_to_animal.
lowercased = data['food'].str.lower()
lowercased
data['animal'] = lowercased.map(meat_to_animal)
data

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [56]:
data['food'].map(lambda x: meat_to_animal[x.lower()])
data

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### !!! Using map() is a convenient way to perform element-wise transformations and other data cleaning-related operations.

### Replacing Values

Filling in missing data with the fillna method can be thought of as a special case of
more general value replacement. While map, as you’ve seen above, can be used to modify
a subset of values in an object, 

```python
replace()
```
provides a simpler and more flexible way to do
so. Let’s consider this Series.

To replace multiple values at once, pass a list of the values to replace, followed by
the substitute value.

In [57]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [58]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [59]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [62]:
data.replace([-999, -1000], [np.nan, 0])

#The argument passed can also be a dict:
data.replace({-999: np.nan, -1000: 0})


0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Renaming Axis Indexes

Like values in a Series, axis labels can be similarly transformed by a function or mapping
of some form to produce new, differently labeled objects. The axes can also be modified
in place without creating a new data structure. Here’s a simple example:

In [64]:
# 3x4 frame holding 0..11 row by row, labelled by state and by ordinal words.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [68]:
def transform(label):
    """Return the first four characters of ``label``, upper-cased."""
    head = label[:4]
    return head.upper()


# Index.map returns a new Index; `data` itself is left untouched here.
data.index.map(transform)
data

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [69]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


If you want to create a transformed version of a data set without modifying the original,
a useful method is 
```
rename()
```
Notably, rename can be used in conjunction with a dict-like object providing new values
for a subset of the axis labels.

rename saves having to copy the DataFrame manually and assign to its index and columns
attributes. Should you wish to modify a data set in place, pass 
```python
inplace=True
```

In [80]:
#help(DataFrame.rename)

In [78]:

data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [79]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [81]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning
Continuous data is often discretized or otherwise separated into “bins” for analysis.
<br><br>Suppose you have data about a group of people in a study, and you want to group them
into discrete age buckets.
<br><br>Let’s divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally 61 and older. To
do so, you have to use 
```python
cut()
```
, a function in pandas.

In [83]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [97]:
#help(pd.cut)

In [88]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

The object pandas returns is a special Categorical object. You can treat it like an array
of strings indicating the bin name; internally it contains a categories array specifying the
distinct category names along with a labeling for the ages data in the codes attribute.

In [92]:
cats.codes  # for each age, the integer position of the bin it fell into
cats.categories  # the bins themselves
# Number of ages per bin, most populated bin first. pd.value_counts() is
# deprecated; the Series method is its documented replacement.
pd.Series(cats).value_counts()

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

Consistent with mathematical notation for intervals, a parenthesis means that the side
is open while the square bracket means it is closed (inclusive). Which side is closed can
be changed by passing 
```python
right=False
```

In [93]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

You can also pass your own bin names by passing a list or array to the labels option:

In [94]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

If you pass cut an integer number of bins instead of explicit bin edges, it will compute
equal-length bins based on the minimum and maximum values in the data. Consider
the case of some uniformly distributed data chopped into fourths:

In [102]:
# Twenty draws from the uniform distribution on [0, 1).
data = np.random.rand(20)
data
# An integer bin count asks cut() for that many equal-width bins spanning the
# data's range; precision=2 limits the decimals shown in the bin labels.
data_cut = pd.cut(data, 4, precision=2)
data_cut
# pd.value_counts() is deprecated; count through a Series instead.
pd.Series(data_cut).value_counts()

array([ 0.0271,  0.6218,  0.3473,  0.2768,  0.0641,  0.3344,  0.0677,
        0.0038,  0.041 ,  0.349 ,  0.092 ,  0.5203,  0.2157,  0.9917,
        0.2672,  0.6802,  0.5262,  0.4399,  0.6616,  0.1348])

[(0.0028, 0.25], (0.5, 0.74], (0.25, 0.5], (0.25, 0.5], (0.0028, 0.25], ..., (0.5, 0.74], (0.5, 0.74], (0.25, 0.5], (0.5, 0.74], (0.0028, 0.25]]
Length: 20
Categories (4, interval[float64]): [(0.0028, 0.25] < (0.25, 0.5] < (0.5, 0.74] < (0.74, 0.99]]

(0.0028, 0.25]    8
(0.25, 0.5]       6
(0.5, 0.74]       5
(0.74, 0.99]      1
dtype: int64

A closely related function, 
```python
qcut()
```
, bins the data based on <font color = 'red'> **sample quantiles**</font>. Depending
on the distribution of the data, using cut will not usually result in each bin having the
same number of data points. Since qcut uses sample quantiles instead, by definition
you will obtain roughly equal-size bins:

In [112]:
data = np.arange(1, 11)  # the integers 1..10 (evenly spaced, not normally distributed)
cats = pd.qcut(data, 4)  # cut at the sample quartiles
cats
# pd.value_counts() is deprecated; count through a Series instead.
pd.Series(cats).value_counts()

[(0.999, 3.25], (0.999, 3.25], (0.999, 3.25], (3.25, 5.5], (3.25, 5.5], (5.5, 7.75], (5.5, 7.75], (7.75, 10.0], (7.75, 10.0], (7.75, 10.0]]
Categories (4, interval[float64]): [(0.999, 3.25] < (3.25, 5.5] < (5.5, 7.75] < (7.75, 10.0]]

(7.75, 10.0]     3
(0.999, 3.25]    3
(5.5, 7.75]      2
(3.25, 5.5]      2
dtype: int64

In [116]:
# As with cut, you can pass your own quantiles (numbers between 0 and 1, inclusive):
data
own = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
own
# pd.value_counts() is deprecated; count through a Series instead.
pd.Series(own).value_counts()


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

[(0.999, 1.9], (1.9, 5.5], (1.9, 5.5], (1.9, 5.5], (1.9, 5.5], (5.5, 9.1], (5.5, 9.1], (5.5, 9.1], (5.5, 9.1], (9.1, 10.0]]
Categories (4, interval[float64]): [(0.999, 1.9] < (1.9, 5.5] < (5.5, 9.1] < (9.1, 10.0]]

(5.5, 9.1]      4
(1.9, 5.5]      4
(9.1, 10.0]     1
(0.999, 1.9]    1
dtype: int64

In [104]:
help(pd.qcut)

Help on function qcut in module pandas.core.reshape.tile:

qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.
    
    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    

### Detecting and Filtering Outliers

Filtering or transforming outliers is largely a matter of applying array operations. Consider
a DataFrame with some normally distributed data:

In [162]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.head(2)
data.describe()

Unnamed: 0,0,1,2,3
0,-0.204176,-0.09348,-0.390978,0.653939
1,0.523357,2.935579,-2.040094,0.978969


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.032085,-0.009982,0.006094,0.010349
std,1.032679,1.013674,0.987844,1.051487
min,-3.035726,-2.955368,-3.042861,-3.226513
25%,-0.743056,-0.725008,-0.674743,-0.67013
50%,-0.037771,-0.002844,-0.017898,-0.007999
75%,0.69352,0.680148,0.687501,0.697741
max,3.101467,3.442431,3.843404,3.282536


In [163]:
# Suppose you wanted to find values in one of the columns exceeding three in magnitude:
col = data[3]
col[np.abs(col) > 3]  # np.abs gives the element-wise absolute value

115    3.225255
411   -3.169195
686   -3.226513
873    3.282536
Name: 3, dtype: float64

In [121]:
#help(np.abs)

In [123]:
# To select all rows having a value exceeding 3 or -3, use the any method on a
# boolean DataFrame. axis=1 reduces across the columns of each row; it must be
# passed by keyword, because pandas 2.0 no longer accepts it positionally.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
49,-3.018842,-0.298748,0.406954,0.183282
73,0.781753,-0.555434,-0.048478,-3.108915
204,-3.183867,1.050471,-1.042736,1.680374
251,-3.140963,-1.509976,-0.389818,-0.273253
416,1.090038,-0.848098,-3.194414,0.077839
446,0.003349,-0.011807,3.02372,-1.105312
475,0.452649,-3.481593,0.789944,1.737746
644,3.082067,-0.516982,0.251909,-0.029354
672,3.18994,0.070978,0.516982,-0.805171
836,-0.436479,0.901529,-3.044612,-1.19398


In [166]:
# Values can just as easily be set based on these criteria. Here is code to cap values outside the interval -3 to 3:

# Boolean-mask the frame: cells with |value| > 3 are kept and all others show
# as NaN. Displayed for the rows at positions 110-119.
data[np.abs(data) > 3].iloc[110:120]

# np.sign() gives -1 or 1 according to the sign of the original value, so each
# out-of-range cell is replaced by -3 or 3.
data[np.abs(data) > 3] = np.sign(data) * 3
data.iloc[110:120]
data.describe()

Unnamed: 0,0,1,2,3
110,,,,
111,,,,
112,,,,
113,,,,
114,,,,
115,,,,
116,,,,
117,,,,
118,,,,
119,,,,


Unnamed: 0,0,1,2,3
110,1.021659,2.549872,0.889216,-0.596632
111,-0.89962,0.869046,-0.621424,0.447392
112,0.307316,-0.397873,-1.158512,-1.79249
113,0.754757,1.338804,0.744632,0.677933
114,1.248428,-0.576074,0.25379,-2.298313
115,0.604347,0.126216,0.482631,3.0
116,1.172423,1.229151,0.381337,-0.563881
117,2.352632,-0.691585,-1.370095,1.729922
118,1.23727,-0.423721,-0.19736,-0.063511
119,1.507995,-0.358892,-0.09845,0.302474


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.03215,-0.010425,0.005293,0.010237
std,1.032272,1.012262,0.984789,1.048804
min,-3.0,-2.955368,-3.0,-3.0
25%,-0.743056,-0.725008,-0.674743,-0.67013
50%,-0.037771,-0.002844,-0.017898,-0.007999
75%,0.69352,0.680148,0.687501,0.697741
max,3.0,3.0,3.0,3.0


In [124]:
#The ufunc np.sign returns an array of 1 and -1 depending on the sign of the values.
help(np.sign)

Help on ufunc object:

sign = class ufunc(builtins.object)
 |  Functions that operate element by element on whole arrays.
 |  
 |  To see the documentation for a specific ufunc, use `info`.  For
 |  example, ``np.info(np.sin)``.  Because ufuncs are written in C
 |  (for speed) and linked into Python with NumPy's ufunc facility,
 |  Python's help() function finds this page whenever help() is called
 |  on a ufunc.
 |  
 |  A detailed explanation of ufuncs can be found in the docs for :ref:`ufuncs`.
 |  
 |  Calling ufuncs:
 |  
 |  op(*x[, out], where=True, **kwargs)
 |  Apply `op` to the arguments `*x` elementwise, broadcasting the arguments.
 |  
 |  The broadcasting rules are:
 |  
 |  * Dimensions of length 1 may be prepended to either array.
 |  * Arrays may be repeated along dimensions of length 1.
 |  
 |  Parameters
 |  ----------
 |  *x : array_like
 |      Input arrays.
 |  out : ndarray, None, or tuple of ndarray and None, optional
 |      Alternate array object(s) in which t

The ufunc 
```python
np.sign()
```
returns an array of 1 and -1 (and 0 for zeros) depending on the sign of the values.

In [167]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,-1.0,1.0
1,1.0,1.0,-1.0,1.0
2,-1.0,-1.0,-1.0,1.0
3,-1.0,1.0,1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0


### Permutation and Random Sampling

In [None]:
# A 5x4 frame holding 0..19 row by row, and a random ordering of its five
# row positions to reorder it with.
df = pd.DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(len(df))
sampler

In [None]:
df
df.take(sampler)

In [None]:
df.sample(n=3)

In [None]:
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

### Computing Indicator/Dummy Variables

In [None]:
# Six rows: a categorical 'key' column plus a running counter in 'data1'.
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b']}).assign(data1=range(6))
# One indicator column per distinct key value.
pd.get_dummies(df['key'])

In [None]:
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

In [None]:
mnames = ['movie_id', 'title', 'genres']
# The file has no header row and separates fields with '::'. A separator longer
# than one character is only supported by the Python parsing engine, so request
# it explicitly instead of triggering a ParserWarning and an implicit fallback.
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')
movies[:10]

In [None]:
# Each movie lists its genres as one '|'-separated string; flatten them all
# into a single list.
all_genres = []
for genre_field in movies.genres:
    all_genres.extend(genre_field.split('|'))
# Distinct genres in order of first appearance. pd.unique() is given an object
# ndarray because passing a plain Python list is deprecated.
genres = pd.unique(np.asarray(all_genres, dtype=object))

In [None]:
genres

In [None]:
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [None]:
gen = movies.genres[0]
gen.split('|')
dummies.columns.get_indexer(gen.split('|'))

In [None]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [None]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

In [None]:
np.random.seed(12345)
values = np.random.rand(10)
values
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

## String Manipulation

### String Object Methods

In [None]:
val = 'a,b,  guido'
val.split(',')

In [None]:
pieces = [x.strip() for x in val.split(',')]
pieces

In [None]:
first, second, third = pieces
first + '::' + second + '::' + third

In [None]:
'::'.join(pieces)

In [None]:
'guido' in val
val.index(',')
val.find(':')

In [None]:
# Unlike find(), which returns -1, index() raises ValueError when the substring
# is absent. Catch and show the error so this demonstration does not abort a
# Restart & Run All.
try:
    val.index(':')
except ValueError as exc:
    print('ValueError:', exc)

In [None]:
val.count(',')

In [None]:
val.replace(',', '::')
val.replace(',', '')

### Regular Expressions

In [None]:
import re
text = "foo    bar\t baz  \tqux"
# Split on runs of whitespace. The pattern is a raw string so that \s reaches
# the regex engine untouched; in a plain string it is an invalid escape sequence.
re.split(r'\s+', text)

In [None]:
# Compile the whitespace pattern once for reuse. It is written as a raw string
# because '\s' in a plain string is an invalid escape sequence.
regex = re.compile(r'\s+')
regex.split(text)

In [None]:
regex.findall(text)

In [None]:
# Four "name address" lines, each terminated by a newline.
text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# The pattern only lists upper-case letters; re.I (short for re.IGNORECASE)
# makes it match lower-case addresses too.
regex = re.compile(pattern, re.I)

In [None]:
regex.findall(text)

In [None]:
m = regex.search(text)
m
text[m.start():m.end()]

In [None]:
print(regex.match(text))

In [None]:
print(regex.sub('REDACTED', text))

In [None]:
# The same e-mail pattern with three capture groups: the part before '@', the
# part between '@' and the last dot, and the 2-4 letter suffix after it.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.I)

In [None]:
m = regex.match('wesm@bright.net')
m.groups()

In [None]:
regex.findall(text)

In [None]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

### Vectorized String Functions in pandas

In [None]:
# Name -> e-mail address, with one missing entry to show how the vectorised
# string methods treat NA.
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data
data.isnull()

In [None]:
data.str.contains('gmail')

In [None]:
pattern
data.str.findall(pattern, flags=re.IGNORECASE)

In [None]:
# Series.str.match() only reports whether each value matches (booleans), which
# leaves nothing for element access to retrieve afterwards. Take the captured
# groups from findall() instead: the first hit per value is a
# (username, domain, suffix) tuple, and the missing entry stays NaN.
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches

In [None]:
matches.str.get(1)
matches.str[0]

In [None]:
data.str[:5]

In [None]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS

## Conclusion

https://www.datacamp.com/community/tutorials/pandas-tutorial-dataframe-python

https://tryolabs.com/blog/2017/03/16/pandas-seaborn-a-guide-to-handle-visualize-data-elegantly/