# 1. Discretization and Binning

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample ages to bucket, and the bin edges that delimit the four age groups.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

In [3]:
# Assign each age to the interval between two consecutive bin edges.
# Intervals are open on the left and closed on the right, e.g. (18, 25].
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [4]:
# Integer position of each age's interval within cats.categories.
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [5]:
# The interval bins themselves, exposed as an IntervalIndex.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [6]:
# Number of ages that fall into each bin. The top-level pd.value_counts()
# helper is deprecated in recent pandas, so go through Series.value_counts().
pd.Series(cats).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [7]:
# right=False flips which side of each bin is closed: intervals become
# [left, right) instead of the default (left, right].
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

## 1.1 Naming bucketized groups

In [8]:
# Replace the interval labels with one readable name per bin, in bin order.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

## 1.2 Bucketizing by min/max value

In [9]:
# 20 uniform random samples. No seed is set before this cell in the visible
# notebook, so the exact values (and bin edges) differ from run to run.
data = np.random.rand(20)

# Passing an integer instead of explicit edges asks for that many equal-width
# bins spanning the observed min..max; precision limits the label digits.
pd.cut(data, 4, precision=2)

[(0.26, 0.5], (0.74, 0.99], (0.018, 0.26], (0.26, 0.5], (0.5, 0.74], ..., (0.74, 0.99], (0.26, 0.5], (0.74, 0.99], (0.26, 0.5], (0.74, 0.99]]
Length: 20
Categories (4, interval[float64]): [(0.018, 0.26] < (0.26, 0.5] < (0.5, 0.74] < (0.74, 0.99]]

## 1.3 Equal size buckets via qcut()

In [10]:
# 1000 normally distributed samples (unseeded, so values vary per run).
data = np.random.randn(1000)

# qcut bins on sample quantiles rather than on value ranges; 4 requests
# quartiles, so every bin ends up with the same number of points.
cats = pd.qcut(data, 4)
cats

[(0.696, 3.097], (0.0133, 0.696], (0.0133, 0.696], (-3.58, -0.684], (0.696, 3.097], ..., (-0.684, 0.0133], (0.0133, 0.696], (0.696, 3.097], (0.0133, 0.696], (-0.684, 0.0133]]
Length: 1000
Categories (4, interval[float64]): [(-3.58, -0.684] < (-0.684, 0.0133] < (0.0133, 0.696] < (0.696, 3.097]]

In [11]:
# Confirm the quartile bins are equally populated. The top-level
# pd.value_counts() helper is deprecated in recent pandas, so go through
# Series.value_counts() instead.
pd.Series(cats).value_counts()

(-3.58, -0.684]     250
(-0.684, 0.0133]    250
(0.0133, 0.696]     250
(0.696, 3.097]      250
dtype: int64

## 1.4 Custom quantiles

In [12]:
# Pass explicit quantile edges (numbers between 0 and 1) instead of a bin
# count: the buckets hold the bottom 10%, next 40%, next 40% and top 10%.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

[(1.253, 3.097], (0.0133, 1.253], (0.0133, 1.253], (-1.299, 0.0133], (1.253, 3.097], ..., (-1.299, 0.0133], (0.0133, 1.253], (0.0133, 1.253], (0.0133, 1.253], (-1.299, 0.0133]]
Length: 1000
Categories (4, interval[float64]): [(-3.58, -1.299] < (-1.299, 0.0133] < (0.0133, 1.253] < (1.253, 3.097]]

# 2. Detecting and Filtering Outliers

In [13]:
# 1000 x 4 frame of normally distributed noise (unseeded); describe()
# summarises each column so extreme min/max values are easy to spot.
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.015296,0.034508,0.01246,0.049196
std,1.023661,0.990086,1.032473,1.019041
min,-3.828131,-3.55985,-3.499336,-3.043594
25%,-0.658401,-0.598636,-0.666979,-0.617029
50%,0.016474,0.027209,-0.036962,0.027012
75%,0.682193,0.700943,0.741667,0.746015
max,3.422943,2.810374,3.423835,2.989189


In [14]:
# Restrict the search to a single column: keep only the entries of
# column 2 whose magnitude is greater than 3.
column = data[2]
column[column.abs() > 3]

30    -3.499336
194    3.104927
209    3.303317
316    3.423835
432   -3.067738
474    3.243458
Name: 2, dtype: float64

In [15]:
# Select every row that has at least one value with magnitude above 3 in any
# column. axis is passed by keyword because DataFrame.any() no longer accepts
# it positionally in pandas 2.x (a bare `.any(1)` raises TypeError there).
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
30,0.045529,0.257113,-3.499336,-0.069027
165,3.422943,-0.527333,0.981619,-0.830064
194,-0.407177,0.3009,3.104927,1.002028
209,-0.293882,2.193524,3.303317,-0.14664
316,0.652863,0.013865,3.423835,-0.306621
390,0.326917,-3.55985,-1.987751,2.246553
432,-0.766271,1.038995,-3.067738,-0.910965
471,-3.202119,0.10201,-0.536965,-0.875858
474,-0.480586,-0.633194,3.243458,0.807053
588,-0.418538,-3.232045,0.019281,-1.21827


## 2.1 Assigning some constant value to outliers

In [16]:
# Cap outliers in place: every value with magnitude above 3 is replaced by 3
# carrying the original sign, so the whole frame ends up within [-3, 3].
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.013876,0.035299,0.011951,0.049239
std,1.016447,0.987467,1.0274,1.01891
min,-3.0,-3.0,-3.0,-3.0
25%,-0.658401,-0.598636,-0.666979,-0.617029
50%,0.016474,0.027209,-0.036962,0.027012
75%,0.682193,0.700943,0.741667,0.746015
max,3.0,2.810374,3.0,2.989189


In [17]:
# np.sign gives -1.0 for negative values and 1.0 for positive values
# (and 0.0 for exact zeros); shown here for the first five rows.
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,1.0,-1.0,1.0,-1.0
2,-1.0,1.0,1.0,1.0
3,1.0,-1.0,-1.0,1.0
4,1.0,-1.0,1.0,1.0


# 3. Permutation and Random Sampling
* Permuting = randomly reordering

## 3.1 Permutation

In [18]:
# 5-row, 4-column frame holding the integers 0..19 in row-major order.
dataframe = pd.DataFrame(np.arange(20).reshape(5, 4))

# A random ordering of the frame's row positions.
sampler = np.random.permutation(len(dataframe))
sampler

array([4, 2, 0, 3, 1])

In [19]:
# The frame in its original row order, for comparison with the permuted view.
dataframe

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [20]:
# Reorder the rows by position using the random permutation; shuffling rows
# like this is handy, e.g. before model training.
dataframe.iloc[sampler]

Unnamed: 0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15
1,4,5,6,7


## 3.2 Random Sampling

In [21]:
# Random sample of 3 rows WITHOUT replacement: no row is picked twice.
dataframe.sample(n=3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11


In [22]:
# Sampling WITH replacement (replace=True): the same element may be drawn
# several times, which is what allows n=10 draws from only 5 values.

choices = pd.Series([5, -7, 1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

4    4
3    6
2    1
0    5
2    1
1   -7
4    4
1   -7
0    5
0    5
dtype: int64

# 4. Computing Indicator/Dummy Variables

In [23]:
# Small frame whose 'key' column has three distinct labels; it is the input
# for the indicator (dummy) columns built next.
dataframe = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                         'data1': np.arange(6)})
dataframe

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [24]:
# One indicator column per distinct key. dtype=int keeps the 0/1 encoding on
# every pandas version (pandas 2.x would otherwise return True/False).
pd.get_dummies(dataframe['key'], dtype=int)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


## 4.1 Prefixing dummy columns

In [25]:
# prefix= prepends 'key_' to every indicator column so the names stay
# unambiguous once joined back onto the remaining columns. dtype=int keeps
# the 0/1 encoding on every pandas version (2.x defaults to booleans).
dummies = pd.get_dummies(dataframe['key'], prefix='key', dtype=int)

# Keep 'data1' as a one-column frame ([['data1']]) so .join() can align the
# indicator columns to it by index.
df_with_dummies = dataframe[['data1']].join(dummies)
df_with_dummies

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [26]:
# Fix NumPy's global random seed so the ten uniform draws below are
# reproducible. This only affects draws made after this point.
np.random.seed(12345)

values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [27]:
# Combine cut() with get_dummies(): bucket the values into five bins of
# width 0.2 covering (0, 1], then emit one indicator column per bin.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

# dtype=int keeps the 0/1 encoding on every pandas version (pandas 2.x would
# otherwise return True/False).
pd.get_dummies(pd.cut(values, bins), dtype=int)

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0
