1. Imputation

In [None]:
# Maximum tolerated share of missing values, applied first per column and then per row.
threshold = 0.7

# Keep only the columns whose fraction of missing cells is below the threshold.
column_missing_rate = data.isna().mean()
data = data[data.columns[column_missing_rate < threshold]]

# On the already column-filtered frame, keep only the rows whose fraction of
# missing cells is below the same threshold.
row_missing_rate = data.isna().mean(axis=1)
data = data.loc[row_missing_rate < threshold]

In [None]:
# Two alternative imputation strategies, both computed from the same un-imputed frame.

#Filling missing values with medians of the columns
# This has to be derived BEFORE the zero fill below: once every gap is 0 there is
# nothing left for the medians to fill. numeric_only=True restricts the medians to
# numeric columns, so non-numeric columns keep their gaps instead of raising.
data_median_filled = data.fillna(data.median(numeric_only=True))

#Filling all missing values with 0
data = data.fillna(0)

In [None]:
#Max fill function for categorical columns
# The filled column is assigned back explicitly. Calling fillna(inplace=True) on
# data['column_name'] operates on a selection and is not guaranteed to update `data`.
most_frequent_value = data['column_name'].value_counts().idxmax()
data['column_name'] = data['column_name'].fillna(most_frequent_value)
# Replacing the missing values with the maximum occurred value in a column is a good
# option for handling categorical columns. But if you think the values in the column
# are distributed uniformly and there is not a dominant value, imputing a category
# like “Other” might be more sensible.

In [None]:
#Dropping the outlier rows with standard deviation
# Rows are kept only when 'column' lies strictly inside mean +/- factor * std.
factor = 3
column_mean = data['column'].mean()
column_std = data['column'].std()
upper_lim = column_mean + column_std * factor
lower_lim = column_mean - column_std * factor

below_upper = data['column'].lt(upper_lim)
above_lower = data['column'].gt(lower_lim)
data = data[below_upper & above_lower]

In [None]:
#Dropping the outlier rows with Percentiles
# Rows are kept only when 'column' lies strictly between its 5th and 95th percentiles.
column_values = data['column']
upper_lim = column_values.quantile(.95)
lower_lim = column_values.quantile(.05)

inside_band = column_values.lt(upper_lim) & column_values.gt(lower_lim)
data = data[inside_band]

In [None]:
#Capping the outlier rows with Percentiles
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)
# Values above the 95th percentile are pulled down to it and values below the 5th
# percentile are pulled up to it; no rows are removed. The limits are applied to
# data['column'], the same frame and column they were computed from.
data['column'] = data['column'].clip(lower=lower_lim, upper=upper_lim)

#### Binning can be applied to both categorical and numerical data:
#####  Numerical Binning Example
Value      Bin       
0-30   ->  Low       
31-70  ->  Mid       
71-100 ->  High
#####  Categorical Binning Example
Value      Bin       
Spain  ->  Europe      
Italy  ->  Europe       
Chile  ->  South America
Brazil ->  South America

Every time you bin something, you sacrifice information and make your data more regularized.

In [None]:
#Numerical Binning Example
# include_lowest=True makes the first bin contain its left edge, so a value of
# exactly 0 is labelled "Low" instead of falling outside every bin.
data['bin'] = pd.cut(
    data['value'],
    bins=[0, 30, 70, 100],
    labels=["Low", "Mid", "High"],
    include_lowest=True,
)
# Example result:
#    value   bin
# 0      2   Low
# 1     45   Mid
# 2      7   Low
# 3     85  High
# 4     28   Low
#Categorical Binning Example
# Example input:
#      Country
# 0      Spain
# 1      Chile
# 2  Australia
# 3      Italy
# 4     Brazil

# regex=False matches the country names as literal text; na=False turns a missing
# country into a plain False so every condition stays a boolean mask and the row
# falls through to the default label.
conditions = [
    data['Country'].str.contains('Spain', regex=False, na=False),
    data['Country'].str.contains('Italy', regex=False, na=False),
    data['Country'].str.contains('Chile', regex=False, na=False),
    data['Country'].str.contains('Brazil', regex=False, na=False)]

# choices[i] is the label assigned where conditions[i] holds.
choices = ['Europe', 'Europe', 'South America', 'South America']

data['Continent'] = np.select(conditions, choices, default='Other')
# Example result:
#      Country      Continent
# 0      Spain         Europe
# 1      Chile  South America
# 2  Australia          Other
# 3      Italy         Europe
# 4     Brazil  South America

## Log Transform

In [None]:
#Log Transform Example
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})

# Plain log(x + 1): the negative entries of this sample have no real logarithm
# after the +1 shift, so they come out as NaN.
data['log+1'] = np.log(data['value'] + 1)

#Negative Values Handling
#Note that the values are different
# Shifting by the column minimum maps the smallest value to 1 (log 0.0) and keeps
# every argument positive.
data['log'] = np.log(data['value'].sub(data['value'].min()).add(1))
'''   value  log(x+1)  log(x-min(x)+1)
0      2   1.09861          3.25810
1     45   3.82864          4.23411
2    -23       nan          0.00000
3     85   4.45435          4.69135
4     28   3.36730          3.95124
5      2   1.09861          3.25810
6     35   3.58352          4.07754
7    -12       nan          2.48491'''

### One-hot encoding 
This method spreads the values of a column across multiple flag columns and assigns 0 or 1 to each of them. These binary values express the relationship between the grouped column and the encoded column.

Use the `get_dummies` function of pandas, which maps each distinct value in a column to its own flag column.

In [None]:
# One flag column per distinct value found in 'column'.
encoded_columns = pd.get_dummies(data['column'])

# Attach the flag columns, then remove the source column they were derived from.
data = data.join(encoded_columns)
data = data.drop(columns='column')

#### Grouping Operations

In [None]:
#  Categorical Column Grouping  - first option is to select the label with the highest frequency. In other words, this is the max operation for categorical columns, but ordinary max functions generally do not return this value, you need to use a custom function for this purpose.
def most_frequent_label(values):
    """Return the most frequent non-missing value of ``values``.

    ``value_counts`` orders its result by descending count, so the first index
    entry is the most frequent label. If ``values`` holds no countable entries,
    NaN is returned instead of raising IndexError on the empty result.
    """
    counts = values.value_counts()
    return counts.index[0] if len(counts) else float('nan')


data.groupby('id').agg(most_frequent_label)

In [None]:
# Second option is to make a pivot table. This approach resembles the encoding method
# in the preceding step, with one difference: instead of binary notation, each cell
# holds an aggregate of the values shared by the grouped and encoded columns. This is
# a good option if you aim to go beyond binary flag columns and merge multiple
# features into aggregated features, which are more informative.

#Pivot table Pandas Example
# The aggregation is named by the string 'sum' rather than by passing np.sum;
# combinations with no rows are filled with 0.
data.pivot_table(
    index='column_to_group',
    columns='column_to_encode',
    values='aggregation_column',
    aggfunc='sum',
    fill_value=0,
)

#### Numerical Column Grouping
In most cases, numerical columns are aggregated with the sum and mean functions.

In [None]:
#sum_cols: List of columns to sum
#mean_cols: List of columns to average
grouped = data.groupby('column_to_group')

# Per-group totals; the suffix records which aggregation produced each column.
sums = grouped[sum_cols].sum()
sums = sums.add_suffix('_sum')

# Per-group means, suffixed the same way.
avgs = grouped[mean_cols].mean()
avgs = avgs.add_suffix('_avg')

# Place both aggregates side by side in one frame.
new_df = pd.concat([sums, avgs], axis='columns')

### Feature Split

In [None]:
# Bracket access is used for the 'name' column so the lookup cannot be confused
# with an attribute of the frame itself.
data['name']
# Example input:
# 0  Luther N. Gonzalez
# 1    Charles M. Young
# 2        Terry Lawson
# 3       Kristen White
# 4      Thomas Logsdon

#Extracting first names
# .str[0] takes the first token of each split list and leaves missing names as NaN,
# whereas indexing inside a mapped lambda raises on them.
data['name'].str.split(" ").str[0]
# 0     Luther
# 1    Charles
# 2      Terry
# 3    Kristen
# 4     Thomas

#Extracting last names
data['name'].str.split(" ").str[-1]
# 0    Gonzalez
# 1       Young
# 2      Lawson
# 3       White
# 4     Logsdon

In [None]:
#String extraction example
data['title'].head()
# Example input:
# 0                      Toy Story (1995)
# 1                        Jumanji (1995)
# 2               Grumpier Old Men (1995)
# 3              Waiting to Exhale (1995)
# 4    Father of the Bride Part II (1995)

# Capture the four digits of a parenthesised group at the end of the title.
# Anchoring on the end keeps an earlier parenthesised part of a title from being
# returned instead of the year, which splitting on the first "(" would do.
# Titles without such a group yield NaN.
data['title'].str.extract(r'\((\d{4})\)\s*$', expand=False)
# 0    1995
# 1    1995
# 2    1995
# 3    1995
# 4    1995

In [3]:
# Five sample movie titles held in ONE comma-separated string (not a list of strings).
l = (
    'Toy Story (1995), '
    'Jumanji (1995), '
    'Grumpier Old Men (1995), '
    'Waiting to Exhale (1995),  '
    'Father of the Bride Part II (1995)'
)

In [None]:
# Assumes `l` is the comma-separated title string from the previous cell. The
# built-in str.split accepts neither `n=` nor `expand=`, so the titles are first
# put into a pandas Series and the split is done through the .str accessor.
titles = pd.Series(l.split(",")).str.strip()
l2 = titles.str.split("(", n=1, expand=True)[1].str.split(")", n=1, expand=True)

### Scaling      
      two common ways of scaling:

##### Normalization      between 0 and 1 
$$X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}}$$
##### Standardization (or z-score normalization) scales the values while taking into account standard deviation
$$z = \frac{x - \mu}{\sigma}$$ where $\mu$ is the mean and $\sigma$ is the standard deviation of the column.

In [None]:
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})

# Min-max scaling: subtract the column minimum and divide by the column range,
# so the smallest value becomes 0 and the largest becomes 1.
data['normalized'] = data['value'].pipe(
    lambda col: (col - col.min()) / (col.max() - col.min())
)
'''   value  normalized
0      2        0.23
1     45        0.63
2    -23        0.00
3     85        1.00
4     28        0.47
5      2        0.23
6     35        0.54
7    -12        0.10'''

In [None]:
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})

# z-score: centre on the column mean, then divide by the column standard deviation.
data['standardized'] = data['value'].pipe(
    lambda col: (col - col.mean()) / col.std()
)
'''   value  standardized
0      2         -0.52
1     45          0.70
2    -23         -1.23
3     85          1.84
4     28          0.22
5      2         -0.52
6     35          0.42
7    -12         -0.92'''

In [None]:
l = [1, 5 , 60, 44, 11, -7, -46, 55]


def norm(n, l, verbose=True):
    """Min-max normalize ``n`` against the values in ``l``.

    Args:
        n: The value to scale.
        l: Non-empty collection of numbers that defines the minimum and maximum.
        verbose: When true (the default), print the intermediate values.

    Returns:
        ``(n - min(l)) / (max(l) - min(l))``, or ``0.0`` when every value in
        ``l`` is identical and the range is therefore zero.

    Raises:
        ValueError: If ``l`` is empty (raised by ``min``).
    """
    lmin = min(l)
    lmax = max(l)
    up = n - lmin
    down = lmax - lmin
    # A constant list has zero range; return 0.0 rather than dividing by zero.
    res = up / down if down != 0 else 0.0
    if verbose:
        print('Lmin and lmax are: ', lmin, lmax)
        print('Up: ', up)
        print('down: ', down)
        print('res = ', res)
    return res


# Scale every element of the list against the list's own minimum and maximum.
j = [norm(n, l) for n in l]

### Extracting Date

In [None]:
from datetime import date

data = pd.DataFrame({'date':
['01-01-2017',
'04-12-2008',
'23-06-1988',
'25-08-1999',
'20-02-1993',
]})

# Read "today" once so every derived column is measured against the same day,
# even if the cell happens to run across midnight.
today = date.today()

#Transform string to date
data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

#Extracting Year
data['year'] = data['date'].dt.year

#Extracting Month
data['month'] = data['date'].dt.month

#Extracting passed years since the date
# Difference of calendar years only; the month and day are not taken into account.
data['passed_years'] = today.year - data['date'].dt.year

#Extracting passed months since the date
# Difference of calendar months only; the day of the month is not taken into account.
data['passed_months'] = (today.year - data['date'].dt.year) * 12 + today.month - data['date'].dt.month

#Extracting the weekday name of the date
data['day_name'] = data['date'].dt.day_name()

# Example output (passed_years and passed_months depend on the day the cell is run):
#    date    year  month  passed_years  passed_months   day_name
# 0 2017-01-01  2017      1             2             26     Sunday
# 1 2008-12-04  2008     12            11            123   Thursday
# 2 1988-06-23  1988      6            31            369   Thursday
# 3 1999-08-25  1999      8            20            235  Wednesday
# 4 1993-02-20  1993      2            26            313   Saturday