In [1]:
# load libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import (
    OneHotEncoder,
    OrdinalEncoder
)
from feature_engine.selection import DropFeatures

import feature_engineering_functions as fef


# show every column when a DataFrame is displayed (None removes the column limit);
# call the public pd.set_option directly instead of going through the pd.pandas self-reference
pd.set_option('display.max_columns', None)

In [18]:
# load dataset
# Read the training workbook into `frog`; `frog_data` is a second name bound to
# the very same DataFrame object (an alias, not a copy).
frog = pd.read_excel(io='train.xlsx')
frog_data = frog

In [19]:
# preview the first three rows; `frog_data` is already bound to `frog` in the
# load cell, so it does not need to be rebound here
frog_data.head(3)

Unnamed: 0,Column1,gbifID,eventDate,Time,country,continent,stateProvince,decimalLatitude,decimalLongitude,species,occurrenceStatus,ppt_mean,soil_mean,tmax_mean,tmin_mean
0,0,2574007008,1/23/2020,01:38:00,Australia,Australia,New South Wales,-32.719457,152.159267,Litoria Fallax,1,96.0,118.71667,23.830004,13.908336
1,1,3457021422,3/14/2010,22:23:00,Costa Rica,Central America,Puntarenas,8.496999,-83.318613,Agalychnis Callidryas,0,231.51666,131.88333,31.841667,21.111668
2,2,1571195309,2014-11-04 19:51:00,,Costa Rica,Central America,Heredia,10.450801,-84.068659,Agalychnis Callidryas,0,279.2,51.7,31.341671,22.455004


In [20]:
# split data into train_set and test_set
# Features are every column except the target `occurrenceStatus`; 15% of the rows
# are held out for testing, with a fixed random_state for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    frog_data.drop(columns='occurrenceStatus'),
    frog_data['occurrenceStatus'],
    test_size=0.15,
    random_state=124,
)

### Feature Engineering CONFIG

In [21]:
# configuration for feature engineering
# Each constant below is a list of column names.

# --- value replacement: columns passed to the ValueReplacer steps ---
REPLACER_DATE = ["eventDate"]
REPLACE_COUNTRY = ["country"]
REPLACE_CONTINENT = ["continent"]

# --- imputation: numeric columns passed to the median imputer ---
MEDIAN_REPLACEMENT = [
    "ppt_mean",
    "soil_mean",
    "tmax_mean",
    "tmin_mean",
]

# --- categorical encoding ---
ONE_HOT_ENCODE = ["continent"]
LABEL_ENCODE = ["country"]

# --- date handling ---
# NOTE(review): these three are not referenced by the pipeline cell visible in
# this part of the notebook -- confirm whether they are used further down.
CONVERT_DATE = ["eventDate"]
EXTRACT_MONTH = ["eventDate"]
FILL_MONTH = ["Month"]

# --- columns removed by the final drop step of the pipeline ---
DROP_COLUMNS = [
    "Column1",
    "eventDate",
    "Time",
    "gbifID",
    "stateProvince",
    "decimalLatitude",
    "decimalLongitude",
    "species",
]

In [22]:
 # create pipeline for processing data
 # Steps are applied in the order listed: three value replacements, median
 # imputation, one-hot encoding, ordinal encoding, then the column drop.
frog_pipe = Pipeline([
                    # replace value in column with another value
                    # NOTE(review): the long run of '#' is presumably how an overflowing
                    # spreadsheet date cell was exported -- confirm against the raw file.
                    # NOTE(review): 'eventDate' is also listed in DROP_COLUMNS, so the
                    # replaced dates never reach the pipeline output -- confirm this step
                    # is still needed.
                ('date_replacer', fef.ValueReplacer(
                    variables = REPLACER_DATE, original_value = '###############################################################################################################################################################################################################################################################', 
                    replacer = '8/15/1973'
                )),
                    # replace value in column with another value
                ('country_replacer', fef.ValueReplacer(
                    variables = REPLACE_COUNTRY, original_value = 'unknown or invalid', replacer = 'Angola'
                )),
                    # replace value in column with another value
                    # NOTE(review): the step name 'countinent_replacer' looks like a typo for
                    # 'continent_replacer'; renaming it would change the key used by
                    # named_steps / set_params, so it is left as-is here.
                ('countinent_replacer', fef.ValueReplacer(
                    variables = REPLACE_CONTINENT, original_value = 'unknown or invalid', replacer = 'Africa'
                )),
                    # median imputer should replace all null values with median of each column
                ('median_imputer', MeanMedianImputer(
                    imputation_method = 'median', variables = MEDIAN_REPLACEMENT
                )),
                    # one-hot encode the continents
                ('one_hot_encoder', OneHotEncoder(
                    variables = ONE_HOT_ENCODE
                )),
                    # encode each country as an integer code ('arbitrary' encoding method)
                ('label_encode', OrdinalEncoder(
                    encoding_method = 'arbitrary', variables =  LABEL_ENCODE
                )),
                    # remove the columns listed in DROP_COLUMNS from the output
                ('drop_features', DropFeatures(
                    features_to_drop = DROP_COLUMNS
                ))
])

In [23]:
# fit the pipeline on the training set only (x_train with its target y_train)
frog_pipe.fit(X=x_train, y=y_train)

In [24]:
# check transformed data: run the fitted pipeline over the training features
man = frog_pipe.transform(X=x_train)

In [25]:
# display the transformed training features produced by the pipeline
man

Unnamed: 0,country,ppt_mean,soil_mean,tmax_mean,tmin_mean,continent_Australia,continent_Central America,continent_Africa
47843,0,64.600000,47.416668,22.484999,10.981668,1,0,0
58175,0,72.133330,61.533333,23.651667,12.553335,1,0,0
77239,0,64.600000,47.416668,22.484999,10.981668,1,0,0
135855,0,64.600000,47.416668,22.484999,10.981668,1,0,0
106425,0,105.083336,113.766670,25.160004,12.336668,1,0,0
...,...,...,...,...,...,...,...,...
10702,0,67.650000,43.233334,24.130001,12.366668,1,0,0
49436,0,35.233334,9.950000,22.295000,11.506667,1,0,0
5652,0,64.600000,47.416668,22.484999,10.981668,1,0,0
90247,0,96.266670,100.433334,24.583334,13.163334,1,0,0
