In [15]:
# load libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import (
    OneHotEncoder,
    OrdinalEncoder
)
from feature_engine.selection import DropFeatures


# show every column when a wide frame is displayed (None removes the column cap).
# Use the public `pd.set_option` entry point; `pd.pandas` is not a documented
# part of the pandas API and only resolves as a side effect of pandas' own imports.
pd.set_option('display.max_columns', None)

In [2]:
# read the training workbook into a frame and preview its first five rows
frog_data = pd.read_excel(io='train.xlsx')
frog_data.head(n=5)

Unnamed: 0,Column1,gbifID,eventDate,Time,country,continent,stateProvince,decimalLatitude,decimalLongitude,species,occurrenceStatus,ppt_mean,soil_mean,tmax_mean,tmin_mean
0,0,2574007008,1/23/2020,01:38:00,Australia,Australia,New South Wales,-32.719457,152.159267,Litoria Fallax,1,96.0,118.71667,23.830004,13.908336
1,1,3457021422,3/14/2010,22:23:00,Costa Rica,Central America,Puntarenas,8.496999,-83.318613,Agalychnis Callidryas,0,231.51666,131.88333,31.841667,21.111668
2,2,1571195309,2014-11-04 19:51:00,,Costa Rica,Central America,Heredia,10.450801,-84.068659,Agalychnis Callidryas,0,279.2,51.7,31.341671,22.455004
3,3,2265778539,7/21/2018,21:24:00,Costa Rica,Central America,Puntarenas,9.756705,-84.613363,Agalychnis Callidryas,0,192.1,171.81667,31.043331,20.06167
4,4,3407962690,10/29/2021,13:57:00,Australia,Australia,Queensland,-26.714302,152.815096,Litoria Fallax,1,84.8,111.61667,25.933338,13.801668


In [8]:
# hold out 15% of the rows as a test set; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    frog_data.drop(columns='occurrenceStatus'),  # predictors: every column except the label
    frog_data['occurrenceStatus'],               # label column
    test_size=0.15,
    random_state=124,
)

### Feature Engineering CONFIG

In [12]:
# configuration for feature engineering: each constant is a list of column names

# --- groups consumed by the steps of `frog_pipe` ---
# numeric columns passed to the median imputer
MEDIAN_REPLACEMENT = [
    'ppt_mean',
    'soil_mean',
    'tmax_mean',
    'tmin_mean',
]
# categorical column passed to the one-hot encoder
ONE_HOT_ENCODE = ['continent']
# categorical column passed to the ordinal encoder
LABEL_ENCODE = ['country']
# columns removed by the final drop-features step
DROP_COLUMNS = [
    'Column1',
    'eventDate',
    'Time',
    'gbifID',
    'stateProvince',
    'decimalLatitude',
    'decimalLongitude',
    'species',
]

# --- groups not referenced by the pipeline defined in this section ---
# NOTE(review): presumably reserved for date / country clean-up steps
# (e.g. deriving a 'Month' column from 'eventDate') — confirm they are still needed.
REPLACE_DATE = ['eventDate']
REPLACE_COUNTRY = ['country']
CONVERT_DATE = ['eventDate']
EXTRACT_MONTH = ['eventDate']
FILL_MONTH = ['Month']

In [16]:
# create pipeline for processing data; the steps run in the order listed
frog_pipe = Pipeline(steps=[
    # 1. replace null values in the numeric columns with each column's median
    (
        'median_imputer',
        MeanMedianImputer(imputation_method='median', variables=MEDIAN_REPLACEMENT),
    ),
    # 2. one-hot encode the continents
    ('one_hot_encoder', OneHotEncoder(variables=ONE_HOT_ENCODE)),
    # 3. replace each country with an arbitrarily assigned integer code
    (
        'label_encode',
        OrdinalEncoder(encoding_method='arbitrary', variables=LABEL_ENCODE),
    ),
    # 4. remove the columns listed in DROP_COLUMNS
    ('drop_features', DropFeatures(features_to_drop=DROP_COLUMNS)),
])

In [17]:
# fit every pipeline step on the training split (nothing is transformed yet)
frog_pipe.fit(X=x_train, y=y_train)

In [23]:
# column names recorded when the pipeline was fitted, i.e. the training
# predictors as they looked before any step transformed them
frog_pipe.feature_names_in_

['Column1',
 'gbifID',
 'eventDate',
 'Time',
 'country',
 'continent',
 'stateProvince',
 'decimalLatitude',
 'decimalLongitude',
 'species',
 'ppt_mean',
 'soil_mean',
 'tmax_mean',
 'tmin_mean']

In [24]:
# display the pipeline's steps keyed by name to check each transformer's settings
frog_pipe.named_steps

{'median_imputer': MeanMedianImputer(variables=['ppt_mean', 'soil_mean', 'tmax_mean', 'tmin_mean']),
 'one_hot_encoder': OneHotEncoder(variables=['continent']),
 'label_encode': OrdinalEncoder(encoding_method='arbitrary', variables=['country']),
 'drop_features': DropFeatures(features_to_drop=['Column1', 'eventDate', 'Time', 'gbifID',
                                'stateProvince', 'decimalLatitude',
                                'decimalLongitude', 'species'])}

In [32]:
# run the fitted pipeline over the training predictors; `man` holds the transformed frame
man = frog_pipe.transform(X=x_train)

In [44]:
# inspect the transformed rows whose encoded country value equals 10
man.loc[man['country'].eq(10)]

Unnamed: 0,country,ppt_mean,soil_mean,tmax_mean,tmin_mean,continent_Australia,continent_Central America,continent_Africa,continent_unknown or invalid
131145,10,16.066668,3.433333,31.790003,14.610001,0,0,0,1
142904,10,16.066668,3.433333,31.790003,14.610001,0,0,0,1
115605,10,57.716667,25.216667,30.86667,14.325001,0,0,0,1
127570,10,33.066666,6.983333,29.611668,13.668335,0,0,0,1
115721,10,10.583333,0.0,21.725002,9.608335,0,0,0,1
115550,10,37.483334,9.433333,28.808332,12.188335,0,0,0,1
131121,10,16.066668,3.433333,31.790003,14.610001,0,0,0,1
115607,10,24.85,4.533333,30.071672,12.881669,0,0,0,1
115656,10,31.5,5.216667,28.59167,11.600001,0,0,0,1
115576,10,6.233333,0.0,30.110003,13.500001,0,0,0,1
