In [1]:
import pandas as pd
import numpy as np
from collections import OrderedDict

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import Pool, CatBoostClassifier

In [2]:
def read_and_preprocess(filepath):
    """Load one CSV file and split it into features, target and categorical indices.

    Parameters
    ----------
    filepath : str or path-like
        CSV file whose first column is used as the row index.

    Returns
    -------
    features : pandas.DataFrame
        Every column left after the unused ones and ``extinct`` are removed.
    target : pandas.Series
        The ``extinct`` column.
    categorical_idx : list of int
        Positions (within ``features.columns``) of the categorical features.
        No column is listed as continuous, so this covers every feature.
    """
    unused_columns = ["Phylum", "species", "occurrences", "NoSpecies",
                      "C_Cnumeric", "SC_Numeric", "MaxD_Numeric", "System_Numeric"]

    table = pd.read_csv(filepath, index_col=0)

    # Keep the original index values as a column, then switch to a plain
    # 0..n-1 index.
    table["species"] = table.index
    table.index = range(len(table))

    # Integer-encode the Phylum values into a new column.
    phylum_labels = table["Phylum"].tolist()
    table["Phylum_Numeric"] = LabelEncoder().fit_transform(phylum_labels)

    # Discard the columns that are not used for modelling.
    table = table.drop(columns=unused_columns)

    # Separate the label from the predictors.
    target = table["extinct"]
    features = table.drop(columns=["extinct"])

    # Nothing is declared continuous, hence every feature counts as categorical.
    continuous_cols = []
    categorical_cols = features.columns.drop(continuous_cols).tolist()

    # Translate the categorical column names into positional indices.
    column_order = features.columns.tolist()
    categorical_idx = [column_order.index(name) for name in categorical_cols]

    return features, target, categorical_idx

In [3]:
def model_me(features, target, cat_idx, n_splits=5, random_state=0):
    """Cross-validate a CatBoost classifier and collect per-split results.

    Parameters
    ----------
    features : pandas.DataFrame
        Predictor columns, one row per sample.
    target : pandas.Series or array-like
        Class labels, in the same row order as ``features``.
    cat_idx : list of int
        Positional indices of the categorical columns in ``features``.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 0
        Seed used for both the fold shuffling and the CatBoost model.

    Returns
    -------
    cv_scores : pandas.DataFrame
        One row per split (``Split1`` ...), single column ``AUC``.
    feature_importances : pandas.DataFrame
        One row per split, one column per feature.
    """
    # Work on a plain array so that the fold indices, which are positions,
    # select the same rows as features.iloc regardless of the target's index.
    target_values = np.asarray(target)

    # create cross-validation instance
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # create holders for scores and feature importances
    cv_scores = []
    feature_importances = []

    # loop over different validation splits and save results
    for train_idx, test_idx in cv.split(features, target_values):

        # Gradient boosting model instance
        model = CatBoostClassifier(loss_function="Logloss", random_seed=random_state)

        target_train = target_values[train_idx]
        target_test = target_values[test_idx]

        # Create Pool data classes for train/test
        pool_train = Pool(features.iloc[train_idx, :], target_train, cat_features=cat_idx)
        pool_test = Pool(features.iloc[test_idx, :], target_test, cat_features=cat_idx)

        # Train model
        model.fit(pool_train, verbose=False)

        # save score from the individual split (probability of the positive class)
        cv_scores.append(roc_auc_score(target_test, model.predict_proba(pool_test)[:, 1]))

        # save feature importances from the individual split
        feature_importances.append(model.feature_importances_)

    # Label rows by the folds actually produced instead of a fixed list of five.
    split_labels = [f"Split{k}" for k in range(1, len(cv_scores) + 1)]

    cv_scores = pd.DataFrame(np.array(cv_scores),
                             index=split_labels,
                             columns=["AUC"])

    feature_importances = pd.DataFrame(np.array(feature_importances),
                                       index=split_labels,
                                       columns=features.columns)

    return cv_scores, feature_importances

In [4]:
def RFE(filepath):
    """Leave-one-feature-out evaluation for a single data file.

    Each feature is removed in turn from the full feature set (the removals
    are not cumulative), the model is cross-validated on the remaining
    columns, and the mean results over the splits are recorded.

    Parameters
    ----------
    filepath : str or path-like
        CSV file passed on to ``read_and_preprocess``.

    Returns
    -------
    collections.OrderedDict
        Maps each removed feature name to an ``OrderedDict`` with keys
        ``"AUC"`` (mean of the per-split AUC frame) and ``"FI"`` (mean of the
        per-split feature-importance frame).
    """
    features, target, cat_ids = read_and_preprocess(filepath)

    # Remember WHICH columns are categorical by name, so the positional indices
    # can be rebuilt after a column is dropped instead of assuming that every
    # remaining column is categorical.
    categorical_names = set(features.columns[cat_ids])

    results = OrderedDict()

    for ffe in features.columns.tolist():

        features_wo_eliminated = features.drop([ffe], axis=1)
        remaining_cat_ids = [
            position
            for position, column in enumerate(features_wo_eliminated.columns)
            if column in categorical_names
        ]

        aucs, fis = model_me(features_wo_eliminated, target, remaining_cat_ids)

        results[ffe] = OrderedDict({"AUC": aucs.mean(), "FI": fis.mean()})

    return results

In [5]:
%%time

# Loop over the individual time intervals (input files TimeInterval1 .. TimeInterval4).
for i in range(1, 5):
    
    # Run the feature-elimination experiment separately for each time
    # interval. RFE retrains the model with one feature left out at a time
    # and returns an OrderedDict keyed by the name of the dropped feature.
    rfe = RFE(f"../data/TimeInterval{i}.csv")
    
    # Persist the result dict; rfe2df reads it back with np.load(...).item().
    np.save(f"../results/TimeInterval{i}_RFE.npy", rfe)

CPU times: user 2h 44min 10s, sys: 28min 35s, total: 3h 12min 45s
Wall time: 28min 32s


In [6]:
def rfe2df(TIN):
    """Summarise the feature-elimination results of one time interval as a table.

    Parameters
    ----------
    TIN : int or str
        Time-interval number used to build the result file names
        ``../results/TimeInterval{TIN}_FI.csv``, ``..._AUC.csv`` and
        ``..._RFE.npy``.

    Returns
    -------
    pandas.DataFrame
        Rows ``rank1`` .. ``rank5`` hold the five features with the highest
        mean importance and row ``AUC`` holds the mean AUC. Column ``"None"``
        is the baseline read from the FI/AUC CSV files; every other column is
        named after the feature that was left out.
    """
    row_labels = ["rank1", "rank2", "rank3", "rank4", "rank5", "AUC"]

    def _top_five(mean_importances):
        # Names of the five largest entries, most important first.
        return mean_importances.sort_values(ascending=False).iloc[:5].index.tolist()

    # Baseline column: results stored without any feature removed.
    baseline_fi = pd.read_csv(f"../results/TimeInterval{TIN}_FI.csv", index_col=0).mean()
    baseline_auc = pd.read_csv(f"../results/TimeInterval{TIN}_AUC.csv", index_col=0).mean().values[0]

    columns = {"None": _top_five(baseline_fi) + [baseline_auc]}

    # The .npy file holds a pickled dict inside a 0-d object array, so it can
    # only be read with allow_pickle=True (NumPy refuses by default).
    # NOTE: unpickling executes arbitrary code; only load files you produced.
    rfe_instance = np.load(f"../results/TimeInterval{TIN}_RFE.npy", allow_pickle=True).item()

    for k, v in rfe_instance.items():
        columns[k] = _top_five(v["FI"]) + [v["AUC"].values[0]]

    # Build the frame once, with the row labels applied a single time.
    return pd.DataFrame(columns, index=row_labels)

In [7]:
# Print the summary table of every time interval (1 .. 4), one after another.
for i in range(1, 5):
    
    # Header naming the interval, then the table built by rfe2df from the
    # saved result files of that interval.
    print(f"TimeInterval{i}")
    print(rfe2df(i))
    # Visual separator between consecutive intervals.
    print("################################################################################")

TimeInterval1
                 None       K_Numeric     Min_Numeric       C_Numeric  \
rank1    MinD_Numeric    MinD_Numeric    MinD_Numeric    MinD_Numeric   
rank2     Min_Numeric     Min_Numeric  Phylum_Numeric     Min_Numeric   
rank3  Phylum_Numeric       O_Numeric       O_Numeric       O_Numeric   
rank4       O_Numeric  Phylum_Numeric       R_Numeric  Phylum_Numeric   
rank5       C_Numeric       C_Numeric       S_Numeric       R_Numeric   
AUC          0.677653        0.700527        0.698006        0.702218   

            S_Numeric       O_Numeric       T_Numeric       M_Numeric  \
rank1    MinD_Numeric    MinD_Numeric    MinD_Numeric    MinD_Numeric   
rank2     Min_Numeric  Phylum_Numeric     Min_Numeric       O_Numeric   
rank3       O_Numeric     Min_Numeric       O_Numeric     Min_Numeric   
rank4  Phylum_Numeric       R_Numeric  Phylum_Numeric  Phylum_Numeric   
rank5       C_Numeric       S_Numeric       C_Numeric       T_Numeric   
AUC          0.696324        0.68526