In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import Pool, CatBoostClassifier

In [2]:
def read_and_preprocess(filepath):
    """Load one CSV table and split it into model inputs.

    The first CSV column is read as the index, copied into a temporary
    "species" column, and the rows are renumbered 0..n-1. "Phylum" is
    integer-encoded into "Phylum_Numeric", after which a fixed list of
    columns (including "Phylum" and "species") is dropped.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed to ``pd.read_csv``.

    Returns
    -------
    features : pandas.DataFrame
        All remaining columns except "extinct".
    target : pandas.Series
        The "extinct" column.
    categorical_idx : list of int
        Positions (within ``features.columns``) of the categorical columns.
        The list of continuous columns is currently empty, so this covers
        every feature column.
    """
    # load the table; the first CSV column becomes the index
    raw = pd.read_csv(filepath, index_col=0)

    # keep the index values as a regular column, then renumber rows 0..n-1
    raw["species"] = raw.index
    raw.index = range(len(raw))

    # integer-encode the string-valued Phylum column
    phylum_values = raw["Phylum"].tolist()
    raw["Phylum_Numeric"] = LabelEncoder().fit_transform(phylum_values)

    # columns that are not used for modelling
    unused_cols = ["Phylum", "species", "occurrences", "NoSpecies",
                   "C_Cnumeric", "SC_Numeric", "MaxD_Numeric", "System_Numeric"]
    table = raw.drop(columns=unused_cols)

    # separate the target from the feature columns
    target = table["extinct"]
    features = table.drop(columns=["extinct"])

    # no continuous columns are declared, so every feature counts as categorical
    continuous_names = []
    categorical_names = features.columns.drop(continuous_names).tolist()

    # translate categorical column names into column positions
    all_names = features.columns.tolist()
    categorical_idx = [all_names.index(name) for name in categorical_names]

    return features, target, categorical_idx

In [3]:
def model_me(filepath):
    """Collect out-of-fold CatBoost predictions for one data file.

    The file is loaded with ``read_and_preprocess`` and split into five
    stratified, shuffled folds (``random_state=0``). For every fold a fresh
    ``CatBoostClassifier`` (Logloss, ``random_seed=0``) is fitted on the
    training rows and used to predict the held-out rows.

    Parameters
    ----------
    filepath : str or path-like
        Passed through unchanged to ``read_and_preprocess``.

    Returns
    -------
    pandas.DataFrame
        Held-out rows of all folds stacked in fold order. Contains the
        feature columns plus "extinct" (true target), "extinct_pred"
        (predicted class), "extinct_pred_proba" (second column of
        ``predict_proba``) and "Split_num" (1-based fold number).
    """
    # prepare data for modeling
    features, target, cat_idx = read_and_preprocess(filepath)

    # create cross-validation instance
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # per-fold prediction tables, concatenated at the end
    fold_predictions = []

    for fold_num, (train_idx, test_idx) in enumerate(cv.split(features, target), start=1):

        # train_idx / test_idx are row POSITIONS, so both the features and
        # the target are sliced with .iloc; label-based target[train_idx]
        # is only correct while the index happens to be 0..n-1
        features_train, target_train = features.iloc[train_idx], target.iloc[train_idx]
        features_test, target_test = features.iloc[test_idx], target.iloc[test_idx]

        # Gradient boosting model instance (new model for every fold)
        model = CatBoostClassifier(loss_function="Logloss", random_seed=0)

        # Create Pool data classes for train/test
        pool_train = Pool(features_train, target_train, cat_features=cat_idx)
        pool_test = Pool(features_test, target_test, cat_features=cat_idx)

        # Train model
        model.fit(pool_train, verbose=False)

        # predicted class and probability taken from column 1 of predict_proba
        target_pred = model.predict(pool_test)
        target_pred_proba = model.predict_proba(pool_test)[:, 1]

        # independent copy of the held-out rows, so adding columns below
        # never touches `features`
        test_prediction = features_test.copy()
        # positional assignment: rows are already in test_idx order
        test_prediction["extinct"] = target_test.to_numpy()
        test_prediction["extinct_pred"] = target_pred
        test_prediction["extinct_pred_proba"] = target_pred_proba

        # put a Split number pointer
        test_prediction["Split_num"] = fold_num

        fold_predictions.append(test_prediction)

    test_prediction_df = pd.concat(fold_predictions, axis=0)

    return test_prediction_df

In [4]:
%%time

# directory that receives one prediction table per Time Interval
RESULTS_DIR = "../results/test_predictions"

# create the output directory up front: to_csv does not create missing
# folders, and failing there would throw away a finished cross-validation run
os.makedirs(RESULTS_DIR, exist_ok=True)

# loop over individual Time Intervals
for i in range(1, 5):

    # out-of-fold predictions from cross-validation
    test_predictions = model_me(f"../data/TimeInterval{i}.csv")

    # save results to .csv
    test_predictions.to_csv(f"{RESULTS_DIR}/CGB_TimeInterval{i}.csv")

    # AUC computed over the predictions of all folds pooled together
    auc = roc_auc_score(test_predictions["extinct"], test_predictions["extinct_pred_proba"])

    print(f"TimeInterval{i}: AUC {np.round(auc, 2)}")

TimeInterval1: AUC 0.68
TimeInterval2: AUC 0.76
TimeInterval3: AUC 0.75
TimeInterval4: AUC 0.7
CPU times: user 4min 56s, sys: 43.7 s, total: 5min 39s
Wall time: 1min 5s
