In [1]:
import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostClassifier
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMClassifier
import numpy as np

import pandas as pd

import random

from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import cross_validate, KFold, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

import time
from tqdm.notebook import tqdm

from xgboost import XGBClassifier

# Notebook-wide settings.
experiment_name = 'baseline'

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
# Feature matrices (file names suggest BERT-large-uncased embeddings - TODO
# confirm how they were produced) and the author labels from the original
# training file.
train = pd.read_csv('bert_large_uncased_baseline_train.csv')
test = pd.read_csv('bert_large_uncased_baseline_test.csv')

target_data = pd.read_csv('train.csv')['author']

# Features and labels live in different files and are paired purely by row
# position, so fail fast if the files are out of sync.
assert len(train) == len(target_data), 'train features and labels differ in length'
assert train.shape[1] == test.shape[1], 'train and test have different feature counts'

train.shape, test.shape, target_data.shape

((19579, 1024), (8392, 1024), (19579,))

In [3]:
# Map the author names to integer codes; the fitted encoder is kept in `le`.
le = LabelEncoder().fit(target_data)
target_data_le = le.transform(target_data)
target_data_le

array([0, 1, 0, ..., 0, 0, 1])

In [4]:
# Stratified, shuffled, seeded 3-fold splitter used for every evaluation below.
n_splits = 3
k3 = StratifiedKFold(n_splits=n_splits, random_state=5, shuffle=True)

# Modelling inputs: a copy of the feature frame and the integer-encoded labels.
X = train.copy()
y = target_data_le

In [6]:
def _named_pipeline(name, steps):
    """Build a Pipeline from ``steps`` and attach ``name`` as its display name."""
    pipeline = Pipeline(steps)
    # The `name` attribute is what the rest of the notebook reads to label
    # pipelines (plain estimators fall back to their class name).
    pipeline.name = name
    return pipeline


# k-nearest neighbours on standardised features.
knn_pipeline = _named_pipeline('KNN', [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=50)),
])

# Disabled experiment, kept for reference:
# ridge_pipeline = _named_pipeline('Nystroem Ridge', [
#     ('scaler', StandardScaler()),
#     ('nystroem', Nystroem(n_components=500, random_state=5)),
#     ('ridge', Ridge()),
# ])

# Logistic regression on standardised features.
# NOTE(review): the final step is registered under the key 'ridge' although it
# holds a LogisticRegression; the key is left as-is because it is part of the
# pipeline's parameter names.
linear_pipeline = _named_pipeline('LR Pipeline', [
    ('scaler', StandardScaler()),
    ('ridge', LogisticRegression()),
])

In [7]:
# Candidate classifiers for the baseline comparison. Every estimator that
# accepts a seed gets the same one so runs are repeatable.
_seed = {'random_state': 5}

models = [
    LogisticRegression(),
    linear_pipeline,
    LGBMClassifier(n_jobs=-1, **_seed),
    XGBClassifier(**_seed),
    RandomForestClassifier(**_seed),
    ExtraTreesClassifier(**_seed),
    HistGradientBoostingClassifier(**_seed),
    CatBoostClassifier(verbose=False, early_stopping_rounds=100, **_seed),
    knn_pipeline,
    # ridge_pipeline,  (disabled)
]

In [8]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name, max_workers=50):
    """Cross-validate every model on its own feature list and rank them by log loss.

    Each model is scored with ``cross_validate`` (``neg_log_loss``, train and
    test scores) on ``X[features]``, where ``features`` is looked up in
    ``important_features`` under the model's display name: its ``name``
    attribute when present, otherwise its class name. The ranked table is
    written to ``'{experiment_name}_results.csv'`` and returned.

    Parameters
    ----------
    models : sequence of estimators
        Estimators accepted by ``cross_validate``.
    X : pandas.DataFrame
        Feature frame; columns are selected by label.
    y : array-like
        Target labels.
    important_features : dict
        Maps a model's display name to the list of columns it should use.
        Models with a missing or empty entry are skipped.
    cv_split : cross-validation splitter
        Passed to ``cross_validate`` as ``cv``.
    experiment_name : str
        Prefix of the CSV file the results are saved to.
    max_workers : int, default 50
        Number of threads used to evaluate models concurrently.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by ascending mean test log loss. Skipped
        models carry NaN scores and therefore sort last.

    Notes
    -----
    Models run concurrently and each ``cross_validate`` call itself uses
    ``n_jobs=-1``, so the reported fit times are measured while the models
    compete for the same cores.
    """
    result_columns = ['Model Name',
                      'Model Parameters',
                      'Model Train Log Loss',
                      'Model Test Log Loss',
                      'Model Test Log Loss Std',
                      'Model Time']

    def evaluate_model(alg):
        """Score one model and return its row of the comparison table."""
        model_name = getattr(alg, 'name', alg.__class__.__name__)
        features = important_features.get(model_name, [])

        if len(features) == 0:
            # NaN rather than 0: a log loss of 0 would mean a perfect model
            # and would rank the skipped model first.
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train Log Loss': np.nan,
                'Model Test Log Loss': np.nan,
                'Model Test Log Loss Std': np.nan,
                'Model Time': "0 min 0.00 sec",
            }

        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='neg_log_loss',
                                    return_train_score=True,
                                    n_jobs=-1)

        # Mean per-fold fit time, formatted as minutes and seconds.
        minutes, seconds = divmod(cv_results['fit_time'].mean(), 60)

        # The scorer returns negated log loss, so flip the sign of the means.
        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train Log Loss': -cv_results['train_score'].mean(),
            'Model Test Log Loss': -cv_results['test_score'].mean(),
            'Model Test Log Loss Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in tqdm(models, desc='Models')]
        # Results are collected in submission order, so the row index of the
        # returned frame matches each model's position in `models`.
        results_list = [future.result()
                        for future in tqdm(futures, total=len(futures), desc='Progress')]

    # Explicit columns keep the frame well-formed even when `models` is empty.
    model_compare = pd.DataFrame(results_list, columns=result_columns)

    model_compare = model_compare.sort_values(by=['Model Test Log Loss'], ascending=True)
    model_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return model_compare

In [9]:
# Baseline run: every model is given the complete list of feature columns,
# keyed by its display name (`name` attribute, else the class name).
baseline_features = {}

for model in models:
    model_name = getattr(model, 'name', type(model).__name__)
    baseline_features[model_name] = list(X.columns)

In [10]:
%%time

baseline_models = evaluate_models(models, X, y, baseline_features, k3, f'{experiment_name}')
baseline_models

Models:   0%|          | 0/9 [00:00<?, ?it/s]

Progress:   0%|          | 0/9 [00:00<?, ?it/s]

Done with LogisticRegression.
Done with ExtraTreesClassifier.
Done with LR Pipeline.
Done with RandomForestClassifier.
Done with LGBMClassifier.
Done with KNN.
Done with HistGradientBoostingClassifier.
Done with CatBoostClassifier.
Done with XGBClassifier.
CPU times: total: 3.16 s
Wall time: 55min 52s


Unnamed: 0,Model Name,Model Parameters,Model Train Log Loss,Model Test Log Loss,Model Test Log Loss Std,Model Time
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.4118485,0.57033,0.005785,0 min 13.85 sec
7,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.2032693,0.576469,0.002246,32 min 47.66 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.1611123,0.598627,0.006025,7 min 3.50 sec
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.2297708,0.609872,0.003729,3 min 26.15 sec
1,LR Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.39319,0.614112,0.008366,0 min 12.67 sec
3,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.02556679,0.614505,0.008706,54 min 42.47 sec
8,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.7277288,0.761946,0.004983,0 min 1.22 sec
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.2269657,0.839461,0.002836,1 min 59.66 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",2.109424e-15,0.855681,0.003275,0 min 39.65 sec


In [None]:
# Estimators used for the blending experiment. The two pipelines are the same
# objects that appear in `models`; the others are new instances.
# Note the numbering: model5 is ExtraTrees and model9 is RandomForest.
(model1, model2, model3,
 model4, model5, model6,
 model7, model8, model9) = (
    LogisticRegression(),
    linear_pipeline,
    LGBMClassifier(n_jobs=-1, random_state=5),
    XGBClassifier(random_state=5),
    ExtraTreesClassifier(random_state=5),
    HistGradientBoostingClassifier(random_state=5),
    CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100),
    knn_pipeline,
    RandomForestClassifier(random_state=5),
)

- Optuna weights

In [None]:
import optuna

# Collect out-of-fold predictions of every base model, fold by fold.
# NOTE(review): `predict` yields hard class labels, whereas the baseline
# comparison scored models on log loss (probabilities) - confirm that blending
# label codes is what is intended here.
base_models = [model1, model2, model3, model4, model5, model6, model7, model8, model9]
fold_preds = [[] for _ in base_models]
y_vals = []

# Positional indexing below needs a plain array (works for ndarray and Series).
y_array = np.asarray(y)

# StratifiedKFold stratifies on the labels, so `y` must be passed to split().
for i, (train_index, val_index) in enumerate(k3.split(X, y_array)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_array[train_index], y_array[val_index]

    for base_model, model_preds in zip(base_models, fold_preds):
        base_model.fit(X_train, y_train)
        # Flatten to 1-D: an estimator that returns an (n, 1) column (CatBoost
        # does this for multiclass targets) would otherwise broadcast to an
        # (n, n) matrix when blended with the 1-D outputs of the other models.
        model_preds.append(np.ravel(base_model.predict(X_val)))

    y_vals.append(y_val)

    print(f'Done with fold {i+1}')

# One flat array per model, ordered like the concatenated validation labels.
(preds1, preds2, preds3,
 preds4, preds5, preds6,
 preds7, preds8, preds9) = [np.concatenate(model_preds) for model_preds in fold_preds]
y_vals = np.concatenate(y_vals)

In [None]:
import optuna
from sklearn.metrics import mean_absolute_error

def objective(trial):
    """Optuna objective: MAE of a weighted sum over a sampled subset of models.

    For each of the nine out-of-fold prediction arrays the trial samples a
    0/1 inclusion switch and a weight in [0, 1]. The included predictions are
    combined as a plain weighted sum (weights are not normalised) and scored
    against ``y_vals`` with mean absolute error. Returns ``inf`` when the
    trial switches every model off.
    """
    candidate_preds = (preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9)
    n_candidates = len(candidate_preds)

    # One binary switch per model: 1 = include it in the blend.
    use_flags = [trial.suggest_int(f'select_model_{i}', 0, 1) for i in range(n_candidates)]

    # An empty blend cannot be scored.
    if not any(use_flags):
        return float('inf')

    # A weight is sampled for every model, including those switched off.
    all_weights = [trial.suggest_float(f'w{i}', 0, 1) for i in range(n_candidates)]

    blended_preds = sum(
        weight * preds
        for use, weight, preds in zip(use_flags, all_weights, candidate_preds)
        if use
    )

    return mean_absolute_error(y_vals, blended_preds)

# Search for the blend that minimises the objective (500 trials).
study = optuna.create_study(study_name='mae_with_bert', direction='minimize')
study.optimize(objective, n_trials=500)

# Report the winning trial.
print('Best trial:')
trial = study.best_trial
print(f'  Value: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

# Recorded from an earlier run:
#   Value: 0.21366240098504596

In [None]:

# Read the inclusion switch and weight of each model from the best trial.
# Optuna parameter names are 0-based (select_model_0 / w0 belong to model1).
n_blend_models = 9
select_flags = [trial.params[f'select_model_{i}'] for i in range(n_blend_models)]
best_weights = [trial.params[f'w{i}'] for i in range(n_blend_models)]

# Individual names kept for any later cell that still refers to them.
(select_model_1, select_model_2, select_model_3,
 select_model_4, select_model_5, select_model_6,
 select_model_7, select_model_8, select_model_9) = select_flags
(best_w1, best_w2, best_w3,
 best_w4, best_w5, best_w6,
 best_w7, best_w8, best_w9) = best_weights

# Best trial recorded from an earlier run:
#   Value: 0.13884898356282638
#   Params:
#     select_model_0: 0
#     select_model_1: 1
#     select_model_2: 0
#     select_model_3: 0
#     select_model_4: 0
#     select_model_5: 1
#     select_model_6: 0
#     select_model_7: 0
#     select_model_8: 1
#     w0: 0.22379923583104128
#     w1: 0.00022255362990576655
#     w2: 0.8692260835078589
#     w3: 0.801887498510167
#     w4: 0.17577840191712701
#     w5: 0.023900882561639553
#     w6: 0.41999673367380513
#     w7: 0.5145131188279755
#     w8: 0.11767006738147931

# Weights are used as found by the search, without normalisation, to match
# the objective that produced them.
blend_models = [model1, model2, model3, model4, model5, model6, model7, model8, model9]

# Weighted sum of the selected models' test predictions.
final_ensemble_pred = np.zeros(len(test))
for selected, weight, blend_model in zip(select_flags, best_weights, blend_models):
    if not selected:
        # A switched-off model contributes exactly zero; skip its predict call.
        continue
    # Flatten to 1-D: an (n, 1) prediction column (CatBoost returns one for
    # multiclass targets) would otherwise broadcast the sum to (n, n).
    final_ensemble_pred += weight * np.ravel(blend_model.predict(test))

# Floor the blended predictions at 0.05: every value below 0.05 is raised to it.
# NOTE(review): the blended values are weighted sums of class labels, so this
# also lifts exact zeros to 0.05 - confirm the floor is still wanted here.
final_ensemble_pred_adjusted = np.maximum(final_ensemble_pred, 0.05)

In [None]:

def objective(trial):
    """Optuna objective: MAE of a weighted sum over all nine models.

    Samples one weight in [0, 1] per out-of-fold prediction array (parameters
    ``w1`` .. ``w9``), combines the predictions as a plain weighted sum
    (weights are not normalised) and scores the result against ``y_vals``
    with mean absolute error.

    Note: this definition replaces any ``objective`` defined in an earlier cell.
    """
    oof_preds = (preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9)

    # Parameter names are 1-based so that w<k> belongs to model<k>.
    weights = [trial.suggest_float(f'w{k}', 0, 1) for k in range(1, len(oof_preds) + 1)]

    blended_preds = sum(weight * preds for weight, preds in zip(weights, oof_preds))

    return mean_absolute_error(y_vals, blended_preds)

# Search for the weight vector that minimises the objective (500 trials).
study = optuna.create_study(study_name='mae_with_bert', direction='minimize')
study.optimize(objective, n_trials=500)

# Report the winning trial.
print('Best trial:')
trial = study.best_trial
print(f'  Value: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

In [None]:

# Read the weight of each model from the best trial (w<k> belongs to model<k>).
best_weights = [trial.params[f'w{k}'] for k in range(1, 10)]

# Individual names kept for any later cell that still refers to them.
(best_w1, best_w2, best_w3,
 best_w4, best_w5, best_w6,
 best_w7, best_w8, best_w9) = best_weights

# Best trial recorded from an earlier run:
#   Value: 0.21366240098504596
#   Params:
#     w1: 0.08471910659519587
#     w2: 3.409643176334812e-05
#     w3: 0.08501403202520018
#     w4: 0.04662479238055933
#     w5: 0.17637806864578287
#     w6: 0.03739525358225952
#     w7: 0.02184565223397708
#     w8: 0.3732534808432757
#     w9: 0.017576063578009754

# Weights are used as found by the search, without normalisation, to match
# the objective that produced them.
blend_models = [model1, model2, model3, model4, model5, model6, model7, model8, model9]

# Weighted sum of every model's test predictions.
final_ensemble_pred = np.zeros(len(test))
for weight, blend_model in zip(best_weights, blend_models):
    # Flatten to 1-D: an (n, 1) prediction column (CatBoost returns one for
    # multiclass targets) would otherwise broadcast the sum to (n, n).
    final_ensemble_pred += weight * np.ravel(blend_model.predict(test))

# Floor the blended predictions at 0.05: every value below 0.05 is raised to it.
# NOTE(review): the blended values are weighted sums of class labels, so this
# also lifts exact zeros to 0.05 - confirm the floor is still wanted here.
final_ensemble_pred_adjusted = np.maximum(final_ensemble_pred, 0.05)

In [None]:
# Mean absolute error between the test targets and the blended predictions.
# NOTE(review): `target_test_data` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel - confirm where the
# test labels are supposed to come from.
# NOTE(review): this scores the raw blend, not `final_ensemble_pred_adjusted` -
# confirm which one is intended.
ensemble_score = mean_absolute_error(target_test_data, final_ensemble_pred)
ensemble_score

In [None]:
# Put the blended predictions in a one-column frame named 'Predicted'.
final_ensemble_pred_df = pd.DataFrame(final_ensemble_pred, columns=['Predicted'])

# Place true targets and predictions side by side.
# NOTE(review): concat(axis=1) aligns on the index; the prediction frame has a
# default 0..n-1 index, so this presumably relies on `target_test_data` having
# the same index - verify. `target_test_data` is not defined in the visible cells.
test_true_pred_df = pd.concat([target_test_data, final_ensemble_pred_df], axis=1)

In [None]:
# Show the 15 rows with the largest 'vps' value.
# NOTE(review): no visible cell creates a 'vps' column; presumably it is the
# target column carried in by `target_test_data` - confirm.
test_true_pred_df.sort_values(by='vps', ascending=False).head(15)

In [None]:
# Spot-check every tenth row (positions 12, 22, ..., 102) of the frame sorted by
# descending 'vps'.
# NOTE(review): no visible cell creates a 'vps' column - confirm its origin.
test_true_pred_df.sort_values(by='vps', ascending=False).iloc[[12, 22, 32, 42, 52, 62, 72, 82, 92, 102]]