In [18]:
import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostClassifier
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMClassifier
import numpy as np
import optuna

import pandas as pd

import random

from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import make_scorer, log_loss, accuracy_score
from sklearn.model_selection import cross_validate, KFold, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

import time
from tqdm.notebook import tqdm

from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)

experiment_name = 'baseline'

In [33]:
train = pd.read_csv('bert_large_uncased_baseline_train.csv')
test = pd.read_csv('bert_large_uncased_baseline_test.csv')

target_data = pd.read_csv('train.csv')['author']
test_index = pd.read_csv('test.csv')['id']

train.shape, test.shape, target_data.shape

((19579, 1024), (8392, 1024), (19579,))

In [3]:
le = LabelEncoder()
le.fit(target_data)
target_data_le = le.transform(target_data)
target_data_le

array([0, 1, 0, ..., 0, 0, 1])

In [4]:
X = train.copy()
y = target_data_le

n_splits = 3
k3 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [6]:
# Define pipelines
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=50))
])

# ridge_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('nystroem', Nystroem(n_components=500, random_state=5)),
#     ('ridge', Ridge())
# ])

linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', LogisticRegression()),
])

# Manually set pipeline names
knn_pipeline.name = 'KNN'
# ridge_pipeline.name = 'Nystroem Ridge'
linear_pipeline.name = 'LR Pipeline'

In [7]:
models = [
    LogisticRegression(),
    linear_pipeline,
    LGBMClassifier(n_jobs=-1, random_state=5),
    XGBClassifier(random_state=5),
    RandomForestClassifier(random_state=5),
    ExtraTreesClassifier(random_state=5),
    HistGradientBoostingClassifier(random_state=5),
    CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100),
    knn_pipeline,
    # ridge_pipeline,
]

In [8]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    Model_compare = pd.DataFrame(columns=['Model Name', 
                                        'Model Parameters', 
                                        'Model Train Log Loss', 
                                        'Model Test Log Loss', 
                                        'Model Test Log Loss Std', 
                                        'Model Time'])
    
    def evaluate_model(alg, idx):
        if hasattr(alg, 'name'):
            model_name = alg.name
        else:
            model_name = alg.__class__.__name__
        features = important_features.get(model_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train Log Loss': 0,
                'Model Test Log Loss': 0,
                'Model Test Log Loss Std': 0,
                'Model Time': "0 min 0.00 sec",
            }
        
        cv_results = cross_validate(alg, 
                                    X[features], 
                                    y, cv=cv_split, 
                                    scoring='neg_log_loss', 
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Results population
        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train Log Loss': -cv_results['train_score'].mean(),
            'Model Test Log Loss': -cv_results['test_score'].mean(),
            'Model Test Log Loss Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    results_list = []

    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(tqdm(models, desc='Models'))]
        for future in tqdm(futures, total=len(futures), desc='Progress'):
            result = future.result()
            results_list.append(result)

    model_compare = pd.DataFrame(results_list)

    model_compare.sort_values(by=['Model Test Log Loss'], ascending=True, inplace=True)
    model_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return model_compare

In [9]:
baseline_features = {}

for model in models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    baseline_features[model_name] = list(X.columns)

In [10]:
%%time

baseline_models = evaluate_models(models, X, y, baseline_features, k3, f'{experiment_name}')
baseline_models

Models:   0%|          | 0/9 [00:00<?, ?it/s]

Progress:   0%|          | 0/9 [00:00<?, ?it/s]

Done with LogisticRegression.
Done with ExtraTreesClassifier.
Done with LR Pipeline.
Done with RandomForestClassifier.
Done with LGBMClassifier.
Done with KNN.
Done with HistGradientBoostingClassifier.
Done with CatBoostClassifier.
Done with XGBClassifier.
CPU times: total: 3.16 s
Wall time: 55min 52s


Unnamed: 0,Model Name,Model Parameters,Model Train Log Loss,Model Test Log Loss,Model Test Log Loss Std,Model Time
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.4118485,0.57033,0.005785,0 min 13.85 sec
7,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.2032693,0.576469,0.002246,32 min 47.66 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.1611123,0.598627,0.006025,7 min 3.50 sec
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.2297708,0.609872,0.003729,3 min 26.15 sec
1,LR Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.39319,0.614112,0.008366,0 min 12.67 sec
3,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.02556679,0.614505,0.008706,54 min 42.47 sec
8,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.7277288,0.761946,0.004983,0 min 1.22 sec
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.2269657,0.839461,0.002836,1 min 59.66 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",2.109424e-15,0.855681,0.003275,0 min 39.65 sec


In [38]:
model1 = LogisticRegression()
model2 = linear_pipeline
model3 = LGBMClassifier(n_jobs=-1, random_state=5)
model4 = XGBClassifier(random_state=5)
model5 = ExtraTreesClassifier(random_state=5)
model6 = HistGradientBoostingClassifier(random_state=5)
model7 = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100)
model8 = knn_pipeline
model9 = RandomForestClassifier(random_state=5)

- Optuna weights

In [39]:
# Lists to store predictions and true values
preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9, y_vals = [], [], [], [], [], [], [], [], [], []

# Perform cross-validation
for i, (train_index, val_index) in enumerate(k3.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Fit models on the training fold
    model1.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    model3.fit(X_train, y_train)
    model4.fit(X_train, y_train)
    model5.fit(X_train, y_train)
    model6.fit(X_train, y_train)
    model7.fit(X_train, y_train)
    model8.fit(X_train, y_train)
    model9.fit(X_train, y_train)

    # Make predictions on the validation fold
    pred1 = model1.predict_proba(X_val)
    pred2 = model2.predict_proba(X_val)
    pred3 = model3.predict_proba(X_val)
    pred4 = model4.predict_proba(X_val)
    pred5 = model5.predict_proba(X_val)
    pred6 = model6.predict_proba(X_val)
    pred7 = model7.predict_proba(X_val)
    pred8 = model8.predict_proba(X_val)
    pred9 = model9.predict_proba(X_val)

    # Store predictions and true values
    preds1.append(pred1)
    preds2.append(pred2)
    preds3.append(pred3)
    preds4.append(pred4)
    preds5.append(pred5)
    preds6.append(pred6)
    preds7.append(pred7)
    preds8.append(pred8)
    preds9.append(pred9)
    y_vals.append(y_val)

    print(f'Done with fold {i+1}')

# Convert lists to numpy arrays for easier manipulation
preds1 = np.concatenate(preds1)
preds2 = np.concatenate(preds2)
preds3 = np.concatenate(preds3)
preds4 = np.concatenate(preds4)
preds5 = np.concatenate(preds5)
preds6 = np.concatenate(preds6)
preds7 = np.concatenate(preds7)
preds8 = np.concatenate(preds8)
preds9 = np.concatenate(preds9)
y_vals = np.concatenate(y_vals)

Done with fold 1
Done with fold 2
Done with fold 3


In [40]:
# Save the results in text files

np.savetxt('bert_large_uncased_preds/preds1.txt', preds1)
np.savetxt('bert_large_uncased_preds/preds2.txt', preds2)
np.savetxt('bert_large_uncased_preds/preds3.txt', preds3)
np.savetxt('bert_large_uncased_preds/preds4.txt', preds4)
np.savetxt('bert_large_uncased_preds/preds5.txt', preds5)
np.savetxt('bert_large_uncased_preds/preds6.txt', preds6)
np.savetxt('bert_large_uncased_preds/preds7.txt', preds7)
np.savetxt('bert_large_uncased_preds/preds8.txt', preds8)
np.savetxt('bert_large_uncased_preds/preds9.txt', preds9)
np.savetxt('bert_large_uncased_preds/y_vals.txt', y_vals)

In [None]:
# Load the results from text files

preds1 = np.loadtxt('bert_large_uncased_preds/preds1.txt')
preds2 = np.loadtxt('bert_large_uncased_preds/preds2.txt')
preds3 = np.loadtxt('bert_large_uncased_preds/preds3.txt')
preds4 = np.loadtxt('bert_large_uncased_preds/preds4.txt')
preds5 = np.loadtxt('bert_large_uncased_preds/preds5.txt')
preds6 = np.loadtxt('bert_large_uncased_preds/preds6.txt')
preds7 = np.loadtxt('bert_large_uncased_preds/preds7.txt')
preds8 = np.loadtxt('bert_large_uncased_preds/preds8.txt')
preds9 = np.loadtxt('bert_large_uncased_preds/preds9.txt')
y_vals = np.loadtxt('bert_large_uncased_preds/y_vals.txt')

In [41]:
# Define the objective function for Optuna
def objective(trial):
    # List of all model predictions
    models = [preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9]

    # Suggest binary variables to select models
    selected_models = [trial.suggest_int(f'select_model_{i}', 0, 1) for i in range(len(models))]

    # Ensure at least one model is selected
    if sum(selected_models) == 0:
        return float('inf')

    # Suggest weights for each model
    weights = [trial.suggest_float(f'w{i}', 0, 1) for i in range(len(models))]

    # Select models and their corresponding weights
    selected_preds = [models[i] for i in range(len(models)) if selected_models[i]]
    selected_weights = [weights[i] for i in range(len(models)) if selected_models[i]]

    # # Normalize the weights
    # total_weight = sum(selected_weights)
    # selected_weights = [w / total_weight for w in selected_weights]

    # Compute the blended predictions
    blended_preds = sum(w * p for w, p in zip(selected_weights, selected_preds))

    # Compute the Log Loss
    ll = log_loss(y_vals, blended_preds)

    return ll

# Create an Optuna study and optimize
study = optuna.create_study(direction='minimize', study_name='log_loss_with_bert')
study.optimize(objective, n_trials=1000)

# Print the best parameters and score
print('Best trial:')
trial = study.best_trial
print('  Value: {}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[I 2024-06-03 23:27:01,336] A new study created in memory with name: log_loss_with_bert
[I 2024-06-03 23:27:01,366] Trial 0 finished with value: 0.5302086132715137 and parameters: {'select_model_0': 1, 'select_model_1': 0, 'select_model_2': 0, 'select_model_3': 1, 'select_model_4': 0, 'select_model_5': 0, 'select_model_6': 1, 'select_model_7': 0, 'select_model_8': 0, 'w0': 0.5286663336967504, 'w1': 0.05119766608831855, 'w2': 0.07116119358664241, 'w3': 0.0018092599924722785, 'w4': 0.5435154829017003, 'w5': 0.6839632321890737, 'w6': 0.6873590596111641, 'w7': 0.6560956692283424, 'w8': 0.8514709717533313}. Best is trial 0 with value: 0.5302086132715137.
[I 2024-06-03 23:27:01,373] Trial 1 finished with value: 0.7473772827621492 and parameters: {'select_model_0': 0, 'select_model_1': 0, 'select_model_2': 1, 'select_model_3': 0, 'select_model_4': 1, 'select_model_5': 1, 'select_model_6': 0, 'select_model_7': 1, 'select_model_8': 0, 'w0': 0.39531742340900655, 'w1': 0.6792457159653447, 'w2': 0

Best trial:
  Value: 0.5226946051067006
  Params: 
    select_model_0: 1
    select_model_1: 0
    select_model_2: 0
    select_model_3: 1
    select_model_4: 0
    select_model_5: 0
    select_model_6: 1
    select_model_7: 0
    select_model_8: 0
    w0: 0.4939613679505758
    w1: 0.06327659305718365
    w2: 0.29843211228962696
    w3: 0.15286241775388243
    w4: 0.7315059873991097
    w5: 0.531867394530327
    w6: 0.22022206970834327
    w7: 0.7522857303683776
    w8: 0.6179159277642093


In [42]:

# Use the best found weights for final prediction
select_model_1 = trial.params['select_model_0']
select_model_2 = trial.params['select_model_1']
select_model_3 = trial.params['select_model_2']
select_model_4 = trial.params['select_model_3']
select_model_5 = trial.params['select_model_4']
select_model_6 = trial.params['select_model_5']
select_model_7 = trial.params['select_model_6']
select_model_8 = trial.params['select_model_7']
select_model_9 = trial.params['select_model_8']

best_w1 = trial.params['w0']
best_w2 = trial.params['w1']
best_w3 = trial.params['w2']
best_w4 = trial.params['w3']
best_w5 = trial.params['w4']
best_w6 = trial.params['w5']
best_w7 = trial.params['w6']
best_w8 = trial.params['w7']
best_w9 = trial.params['w8']

# Best trial:
#   Value: 0.5226946051067006
#   Params: 
#     select_model_0: 1
#     select_model_1: 0
#     select_model_2: 0
#     select_model_3: 1
#     select_model_4: 0
#     select_model_5: 0
#     select_model_6: 1
#     select_model_7: 0
#     select_model_8: 0
#     w0: 0.4939613679505758
#     w1: 0.06327659305718365
#     w2: 0.29843211228962696
#     w3: 0.15286241775388243
#     w4: 0.7315059873991097
#     w5: 0.531867394530327
#     w6: 0.22022206970834327
#     w7: 0.7522857303683776
#     w8: 0.6179159277642093


# total_weight = best_w1 + best_w2 + best_w3
# best_w1 /= total_weight
# best_w2 /= total_weight
# best_w3 /= total_weight

In [43]:
# Make final ensemble predictions with the best weights
final_ensemble_pred = (select_model_1 * best_w1 * model1.predict_proba(test)) + \
                        (select_model_2 * best_w2 * model2.predict_proba(test)) + \
                        (select_model_3 * best_w3 * model3.predict_proba(test)) + \
                        (select_model_4 * best_w4 * model4.predict_proba(test)) + \
                        (select_model_5 * best_w5 * model5.predict_proba(test)) + \
                        (select_model_6 * best_w6 * model6.predict_proba(test)) + \
                        (select_model_7 * best_w7 * model7.predict_proba(test)) + \
                        (select_model_8 * best_w8 * model8.predict_proba(test)) + \
                        (select_model_9 * best_w9 * model9.predict_proba(test))

In [44]:
final_ensemble_pred_df = pd.DataFrame(final_ensemble_pred, columns=le.classes_)
final_ensemble_pred_df.head()

Unnamed: 0,EAP,HPL,MWS
0,0.08734,0.083551,0.696155
1,0.715023,0.100882,0.05114
2,0.00374,0.863015,0.00029
3,0.487364,0.376394,0.003288
4,0.41282,0.420737,0.033489


In [45]:
submission_df = pd.concat([test_index, final_ensemble_pred_df], axis=1)
submission_df.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.08734,0.083551,0.696155
1,id24541,0.715023,0.100882,0.05114
2,id00134,0.00374,0.863015,0.00029
3,id27757,0.487364,0.376394,0.003288
4,id04081,0.41282,0.420737,0.033489


In [46]:
submission_df.to_csv(f'bert_large_uncased_baseline_ensemble_3cv_{trial.value:.5f}.csv', index=False)