In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import optuna
import random
from tqdm.notebook import tqdm

from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.preprocessing import LabelEncoder

from concurrent.futures import ThreadPoolExecutor

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from pprint import pprint
import os

# Show every column when a wide DataFrame is displayed (the feature frames have 1024 columns).
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Prefix of the results CSV written by evaluate_models().
experiment_name = 'pytorch_bert'

In [2]:
# # In case I need the list of available scorer names
# import sklearn
# sklearn.metrics.get_scorer_names()

In [3]:
# Feature matrices for the train and test texts (presumably BERT-derived, going by
# the file names — the generating notebook is not part of this file).
bert_train_features = pd.read_csv('pytorch_bert_train.csv')
bert_test_features = pd.read_csv('pytorch_bert_test.csv')

# Competition training file with columns id, text, author; supplies the labels.
comp_train = pd.read_csv('train.csv')

In [4]:
# Sanity check: the train features and the label frame should have the same number of rows.
bert_train_features.shape, bert_test_features.shape, comp_train.shape

((19579, 1024), (8392, 1024), (19579, 3))

In [5]:
# Peek at the raw competition rows (author is still a string label at this point).
comp_train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [6]:
# Name of the label column in comp_train.
TARGET = 'author'

In [7]:
# Encode the author strings as integer class ids. The fitted encoder stays in
# `le` so that `le.classes_` can later name the probability columns.
# NOTE: this overwrites comp_train[TARGET]; re-running the cell without
# reloading comp_train would refit the encoder on the integer ids.
le = LabelEncoder().fit(comp_train[TARGET])
comp_train[TARGET] = le.transform(comp_train[TARGET])

comp_train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",0
1,id17569,It never once occurred to me that the fumbling...,1
2,id11008,"In his left hand was a gold snuff box, from wh...",0
3,id27763,How lovely is spring As we looked from Windsor...,2
4,id12958,"Finding nothing else, not even gold, the Super...",1


In [8]:
# Feature matrix and encoded target. They come from two different files and are
# paired by row position only — assumes both files list the rows in the same order (TODO confirm).
X = bert_train_features
y = comp_train[TARGET]

# Stratified, shuffled 10-fold CV with a fixed seed so the folds are repeatable.
n_splits = 10
sk10 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [9]:
# Candidate estimators passed to evaluate_models(). Only LightGBM is enabled;
# the others are kept commented out so they can be switched back on.
models = [
    LGBMClassifier(n_jobs=-1, random_state=5),
    # XGBClassifier(random_state=5),
    # RandomForestClassifier(random_state=5),
    # AdaBoostClassifier(random_state=5),
    # BaggingClassifier(random_state=5),
    # ExtraTreesClassifier(random_state=5),
    # HistGradientBoostingClassifier(random_state=5),
    ]

In [10]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and rank by log loss.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted classifiers exposing ``get_params`` and ``predict_proba``.
    X : pandas.DataFrame
        Feature matrix; the columns used for a model are selected through
        ``important_features``.
    y : array-like
        Target labels, aligned with the rows of ``X``.
    important_features : dict
        Maps an estimator class name to the list of ``X`` columns to use.
        A model whose class name is missing, or mapped to an empty list,
        is skipped.
    cv_split : int or cross-validation splitter
        Passed through unchanged as ``cv`` to ``cross_validate``.
    experiment_name : str
        Prefix of the output file ``{experiment_name}_results.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'MLA Name', 'MLA Parameters',
        'MLA Train Log Loss', 'MLA Test Log Loss' and 'MLA Time' (mean fit
        time per fold), sorted ascending by test log loss. Skipped models
        carry NaN losses so they sort last instead of looking like a perfect
        score. The same frame is written to the results CSV.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train Log Loss',
                      'MLA Test Log Loss',
                      'MLA Time']

    def evaluate_model(alg):
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Nothing to train on: report the model, but with NaN losses. A loss
        # of 0 would rank the skipped model first in the ascending sort.
        if len(features) == 0:
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train Log Loss': np.nan,
                'MLA Test Log Loss': np.nan,
                'MLA Time': "0 min 0.00 sec",
            }

        # Use the predefined 'neg_log_loss' scorer: make_scorer(needs_proba=True)
        # is deprecated since scikit-learn 1.4 and removed in 1.6. The dict form
        # keeps the result keys 'train_Log Loss' / 'test_Log Loss'.
        cv_results = cross_validate(alg, X[features], y,
                                    cv=cv_split,
                                    scoring={'Log Loss': 'neg_log_loss'},
                                    return_train_score=True,
                                    n_jobs=-1)

        # Format the mean per-fold fit time as "M min S.SS sec".
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        print(f'Done with {MLA_name}.')

        # The scorer returns negated log loss (greater is better), so flip the sign.
        return {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train Log Loss': -cv_results['train_Log Loss'].mean(),
            'MLA Test Log Loss': -cv_results['test_Log Loss'].mean(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

    # The threads only dispatch the per-model jobs; cross_validate already
    # parallelises the folds itself through n_jobs=-1.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in models]
        results_list = [future.result() for future in futures]

    # Passing the column list keeps the frame well-formed (and sortable) even
    # when no model was supplied.
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)

    # Best (lowest) test log loss first; NaN rows from skipped models go last.
    MLA_compare = MLA_compare.sort_values(by='MLA Test Log Loss', ascending=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [11]:
# def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
#     # Create a DataFrame to store comparison results
#     MLA_compare = pd.DataFrame(columns=['MLA Name', 
#                                         'MLA Parameters', 
#                                         'MLA Train Log Loss', 
#                                         'MLA Test Log Loss', 
#                                         'MLA Time'])
    
#     def evaluate_model(alg, idx):
#         MLA_name = alg.__class__.__name__
#         features = important_features.get(MLA_name, [])

#         # Check if the list of important features is empty
#         if len(features) == 0:
#             # If empty, return results with zero values
#             print(f'Skipping {MLA_name} due to no important features.')
#             return {
#                 'MLA Name': MLA_name,
#                 'MLA Parameters': str(alg.get_params()),
#                 'MLA Train Log Loss': 0,
#                 'MLA Test Log Loss': 0,
#                 'MLA Time': "0 min 0.00 sec",
#             }

#         cv_results = cross_validate(alg, 
#                                     X[features], 
#                                     y, 
#                                     cv=cv_split, 
#                                     scoring='neg_log_loss', 
#                                     return_train_score=True, 
#                                     n_jobs=-1)

#         # Format time
#         mean_fit_time = cv_results['fit_time'].mean()
#         minutes, seconds = divmod(mean_fit_time, 60)

#         print(f'Done with {MLA_name}.')
        
#         # Populate results
#         return {
#             'MLA Name': MLA_name,
#             'MLA Parameters': str(alg.get_params()),
#             'MLA Train Log Loss': -cv_results['train_Log Loss'].mean() if 'train_Log Loss' in cv_results else 0,
#             'MLA Test Log Loss': -cv_results['test_Log Loss'].mean() if 'test_Log Loss' in cv_results else 0,
#             'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
#         }

#     results_list = []

#     # Use ThreadPoolExecutor for parallel execution
#     with ThreadPoolExecutor(max_workers=10) as executor:
#         futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(models)]
#         for future in futures:
#             result = future.result()
#             if result:
#                 results_list.append(result)

#     # Create a DataFrame from the list of dictionaries
#     MLA_compare = pd.DataFrame(results_list)

#     # Sort and save results
#     MLA_compare.sort_values(by=['MLA Test Log Loss'], ascending=True, inplace=True)
#     MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

#     return MLA_compare

In [12]:
# Baseline run: every model, keyed by its class name, gets all columns of X.
baseline_features = {
    type(model).__name__: list(X.columns)
    for model in models
}

In [13]:
%%time 

# Cross-validate the enabled models on the full feature set; this also writes
# '{experiment_name}_results.csv' as a side effect of evaluate_models().
baseline_models = evaluate_models(models, X, y, baseline_features, sk10, experiment_name)
baseline_models

Done with LGBMClassifier.
CPU times: total: 312 ms
Wall time: 4min 23s


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Log Loss,MLA Test Log Loss,MLA Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.287492,0.601388,2 min 38.26 sec


# Best Single Model

In [14]:
# One instance of each candidate classifier, all seeded with random_state=5.
# Only model1 (LightGBM) is actually fitted in the next cell.
model1 = LGBMClassifier(n_jobs=-1, random_state=5)
model2 = XGBClassifier(random_state=5)
model3 = RandomForestClassifier(random_state=5)
model4 = AdaBoostClassifier(random_state=5)
model5 = BaggingClassifier(random_state=5)
model6 = ExtraTreesClassifier(random_state=5)
model7 = HistGradientBoostingClassifier(random_state=5)

In [15]:
# Fit LightGBM on the full training data for the submission; the other
# candidates stay disabled.
model1_final = model1.fit(X, y)
# model2_final = model2.fit(X, y)
# model3_final = model3.fit(X, y)
# model4_final = model4.fit(X, y)
# model5_final = model5.fit(X, y)
# model6_final = model6.fit(X, y)
# model7_final = model7.fit(X, y)

In [16]:
# Class-probability predictions for the test features (one column per class).
prediction = model1_final.predict_proba(bert_test_features)

In [17]:
# Wrap the probability matrix in a frame whose columns carry the original
# author labels recovered from the fitted label encoder.
pred_df = pd.DataFrame(prediction, columns=le.classes_)
pred_df.head()

Unnamed: 0,EAP,HPL,MWS
0,0.334537,0.154933,0.51053
1,0.62986,0.296828,0.073312
2,0.024738,0.970313,0.004949
3,0.746332,0.215048,0.03862
4,0.748922,0.131384,0.119693


In [18]:
# Attach the predictions to the ids from the sample submission.
# pd.concat(axis=1) aligns on the index with an outer join, so a row-count
# mismatch would silently produce NaN-padded rows; fail loudly instead.
submission = pd.read_csv('sample_submission.csv')
assert len(submission) == len(pred_df), (
    f'sample_submission has {len(submission)} rows but there are '
    f'{len(pred_df)} predictions'
)
submission_df = pd.concat([submission['id'], pred_df], axis=1)
submission_df.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.334537,0.154933,0.51053
1,id24541,0.62986,0.296828,0.073312
2,id00134,0.024738,0.970313,0.004949
3,id27757,0.746332,0.215048,0.03862
4,id04081,0.748922,0.131384,0.119693


In [19]:
# NOTE: the score in the file name is hard-coded from the CV test log loss
# reported above (0.601388); update it by hand if the model or features change.
submission_df.to_csv('submission_lgbm_0.601388.csv', index=False)

# Ensembling

In [None]:
# Fresh, unfitted instances for the out-of-fold ensembling below; model1 was
# already fitted on the full training set earlier, so it is re-created here.
model1 = LGBMClassifier(n_jobs=-1, random_state=5)
model2 = XGBClassifier(random_state=5)
model3 = RandomForestClassifier(random_state=5)
model4 = AdaBoostClassifier(random_state=5)
model5 = BaggingClassifier(random_state=5)
model6 = ExtraTreesClassifier(random_state=5)
model7 = HistGradientBoostingClassifier(random_state=5)

In [None]:
%%time

# Per-model lists of held-out class probabilities (one array per fold), plus
# the matching true labels per fold. Lists of disabled models stay empty.
model1_results, model2_results, model3_results, model4_results, model5_results, model6_results, model7_results, y_test_list = [], [], [], [], [], [], [], []

for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    # Positional split of features and labels for this fold.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit on the fold's training part and keep the held-out probabilities.
    model1.fit(X_train, y_train)
    model1_results.append(model1.predict_proba(X_test))

    # model2.fit(X_train, y_train)
    # model2_results.append(model2.predict_proba(X_test))

    # model3.fit(X_train, y_train)
    # model3_results.append(model3.predict_proba(X_test))

    # model4.fit(X_train, y_train)
    # model4_results.append(model4.predict_proba(X_test))

    # model5.fit(X_train, y_train)
    # model5_results.append(model5.predict_proba(X_test))

    # model6.fit(X_train, y_train)
    # model6_results.append(model6.predict_proba(X_test))

    # model7.fit(X_train, y_train)
    # model7_results.append(model7.predict_proba(X_test))

    # True labels of the held-out rows, in the same fold order as the predictions.
    y_test_list.append(y_test)

    print(f'Done with fold {i+1}.')

In [None]:
%%time

model1_weights, model2_weights, model3_weights, model4_weights, model5_weights, model6_weights, model7_weights, scores = [], [], [], [], [], [], [], []

for i in tqdm(range(20000)):
    weight_1 = np.random.random_sample(size=1)[0]
    weight_2 = np.random.random_sample(size=1)[0]
    weight_3 = np.random.random_sample(size=1)[0]
    weight_4 = np.random.random_sample(size=1)[0]
    weight_5 = np.random.random_sample(size=1)[0]
    weight_6 = np.random.random_sample(size=1)[0]
    weight_7 = np.random.random_sample(size=1)[0]

    model1_weights.append(weight_1)
    model2_weights.append(weight_2)
    model3_weights.append(weight_3)
    model4_weights.append(weight_4)
    model5_weights.append(weight_5)
    model6_weights.append(weight_6)
    model7_weights.append(weight_7)

    scores_in = []

    for j in range(10):
        weighted_pred = weight_1 * model1_results[j] 
        # + weight_2 * model2_results[j] + weight_3 * model3_results[j] + weight_4 * model4_results[j] + weight_5 * model5_results[j] + weight_6 * model6_results[j] + weight_7 * model7_results[j]
        scores_in.append(log_loss(y_test_list[j], weighted_pred))
        
    scores.append(np.mean(scores_in))