In [1]:
import datetime
import glob
import json
import os
import pickle
import shutil
import sys

module_path = os.path.abspath(os.path.join("../../.."))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.config import PATH_BEST_MODELS, PICKLE_PROTOCOL  # noqa: E402

In [2]:
# Corpus whose best models are loaded and saved by the cells below; both values
# are used as path components under PATH_BEST_MODELS.
CORPUS_KIND = "reddit"
CORPUS_NAME = "depression"

In [3]:
def get_dataframe_best_models(corpus_name, corpus_kind, measure="positive_f1"):
    """Load the most recent best-models comparison stored for a corpus.

    Looks in ``PATH_BEST_MODELS/<measure>/<corpus_kind>/<corpus_name>`` for files
    named ``result_comparison_<YYYY_MM_DD>.pkl`` and unpickles the one with the
    latest date.

    Parameters
    ----------
    corpus_name : str
        Name of the corpus (last directory component).
    corpus_kind : str
        Kind of the corpus (directory component before the corpus name).
    measure : str, default "positive_f1"
        Measure whose results directory is inspected.

    Returns
    -------
    object or None
        The unpickled object (a DataFrame in this notebook), or ``None`` when
        no dated comparison file exists.
    """
    base_path = os.path.join(PATH_BEST_MODELS, measure, corpus_kind, corpus_name)
    prefix = "result_comparison_"
    suffix = ".pkl"
    date_format = "%Y_%m_%d"

    last_date = None
    last_file_path = None
    for candidate_path in glob.glob(os.path.join(base_path, f"{prefix}*{suffix}")):
        candidate_name = os.path.basename(candidate_path)
        # Whatever sits between the fixed prefix and the extension must be the date.
        date_str = candidate_name[len(prefix) : -len(suffix)]
        try:
            current_date = datetime.datetime.strptime(date_str, date_format)
        except ValueError:
            # The glob also matches files without a date (e.g. a manual copy);
            # they cannot be ranked, so they are skipped instead of crashing.
            continue

        if last_date is None or last_date < current_date:
            last_date = current_date
            # Remember the file that was actually found rather than rebuilding
            # its name from the parsed date.
            last_file_path = candidate_path

    if last_file_path is None:
        print("No file with the best models run was found.")
        return None

    print(
        f"Loading the DataFrame with the best models for the measure {measure}: {last_file_path}."
    )
    # NOTE: unpickling executes arbitrary code; only load files produced by
    # this project.
    with open(last_file_path, "rb") as f:
        dataframe = pickle.load(f)
    return dataframe

In [4]:
# Load the latest stored comparison of the best models for the configured
# corpus (measure: positive_f1) and display it.
df = get_dataframe_best_models(
    corpus_name=CORPUS_NAME, corpus_kind=CORPUS_KIND, measure="positive_f1"
)
df

Loading the DataFrame with the best models for the measure positive_f1: /home2/loyola/unsl_erisk_2022/best_models/positive_f1/reddit/depression/result_comparison_2022_06_10.pkl.


Unnamed: 0,model_identifier,corpus_kind,corpus_name,representation,representation_information,train_file_path,random_seed,classifier_type,classifier_params,classification_report,...,accuracy,confusion_matrix,elapsed_mins,elapsed_secs,total_secs,file_name,current_measure,positive_recall,positive_precision,positive_f1
740,740.0,reddit,depression,bow,{'CountVectorizer_params': {'analyzer': 'word'...,/home2/loyola/unsl_erisk_2022/data/processed/r...,30.0,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",precision recall f1-score ...,...,0.934880,"[[593, 40], [33, 455]]",0.0,1.0,1.0,0150_model_information,positive_f1,0.93,0.92,0.93
741,741.0,reddit,depression,bow,{'CountVectorizer_params': {'analyzer': 'char_...,/home2/loyola/unsl_erisk_2022/data/processed/r...,16.0,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",precision recall f1-score ...,...,0.873327,"[[522, 111], [31, 457]]",0.0,0.0,0.0,0046_model_information,positive_f1,0.94,0.80,0.87
742,742.0,reddit,depression,bow,{'CountVectorizer_params': {'analyzer': 'word'...,/home2/loyola/unsl_erisk_2022/data/processed/r...,30.0,SVC,"{'C': 8, 'break_ties': False, 'cache_size': 20...",precision recall f1-score ...,...,0.928635,"[[593, 40], [40, 448]]",0.0,13.0,13.0,0180_model_information,positive_f1,0.92,0.92,0.92
743,743.0,reddit,depression,bow,{'CountVectorizer_params': {'analyzer': 'word'...,/home2/loyola/unsl_erisk_2022/data/processed/r...,28.0,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",precision recall f1-score ...,...,0.930419,"[[596, 37], [41, 447]]",0.0,1.0,1.0,0148_model_information,positive_f1,0.92,0.92,0.92
744,744.0,reddit,depression,bow,{'CountVectorizer_params': {'analyzer': 'char_...,/home2/loyola/unsl_erisk_2022/data/processed/r...,16.0,LogisticRegression,"{'C': 128, 'class_weight': None, 'dual': False...",precision recall f1-score ...,...,0.926851,"[[589, 44], [38, 450]]",0.0,0.0,0.0,0076_model_information,positive_f1,0.92,0.91,0.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,1475.0,reddit,depression,bert_tokenizer,PreTrainedTokenizer(name_or_path='roberta-base...,/home2/loyola/unsl_erisk_2022/data/interim/red...,9.0,BERT,{'model_architecture': 'BERT(  (encoder): Rob...,precision recall f1-score ...,...,0.885816,"[[548, 85], [43, 445]]",288.0,39.0,17319.0,09_model_information,positive_f1,0.91,0.84,0.87
1476,1476.0,reddit,depression,bert_tokenizer,PreTrainedTokenizer(name_or_path='roberta-base...,/home2/loyola/unsl_erisk_2022/data/interim/red...,3.0,BERT,{'model_architecture': 'BERT(  (encoder): Rob...,precision recall f1-score ...,...,0.884924,"[[547, 86], [43, 445]]",287.0,38.0,17258.0,03_model_information,positive_f1,0.91,0.84,0.87
1477,1477.0,reddit,depression,bert_tokenizer,PreTrainedTokenizer(name_or_path='roberta-base...,/home2/loyola/unsl_erisk_2022/data/interim/red...,14.0,BERT,{'model_architecture': 'BERT(  (encoder): Rob...,precision recall f1-score ...,...,0.887600,"[[550, 83], [43, 445]]",288.0,41.0,17321.0,14_model_information,positive_f1,0.91,0.84,0.88
1478,1478.0,reddit,depression,bert_tokenizer,PreTrainedTokenizer(name_or_path='roberta-base...,/home2/loyola/unsl_erisk_2022/data/interim/red...,8.0,BERT,{'model_architecture': 'BERT(  (encoder): Rob...,precision recall f1-score ...,...,0.884924,"[[548, 85], [44, 444]]",288.0,43.0,17323.0,08_model_information,positive_f1,0.91,0.84,0.87


In [5]:
# Highest positive_f1 reached by each (representation, classifier_type) pair.
df.groupby(by=["representation", "classifier_type"])["positive_f1"].max()

representation     classifier_type       
bert_tokenizer     BERT                      0.88
bow                DecisionTreeClassifier    0.90
                   KNeighborsClassifier      0.87
                   LogisticRegression        0.92
                   MLPClassifier             0.91
                   RandomForestClassifier    0.93
                   SVC                       0.92
doc2vec            DecisionTreeClassifier    0.82
                   KNeighborsClassifier      0.71
                   LogisticRegression        0.92
                   MLPClassifier             0.92
                   RandomForestClassifier    0.90
                   SVC                       0.91
lda                DecisionTreeClassifier    0.89
                   KNeighborsClassifier      0.90
                   LogisticRegression        0.90
                   MLPClassifier             0.92
                   RandomForestClassifier    0.92
                   SVC                       0.91
lsa     

In [6]:
def have_same_parameters(parameters_dict, input_file):
    """Tell whether a JSON file holds the same parameters as a dictionary.

    Both sides are rendered as JSON text with sorted keys and compared as
    strings, so key order is irrelevant while list order still matters.

    Parameters
    ----------
    parameters_dict : dict
        JSON-serializable parameters to look for.
    input_file : str
        Path of the JSON file to compare against.

    Returns
    -------
    bool
        ``True`` when both canonical JSON texts are identical.
    """
    # Serialize the in-memory parameters first, exactly as the file content
    # will be serialized below.
    expected_text = json.dumps(parameters_dict, sort_keys=True)

    with open(input_file) as stream:
        stored_parameters = json.load(stream)
    stored_text = json.dumps(stored_parameters, sort_keys=True)

    return stored_text == expected_text

In [7]:
def get_id(
    model_information, representation_information, measure, corpus_kind, corpus_name
):
    """Find the id of an already saved model or allocate the next free one.

    Scans ``PATH_BEST_MODELS/<measure>/<corpus_kind>/<corpus_name>/selected_models``
    for ``<id>_model*.json`` files. A model counts as already saved when both
    its model JSON and a ``<id>_representation_*.json`` file with the same id
    hold the given parameters (compared with ``have_same_parameters``).

    Parameters
    ----------
    model_information : dict
        Classifier parameters of the candidate model.
    representation_information : dict
        Representation parameters of the candidate model.
    measure, corpus_kind, corpus_name : str
        Path components that locate the ``selected_models`` directory.

    Returns
    -------
    tuple[int, bool]
        ``(model_id, already_exists)``: the id of the matching saved model and
        ``True``, or the highest id found plus one and ``False``.
    """
    base_path = os.path.join(
        PATH_BEST_MODELS, measure, corpus_kind, corpus_name, "selected_models"
    )

    max_id = 0
    # Sorted so that the reported match is deterministic across file systems.
    for file_path in sorted(glob.glob(os.path.join(base_path, "*_model*.json"))):
        file_name = os.path.basename(file_path)
        # The id is everything before the first underscore; slicing two
        # characters would mis-read ids with three or more digits.
        try:
            current_id = int(file_name.split("_", 1)[0])
        except ValueError:
            # Not a "<id>_model..." file written by save_best_model.
            continue
        max_id = max(max_id, current_id)

        if not have_same_parameters(model_information, file_path):
            continue

        # An interrupted save can leave a model JSON without its representation
        # JSON; treat that as "no match" instead of failing on an empty glob.
        representation_file_paths = sorted(
            glob.glob(
                os.path.join(base_path, f"{current_id:02d}_representation_*.json")
            )
        )
        if any(
            have_same_parameters(representation_information, representation_file_path)
            for representation_file_path in representation_file_paths
        ):
            print(f"The model already exists in the path {file_path}.")
            return current_id, True

    return max_id + 1, False

In [8]:
def save_best_model(
    data,
    representation,
    classifier_type,
    corpus_name,
    corpus_kind,
    measure="positive_f1",
):
    """Copy the best model of a (representation, classifier) pair to ``selected_models``.

    The best row of ``data`` for the given pair is picked and its artefacts are
    stored under
    ``PATH_BEST_MODELS/<measure>/<corpus_kind>/<corpus_name>/selected_models``
    with the prefix ``<id>_`` (id obtained from ``get_id``):

    * ``<id>_model_<classifier_type>.pkl`` (or ``.pt`` for ``EmbeddingLSTM`` and
      ``BERT``): the trained model.
    * ``<id>_representation_<representation>.pkl``: the representation model
      (for ``BERT`` a pickled ``None``), plus auxiliary files and the id2word
      bigram model for ``lda``/``lsa``.
    * ``<id>_model_<classifier_type>.json`` and
      ``<id>_representation_<representation>.json``: the parameters. They are
      written last because ``get_id`` uses them to decide that a model is
      already saved; an interrupted run therefore does not look complete.

    Parameters
    ----------
    data : pandas.DataFrame
        Comparison table with, at least, the columns ``representation``,
        ``classifier_type``, ``representation_information``,
        ``classifier_params``, ``train_file_path``, ``file_name`` and the
        score column.
    representation : str
        Representation to select. For classifiers other than ``EmbeddingLSTM``
        and ``BERT`` it must be ``bow``, ``lda``, ``lsa`` or ``doc2vec``.
    classifier_type : str
        Classifier to select.
    corpus_name, corpus_kind : str
        Path components of the corpus.
    measure : str, default "positive_f1"
        Measure used for the paths and, when ``data`` has a column with that
        name, for ranking the models; otherwise ``positive_f1`` ranks them.

    Raises
    ------
    ValueError
        If the representation is not supported for the classifier, or if
        ``data`` holds no row for the requested pair.
    """
    neural_classifier_types = ("EmbeddingLSTM", "BERT")
    supported_representations = ("bow", "lda", "lsa", "doc2vec")
    is_neural = classifier_type in neural_classifier_types

    # Reject requests that cannot be completed before anything is written.
    if not is_neural and representation not in supported_representations:
        raise ValueError(
            f"Unsupported representation {representation!r} for the classifier "
            f"{classifier_type!r}; expected one of {supported_representations}."
        )

    selected_data = data[
        (data["representation"] == representation)
        & (data["classifier_type"] == classifier_type)
    ]
    if selected_data.empty:
        raise ValueError(
            f"No models found for the representation {representation!r} and the "
            f"classifier type {classifier_type!r}."
        )

    # Rank by the requested measure when the table has it; fall back to the
    # historical positive_f1 column otherwise.
    score_column = measure if measure in selected_data.columns else "positive_f1"
    best_model = selected_data.iloc[selected_data[score_column].argmax(), :]

    representation_information = best_model["representation_information"]
    model_information = best_model["classifier_params"]
    train_file_path = best_model["train_file_path"]
    file_name = best_model["file_name"]

    model_id, already_exists = get_id(
        model_information=model_information,
        representation_information=representation_information,
        measure=measure,
        corpus_kind=corpus_kind,
        corpus_name=corpus_name,
    )
    if already_exists:
        print("The model was saved previously.")
        return

    # Serialize the metadata up front so that a non-serializable value fails
    # before any artefact is copied.
    model_information_json = json.dumps(model_information, indent="\t")
    representation_information_json = json.dumps(
        representation_information, indent="\t"
    )

    output_partial_path = os.path.join(
        PATH_BEST_MODELS, measure, corpus_kind, corpus_name, "selected_models"
    )
    os.makedirs(output_partial_path, exist_ok=True)

    id_prefix = f"{model_id:02d}"
    representation_path = os.path.join(
        output_partial_path, f"{id_prefix}_representation_{representation}.pkl"
    )

    # Directory holding the artefacts of every run for this representation.
    base_path = os.path.join(
        PATH_BEST_MODELS, measure, corpus_kind, corpus_name, representation
    )
    # `file_name` is "<run>_model_information"; the other artefacts of the run
    # share the "<run>" part.
    run_prefix = file_name[: -len("_model_information")]

    if not is_neural:
        model_parameters_file_path = os.path.join(
            base_path, run_prefix + "_model_and_report.pkl"
        )
        # NOTE: unpickling executes arbitrary code; only load files produced by
        # this project. The stored tuple has nine items, the model being first.
        with open(model_parameters_file_path, "rb") as f:
            model, _, _, _, _, _, _, _, _ = pickle.load(f)

        model_path = os.path.join(
            output_partial_path, f"{id_prefix}_model_{classifier_type}.pkl"
        )
        with open(model_path, "wb") as f:
            pickle.dump(model, f, protocol=PICKLE_PROTOCOL)

        if representation == "bow":
            input_representation_path = os.path.abspath(
                train_file_path[: -len("_train.pkl")] + "_features_models.pkl"
            )
        elif representation in ("lda", "lsa"):
            partial_path = os.path.dirname(train_file_path)
            # "<...>_corpus_<...>_train.pkl" -> "<...>_model_<...>.pkl"
            input_representation_name = os.path.basename(train_file_path).replace(
                "_corpus_", "_model_"
            )
            input_representation_name = (
                input_representation_name[: -len("_train.pkl")] + ".pkl"
            )
            input_representation_path = os.path.join(
                partial_path, input_representation_name
            )

            # Copy every companion file "<model>.pkl.<extra>" next to the model,
            # keeping the "<extra>" part. The literal part of the pattern is
            # escaped so special characters in the path are not globbed.
            aux_pattern = glob.escape(input_representation_path) + ".*"
            for aux_file in glob.glob(aux_pattern):
                aux_file_suffix = os.path.basename(aux_file)[
                    len(input_representation_name) :
                ]
                aux_file_new_path = os.path.join(
                    output_partial_path,
                    os.path.basename(representation_path) + aux_file_suffix,
                )
                shutil.copy2(aux_file, aux_file_new_path)

            id2word_bigram_model_path = os.path.join(
                partial_path,
                input_representation_name.replace(representation, "id2word_bigram"),
            )
            print(id2word_bigram_model_path)

            new_id2word_bigram_model_path = os.path.join(
                output_partial_path,
                f"{id_prefix}_representation_{representation}_id2word_bigram_model.pkl",
            )
            shutil.copy2(id2word_bigram_model_path, new_id2word_bigram_model_path)
        else:  # doc2vec (guaranteed by the validation above)
            input_representation_path = os.path.abspath(
                train_file_path[: -len("_train.pkl")] + ".model"
            )

        shutil.copy2(input_representation_path, representation_path)
    else:
        model_parameters_file_path = os.path.join(
            base_path, run_prefix + "_model_parameters.pt"
        )
        model_path = os.path.join(
            output_partial_path, f"{id_prefix}_model_{classifier_type}.pt"
        )
        shutil.copy2(model_parameters_file_path, model_path)

        if classifier_type == "EmbeddingLSTM":
            vocabulary_name = (
                os.path.basename(train_file_path)[: -len("_train.pt")]
                + "_vocabulary.pkl"
            )
            input_representation_path = os.path.join(
                os.path.dirname(train_file_path), vocabulary_name
            )
            shutil.copy2(input_representation_path, representation_path)
        else:
            # BERT has no separate representation artefact; a pickled None keeps
            # the file layout uniform.
            with open(representation_path, "wb") as f:
                pickle.dump(None, f, protocol=PICKLE_PROTOCOL)

    # Metadata goes last: its presence marks the model as completely saved.
    model_information_path = os.path.join(
        output_partial_path, f"{id_prefix}_model_{classifier_type}.json"
    )
    print(f"Saving the model information in {model_information_path}.")
    with open(model_information_path, "w") as f:
        f.write(model_information_json)

    representation_information_path = os.path.join(
        output_partial_path, f"{id_prefix}_representation_{representation}.json"
    )
    with open(representation_information_path, "w") as f:
        f.write(representation_information_json)

In [9]:
# TODO: Choose the best models to save for each representation.
# Save, in this order, the best "bow" model of each chosen classifier.
for bow_classifier_type in ("LogisticRegression", "RandomForestClassifier", "SVC"):
    save_best_model(
        data=df,
        representation="bow",
        classifier_type=bow_classifier_type,
        corpus_name=CORPUS_NAME,
        corpus_kind=CORPUS_KIND,
        measure="positive_f1",
    )

Saving the model information in /home2/loyola/unsl_erisk_2022/best_models/positive_f1/reddit/depression/selected_models/01_model_LogisticRegression.json.
Saving the model information in /home2/loyola/unsl_erisk_2022/best_models/positive_f1/reddit/depression/selected_models/02_model_RandomForestClassifier.json.
Saving the model information in /home2/loyola/unsl_erisk_2022/best_models/positive_f1/reddit/depression/selected_models/03_model_SVC.json.


In [10]:
# Save the best "doc2vec" model that uses an MLPClassifier.
save_best_model(
    data=df,
    representation="doc2vec",
    classifier_type="MLPClassifier",
    corpus_name=CORPUS_NAME,
    corpus_kind=CORPUS_KIND,
    measure="positive_f1",
)

Saving the model information in /home2/loyola/unsl_erisk_2022/best_models/positive_f1/reddit/depression/selected_models/04_model_MLPClassifier.json.


In [11]:
# Save the best "lda" model that uses a RandomForestClassifier.
save_best_model(
    data=df,
    representation="lda",
    classifier_type="RandomForestClassifier",
    corpus_name=CORPUS_NAME,
    corpus_kind=CORPUS_KIND,
    measure="positive_f1",
)

Saving the model information in /home2/loyola/unsl_erisk_2022/best_models/positive_f1/reddit/depression/selected_models/05_model_RandomForestClassifier.json.
/home2/loyola/unsl_erisk_2022/data/processed/reddit/depression/lda/depression_id2word_bigram_model_25topics.pkl


In [12]:
# Save the best "lsa" model that uses LogisticRegression.
save_best_model(
    data=df,
    representation="lsa",
    classifier_type="LogisticRegression",
    corpus_name=CORPUS_NAME,
    corpus_kind=CORPUS_KIND,
    measure="positive_f1",
)

Saving the model information in /home2/loyola/unsl_erisk_2022/best_models/positive_f1/reddit/depression/selected_models/06_model_LogisticRegression.json.
/home2/loyola/unsl_erisk_2022/data/processed/reddit/depression/lsa/depression_id2word_bigram_model_100factors.pkl
