In [30]:
import pandas as pd
import spacy
import numpy as np
import random
import seaborn as sns
import unicodedata

In [31]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier as GB
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import IsolationForest as IF
from sklearn.dummy import DummyClassifier as DC
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
from copy import deepcopy

In [33]:
import spacy

In [34]:
from tqdm import tqdm

In [35]:
# Enable tqdm's pandas integration for this kernel session
tqdm.pandas()

# Experiments for OoS datasets

In [36]:
# Development split: used below for the hyperparameter search and refit.
# The first CSV column is used as the DataFrame index.
oos_dev_set = pd.read_csv("data/dataset/oos_dev_set.csv", index_col=0)

In [37]:
# Test split: only used below to score the refitted models.
# The first CSV column is used as the DataFrame index.
oos_test_set = pd.read_csv("data/dataset/oos_test_set.csv", index_col=0)

In [38]:
# Name of the label column predicted in every experiment below
target = "explicit"

In [39]:
# Inspect the columns available in the development split
oos_dev_set.columns

Index(['lyrics', 'artist', 'song', 'album_name', 'popularity', 'danceability',
       'energy', 'key', 'mode', 'valence', 'tempo', 'duration_ms',
       'acousticness', 'liveness', 'loudness', 'speechiness', 'time_signature',
       'explicit', 'nb_genres', 'year'],
      dtype='object')

In [40]:
# Song attributes used as the "metadata" features: every dataset column except
# 'lyrics', 'artist', 'song', 'album_name' and the 'explicit' label.
metadata_cols = [
    'popularity',
    'danceability',
    'energy',
    'key',
    'mode',
    'valence',
    'tempo',
    'duration_ms',
    'acousticness',
    'liveness',
    'loudness',
    'speechiness',
    'time_signature',
    'nb_genres',
    'year',
]

# Custom Classifiers

In [41]:
# Isolation Forest binary classifier
class IsolationForestClassifier(ClassifierMixin, BaseEstimator):
    """Binary-classifier facade over scikit-learn's ``IsolationForest``.

    The wrapped detector is trained on the features only; the labels passed to
    ``fit`` are stored but never used for training. Predictions are remapped so
    that the detector's ``1`` becomes class ``0`` and any other value becomes
    class ``1``.

    ``get_params`` / ``set_params`` delegate to the wrapped detector, so
    hyperparameters such as ``n_estimators`` or ``contamination`` can be set
    directly on this object (e.g. through ``clf__n_estimators`` in a pipeline
    search space).

    Parameters
    ----------
    **params
        Keyword arguments forwarded unchanged to ``IsolationForest``.
    """

    def __init__(self, **params):
        self.estimator = IF(**params)

    def fit(self, X, y):
        """Fit the wrapped detector on ``X`` and return ``self``.

        ``y`` is kept on the instance but does not influence training.
        """
        self.X_ = X
        self.y_ = y
        # The two labels ``transform`` can emit. scikit-learn classifiers are
        # expected to expose ``classes_`` once fitted.
        self.classes_ = np.array([0, 1])

        self.estimator.fit(X)

        return self

    def transform(self, X):
        """Return 0/1 labels for ``X`` (1 = flagged by the detector)."""
        preds = np.asarray(self.estimator.predict(X))

        # Remap the IsolationForest output: 1 -> 0, anything else -> 1
        return np.where(preds == 1, 0, 1)

    def predict(self, X):
        """Alias of ``transform`` so the object can be used as a classifier."""
        return self.transform(X)

    def get_params(self, deep=True):
        """Return the wrapped detector's parameters."""
        return self.estimator.get_params(deep=deep)

    def set_params(self, **parameters):
        """Set parameters on the wrapped detector and return ``self``."""
        self.estimator.set_params(**parameters)
        return self

In [42]:
# Dictionary binary classifier
class DictionaryClassifier(ClassifierMixin, BaseEstimator):
    """Rule-based classifier: a sample is positive when its features sum to > 0.

    To behave as a dictionary lookup, the input must already be vectorised so
    that each column corresponds to one dictionary term (in this notebook this
    is done with a ``TfidfVectorizer`` restricted to a fixed vocabulary). A row
    with any positive entry then means "at least one dictionary term occurs".

    Parameters
    ----------
    random_state : optional
        Stored on the instance only; nothing in this class reads it.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state

    def fit(self, X, y):
        """Store ``X`` and ``y`` and return ``self``; there is nothing to learn."""
        self.X_ = X
        self.y_ = y
        # The two labels ``transform`` can emit. scikit-learn classifiers are
        # expected to expose ``classes_`` once fitted.
        self.classes_ = np.array([0, 1])

        return self

    def transform(self, X):
        """Return a 0/1 array with 1 for every row of ``X`` whose sum is > 0."""
        if getattr(X, "ndim", None) == 2:
            # Dense arrays and scipy sparse matrices: sum all rows in one call
            # instead of iterating over them in Python.
            row_totals = np.asarray(X.sum(axis=1)).ravel()
            return (row_totals > 0).astype(int)

        # Any other iterable of rows: sum each row individually
        return np.array([int(row.sum() > 0) for row in X])

    def predict(self, X):
        """Alias of ``transform`` so the object can be used as a classifier."""
        return self.transform(X)

In [43]:
# Settings shared by every RandomizedSearchCV run in this notebook
CV_NB = 5 # number of folds (passed as `cv`)
COMBINATION_NB = 100 # number of hyperparameter combinations sampled per algorithm (passed as `n_iter`)
METRICS = ["f1", "recall", "f1_macro", "recall_macro"] # scorers collected during cross-validation (passed as `scoring`)
FOCUS_METRIC = "f1" # scorer used to pick and refit the best candidate (passed as `refit`)

In [44]:
CLF_SEED = 42 # random_state passed to each classifier's constructor
CV_SEED = 30 # random_state passed to each RandomizedSearchCV object

# Metadata Only

In [45]:
# Features (metadata columns only) and labels of the development split
oos_dev_metadata = oos_dev_set[metadata_cols]
oos_dev_target = oos_dev_set[target]

In [46]:
# Search space of the metadata-only experiment. For every algorithm key:
#   'model'  -> estimator class to instantiate
#   'params' -> candidate values per hyperparameter; names are prefixed with the
#               pipeline step they belong to ('scaler__' or 'clf__')
# Sub-grids repeated across algorithms are declared once and unpacked below.
_meta_scaler_space = {'scaler__unit_variance': [True, False]}
_meta_n_estimators_space = {'clf__n_estimators': range(25, 201, 25)}
_meta_split_space = {
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12],
}

clfs = {
    "DUMMY": {
        'model': DC,
        'params': {'clf__strategy': ['most_frequent', 'uniform', 'stratified']},
    },
    "GB": {
        'model': GB,
        'params': {
            **_meta_scaler_space,
            # NOTE(review): newer scikit-learn releases call this loss
            # 'log_loss' instead of 'deviance' - confirm against the installed version.
            'clf__loss': ['deviance', 'exponential'],
            **_meta_n_estimators_space,
            'clf__learning_rate': np.geomspace(0.01, 0.2, 10),
            'clf__max_depth': [3, 4, 5],
            **_meta_split_space,
        },
    },
    "RF": {
        'model': RF,
        'params': {
            **_meta_scaler_space,
            'clf__criterion': ['gini', 'entropy'],
            **_meta_n_estimators_space,
            'clf__max_depth': range(3, 9, 1),
            **_meta_split_space,
            'clf__class_weight': ['balanced', 'balanced_subsample', None],
        },
    },
    "IF": {
        'model': IsolationForestClassifier,
        'params': {
            **_meta_scaler_space,
            **_meta_n_estimators_space,
            'clf__bootstrap': [True, False],
            'clf__contamination': ['auto', *list(np.geomspace(0.01, 0.15, 10))],
        },
    },
}

In [47]:
# One fitted RandomizedSearchCV per algorithm key
rcvs = {}

# Every algorithm is searched on the same development features and labels
X = oos_dev_metadata.values
y = oos_dev_target.values

for key, search_spec in clfs.items():
    print(key)

    chosen_clf = search_spec['model']
    possible_params = search_spec['params']

    # Robust scaling followed by the classifier under test
    pipe_model = Pipeline(steps=[
        ("scaler", RobustScaler()),
        ("clf", chosen_clf(random_state=CLF_SEED)),
    ])

    rcv = RandomizedSearchCV(
        pipe_model,
        possible_params,
        scoring=METRICS,
        refit=FOCUS_METRIC,
        n_iter=COMBINATION_NB,
        cv=CV_NB,
        n_jobs=-1,
        return_train_score=True,
        verbose=1,
        random_state=CV_SEED,
    )

    rcv.fit(X=X, y=y)
    rcvs[key] = rcv

    print("------------------------")

DUMMY
Fitting 5 folds for each of 3 candidates, totalling 15 fits
------------------------
GB
Fitting 5 folds for each of 100 candidates, totalling 500 fits




------------------------
RF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
IF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------


In [51]:
oos_test_metadata = oos_test_set[metadata_cols]
oos_test_target = oos_test_set[target]

# One row per algorithm: scores on the development split ("train_*"), scores on
# the test split ("test_*"), then details of the selected search candidate
oos_metadata_metrics = pd.DataFrame(columns=["train_f1", "train_recall", "train_f1_macro", "train_recall_macro",
                                             "test_f1", "test_recall", "test_f1_macro", "test_recall_macro",
                                             "best_cv_score", "refit_time", "best_params"])

for key, best_rcv in tqdm(rcvs.items()):

    metrics_row = []

    # Development split first, test split second - same order as the columns
    for split_features, split_target in ((oos_dev_metadata, oos_dev_target),
                                         (oos_test_metadata, oos_test_target)):
        y_true = split_target.values
        y_pred = best_rcv.predict(X=split_features.values)

        metrics_row.append(f1_score(y_true, y_pred))
        metrics_row.append(recall_score(y_true, y_pred))
        metrics_row.append(f1_score(y_true, y_pred, average='macro'))
        metrics_row.append(recall_score(y_true, y_pred, average='macro'))

    metrics_row.extend([best_rcv.best_score_, best_rcv.refit_time_, str(best_rcv.best_params_)])

    oos_metadata_metrics.loc[key] = metrics_row

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 10.17it/s]


In [52]:
# Show the metadata-only results table (one row per algorithm)
oos_metadata_metrics

Unnamed: 0,train_f1,train_recall,train_f1_macro,train_recall_macro,test_f1,test_recall,test_f1_macro,test_recall_macro,best_cv_score,refit_time,best_params
DUMMY,0.045812,0.434783,0.352738,0.467436,0.075567,0.5,0.355196,0.487705,0.057924,0.005316,{'clf__strategy': 'uniform'}
GB,0.487179,0.354037,0.738322,0.675603,0.222222,0.133333,0.600862,0.565176,0.133524,0.232417,"{'scaler__unit_variance': False, 'clf__n_estim..."
RF,0.438881,0.925466,0.702035,0.930084,0.33871,0.7,0.637273,0.795604,0.293175,0.670009,"{'scaler__unit_variance': True, 'clf__n_estima..."
IF,0.100678,0.322981,0.506481,0.588937,0.152542,0.3,0.53733,0.591133,0.091054,0.391407,"{'scaler__unit_variance': True, 'clf__n_estima..."


In [53]:
# Save the metadata-only results; the algorithm keys are written as the index column.
# NOTE(review): presumably the 'results' directory must already exist - confirm.
oos_metadata_metrics.to_csv('results/oos_metadata_metrics.csv', index=True)

# Lyrics Only

In [54]:
# Load the spaCy pipeline named 'pt_core_news_md'; the code below reads its
# default stop-word set
nlp = spacy.load('pt_core_news_md')

In [55]:
# Extend the pipeline's default stop-word set in place with 'pra' and 'pro'
nlp.Defaults.stop_words |= {'pra', 'pro'}

In [56]:
def _stopword_ascii_form(text):
    """Return ``text`` NFKD-decomposed with every non-ASCII character dropped."""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode("utf-8")


# Stop words in their cleaned (stripped, lower-cased) form ...
_clean_stopwords = {word.strip().lower() for word in nlp.Defaults.stop_words}
# ... plus the accent-free variant of each one, so both spellings are filtered
_ascii_stopwords = {_stopword_ascii_form(word) for word in _clean_stopwords}

pt_stopwords = list(_clean_stopwords | _ascii_stopwords)

In [57]:
# Read the offensive-word list, one raw line (trailing newline included) per entry.
# The encoding is pinned so accented words decode the same way on every platform.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('data/palavroes.txt', 'r', encoding='utf-8') as f:
    offensive_content = f.readlines()

In [58]:
def _offensive_ascii_form(text):
    """Return ``text`` NFKD-decomposed with every non-ASCII character dropped."""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode("utf-8")


# term -> column index, in the shape expected by the vectorizer's `vocabulary`
# parameter. Indices are always assigned as "next free index", so they stay
# unique and contiguous even when the list contains duplicates or when an
# accent-free variant is already listed.
offensive_words = {}
# Accent-free variants that differ from the listed spelling
extra_words = set()

for raw_line in offensive_content:
    changed_word = raw_line.strip().lower()
    if not changed_word:
        # Blank line: nothing to register
        continue

    offensive_words.setdefault(changed_word, len(offensive_words))

    decoded_word = _offensive_ascii_form(changed_word)
    if decoded_word and decoded_word != changed_word:
        extra_words.add(decoded_word)

# Sorted so the indices of the variants do not depend on set iteration order
for extra_word in sorted(extra_words):
    offensive_words.setdefault(extra_word, len(offensive_words))

In [59]:
# Features (single 'lyrics' column, kept as a one-column DataFrame) and labels
# of the development split
oos_dev_lyrics = oos_dev_set[['lyrics']]
oos_dev_target = oos_dev_set[target]

In [60]:
# Search space of the lyrics-only experiment. For every algorithm key:
#   'model'  -> estimator class to instantiate
#   'params' -> candidate values per hyperparameter; 'txt__vectorizer__*' names
#               address the TF-IDF vectorizer, 'clf__*' names the classifier

# TF-IDF options explored identically for GB, RF and IF
_lyrics_tfidf_space = {
    'txt__vectorizer__lowercase': [True],
    'txt__vectorizer__strip_accents': ['ascii', None],
    'txt__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
    'txt__vectorizer__max_df': np.geomspace(0.8, 1.0, 10),
    'txt__vectorizer__min_df': range(1, 5, 1),
    'txt__vectorizer__max_features': range(100, 1001, 100),
    'txt__vectorizer__binary': [True, False],
    'txt__vectorizer__norm': ['l1', 'l2'],
    'txt__vectorizer__use_idf': [True, False],
    'txt__vectorizer__smooth_idf': [True, False],
    'txt__vectorizer__sublinear_tf': [True, False],
}

clfs = {
    "DUMMY": {
        'model': DC,
        'params': {'clf__strategy': ['most_frequent', 'uniform', 'stratified']},
    },
    "GB": {
        'model': GB,
        'params': {
            **_lyrics_tfidf_space,
            # NOTE(review): newer scikit-learn releases call this loss
            # 'log_loss' instead of 'deviance' - confirm against the installed version.
            'clf__loss': ['deviance', 'exponential'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__learning_rate': np.geomspace(0.01, 0.2, 10),
            'clf__max_depth': [3, 4, 5],
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12],
        },
    },
    "RF": {
        'model': RF,
        'params': {
            **_lyrics_tfidf_space,
            'clf__criterion': ['gini', 'entropy'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__max_depth': range(3, 9, 1),
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12],
            'clf__class_weight': ['balanced', 'balanced_subsample', None],
        },
    },
    "IF": {
        'model': IsolationForestClassifier,
        'params': {
            **_lyrics_tfidf_space,
            'clf__n_estimators': range(25, 201, 25),
            'clf__bootstrap': [True, False],
            'clf__contamination': ['auto', *list(np.geomspace(0.01, 0.15, 10))],
        },
    },
    # Dictionary baseline: a single fixed configuration whose vectorizer
    # vocabulary is the offensive-word list
    "DIC": {
        'model': DictionaryClassifier,
        'params': {
            'txt__vectorizer__lowercase': [True],
            'txt__vectorizer__strip_accents': ['ascii'],
            'txt__vectorizer__ngram_range': [(1,1)],
            'txt__vectorizer__binary': [True],
            'txt__vectorizer__use_idf': [False],
            'txt__vectorizer__smooth_idf': [False],
            'txt__vectorizer__vocabulary': [offensive_words],
        },
    },
}

In [61]:
# One fitted RandomizedSearchCV per algorithm key
rcvs = {}

# Every algorithm is searched on the same lyrics column and labels
X = oos_dev_lyrics.values
y = oos_dev_target.values

for key, search_spec in list(clfs.items()):
    print(key)

    chosen_clf = search_spec['model']
    possible_params = search_spec['params']

    # Column 0 (the lyrics text) is vectorised with TF-IDF
    text_features = ColumnTransformer([
        ("vectorizer", TfidfVectorizer(stop_words=pt_stopwords), 0),
    ])

    pipe_model = Pipeline(steps=[
        ("txt", text_features),
        ("clf", chosen_clf(random_state=CLF_SEED)),
    ])

    rcv = RandomizedSearchCV(
        pipe_model,
        possible_params,
        scoring=METRICS,
        refit=FOCUS_METRIC,
        n_iter=COMBINATION_NB,
        cv=CV_NB,
        n_jobs=-1,
        return_train_score=True,
        verbose=1,
        random_state=CV_SEED,
    )

    rcv.fit(X=X, y=y)
    rcvs[key] = rcv

    print("------------------------")

DUMMY
Fitting 5 folds for each of 3 candidates, totalling 15 fits




------------------------
GB
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
RF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
IF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
DIC
Fitting 5 folds for each of 1 candidates, totalling 5 fits




------------------------


In [62]:
oos_test_lyrics = oos_test_set[['lyrics']]
oos_test_target = oos_test_set[target]

# One row per algorithm: scores on the development split ("train_*"), scores on
# the test split ("test_*"), then details of the selected search candidate
oos_lyrics_metrics = pd.DataFrame(columns=["train_f1", "train_recall", "train_f1_macro", "train_recall_macro",
                                             "test_f1", "test_recall", "test_f1_macro", "test_recall_macro",
                                             "best_cv_score", "refit_time", "best_params"])

for key, best_rcv in tqdm(rcvs.items()):

    metrics_row = []

    # Development split first, test split second - same order as the columns
    for split_features, split_target in ((oos_dev_lyrics, oos_dev_target),
                                         (oos_test_lyrics, oos_test_target)):
        y_true = split_target.values
        y_pred = best_rcv.predict(X=split_features.values)

        metrics_row.append(f1_score(y_true, y_pred))
        metrics_row.append(recall_score(y_true, y_pred))
        metrics_row.append(f1_score(y_true, y_pred, average='macro'))
        metrics_row.append(recall_score(y_true, y_pred, average='macro'))

    metrics_row.extend([best_rcv.best_score_, best_rcv.refit_time_, str(best_rcv.best_params_)])

    oos_lyrics_metrics.loc[key] = metrics_row

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.07it/s]


In [63]:
# Show the lyrics-only results table (one row per algorithm)
oos_lyrics_metrics

Unnamed: 0,train_f1,train_recall,train_f1_macro,train_recall_macro,test_f1,test_recall,test_f1_macro,test_recall_macro,best_cv_score,refit_time,best_params
DUMMY,0.045812,0.434783,0.352738,0.467436,0.075567,0.5,0.355196,0.487705,0.057924,0.639463,{'clf__strategy': 'uniform'}
GB,0.99375,0.987578,0.996787,0.993789,0.315789,0.3,0.643397,0.636587,0.192388,5.824811,"{'txt__vectorizer__use_idf': False, 'txt__vect..."
RF,0.497512,0.621118,0.739756,0.798083,0.351351,0.433333,0.657603,0.693567,0.373154,1.134479,"{'txt__vectorizer__use_idf': True, 'txt__vecto..."
IF,0.214085,0.236025,0.594663,0.60421,0.212121,0.233333,0.5866,0.595057,0.217085,1.470122,"{'txt__vectorizer__use_idf': False, 'txt__vect..."
DIC,0.259056,0.732919,0.598024,0.81054,0.290909,0.8,0.598163,0.817288,0.270514,0.648424,"{'txt__vectorizer__vocabulary': {'anus': 0, '-..."


In [64]:
# Save the lyrics-only results; the algorithm keys are written as the index column.
# NOTE(review): presumably the 'results' directory must already exist - confirm.
oos_lyrics_metrics.to_csv('results/oos_lyrics_metrics.csv', index=True)

# Lyrics and Metadata

In [65]:
# Features of the combined experiment: 'lyrics' is placed first (column 0),
# followed by the metadata columns in `metadata_cols` order
oos_dev_meta_lyrics = oos_dev_set[['lyrics', *metadata_cols]]
oos_dev_target = oos_dev_set[target]

In [66]:
# Search space of the lyrics + metadata experiment. For every algorithm key:
#   'model'  -> estimator class to instantiate
#   'params' -> candidate values per hyperparameter; 'col__scaler__*' and
#               'col__vectorizer__*' names address the two feature transformers,
#               'clf__*' names the classifier

# Feature-transformer options explored identically for GB, RF and IF
_meta_lyrics_feature_space = {
    'col__scaler__unit_variance': [True, False],
    'col__vectorizer__lowercase': [True],
    'col__vectorizer__strip_accents': ['ascii'],
    'col__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
    'col__vectorizer__max_df': np.geomspace(0.8, 1.0, 10),
    'col__vectorizer__min_df': range(1, 5, 1),
    'col__vectorizer__max_features': range(100, 1001, 100),
    'col__vectorizer__binary': [True, False],
    'col__vectorizer__norm': ['l1', 'l2'],
    'col__vectorizer__use_idf': [True, False],
    'col__vectorizer__smooth_idf': [True, False],
    'col__vectorizer__sublinear_tf': [True, False],
}

clfs = {
    "DUMMY": {
        'model': DC,
        'params': {'clf__strategy': ['most_frequent', 'uniform', 'stratified']},
    },
    "GB": {
        'model': GB,
        'params': {
            **_meta_lyrics_feature_space,
            # NOTE(review): newer scikit-learn releases call this loss
            # 'log_loss' instead of 'deviance' - confirm against the installed version.
            'clf__loss': ['deviance', 'exponential'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__learning_rate': np.geomspace(0.01, 0.2, 10),
            'clf__max_depth': [3, 4, 5],
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12],
        },
    },
    "RF": {
        'model': RF,
        'params': {
            **_meta_lyrics_feature_space,
            'clf__criterion': ['gini', 'entropy'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__max_depth': range(3, 9, 1),
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12],
            'clf__class_weight': ['balanced', 'balanced_subsample', None],
        },
    },
    "IF": {
        'model': IsolationForestClassifier,
        'params': {
            **_meta_lyrics_feature_space,
            'clf__n_estimators': range(25, 201, 25),
            'clf__bootstrap': [True, False],
            'clf__contamination': ['auto', *list(np.geomspace(0.01, 0.15, 10))],
        },
    },
    # Dictionary baseline: a single fixed configuration whose vectorizer
    # vocabulary is the offensive-word list (no scaler parameters)
    "DIC": {
        'model': DictionaryClassifier,
        'params': {
            'col__vectorizer__lowercase': [True],
            'col__vectorizer__strip_accents': ['ascii'],
            'col__vectorizer__ngram_range': [(1,1)],
            'col__vectorizer__binary': [True],
            'col__vectorizer__use_idf': [False],
            'col__vectorizer__smooth_idf': [False],
            'col__vectorizer__vocabulary': [offensive_words],
        },
    },
}

In [67]:
# One fitted RandomizedSearchCV per algorithm key
rcvs = {}

# Labels are the same for every algorithm
y = oos_dev_target.values

# In `oos_dev_meta_lyrics` the lyrics are column 0 and the metadata columns
# follow, so their positions are derived from `metadata_cols` instead of being
# hard-coded.
metadata_positions = list(range(1, len(metadata_cols) + 1))

for key in list(clfs.keys()):
    print(key)

    chosen_clf = clfs[key]['model']
    possible_params = clfs[key]['params']

    # Column 0 (the lyrics text) is always vectorised with TF-IDF
    transformers = [("vectorizer", TfidfVectorizer(stop_words=pt_stopwords), 0)]

    if key != 'DIC':
        # Scaled metadata goes first, ahead of the TF-IDF features
        transformers.insert(0, ("scaler", RobustScaler(), metadata_positions))
        X = oos_dev_meta_lyrics.values
    else:
        # The dictionary baseline only looks at the lyrics
        X = oos_dev_lyrics.values

    pipe_model = Pipeline(steps=[("col", ColumnTransformer(transformers)),
                                 ("clf", chosen_clf(random_state=CLF_SEED))])

    rcv = RandomizedSearchCV(pipe_model, possible_params,
                             scoring=METRICS,
                             refit=FOCUS_METRIC,
                             n_iter=COMBINATION_NB, cv=CV_NB, n_jobs=-1,
                             return_train_score=True,
                             verbose=1, random_state=CV_SEED)

    rcv.fit(X=X, y=y)

    rcvs[key] = rcv

    print("------------------------")

DUMMY
Fitting 5 folds for each of 3 candidates, totalling 15 fits




------------------------
GB
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
RF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
IF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
DIC
Fitting 5 folds for each of 1 candidates, totalling 5 fits




------------------------


In [68]:
oos_test_meta_lyrics = oos_test_set[['lyrics', *metadata_cols]]
oos_test_target = oos_test_set[target]

# One row per algorithm: scores on the development split ("train_*"), scores on
# the test split ("test_*"), then details of the selected search candidate
oos_meta_lyrics_metrics = pd.DataFrame(columns=["train_f1", "train_recall", "train_f1_macro", "train_recall_macro",
                                             "test_f1", "test_recall", "test_f1_macro", "test_recall_macro",
                                             "best_cv_score", "refit_time", "best_params"])


def _f1_and_recall(y_true, y_pred):
    """Return [f1, recall, macro f1, macro recall] for one set of predictions."""
    return [f1_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred, average='macro'),
            recall_score(y_true, y_pred, average='macro')]


for key, best_rcv in tqdm(rcvs.items()):

    # The dictionary baseline was fitted on the lyrics column alone, so it must
    # be scored on the lyrics-only frames; every other model takes lyrics + metadata.
    if key != 'DIC':
        dev_features, test_features = oos_dev_meta_lyrics, oos_test_meta_lyrics
    else:
        dev_features, test_features = oos_dev_lyrics, oos_test_lyrics

    y_dev_pred = best_rcv.predict(X=dev_features.values)
    train_scores = _f1_and_recall(oos_dev_target.values, y_dev_pred)

    y_test_pred = best_rcv.predict(X=test_features.values)
    test_scores = _f1_and_recall(oos_test_target.values, y_test_pred)

    oos_meta_lyrics_metrics.loc[key] = [*train_scores,
                                        *test_scores,
                                        best_rcv.best_score_, best_rcv.refit_time_, str(best_rcv.best_params_)]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.13s/it]


In [69]:
# Show the lyrics + metadata results table (one row per algorithm)
oos_meta_lyrics_metrics

Unnamed: 0,train_f1,train_recall,train_f1_macro,train_recall_macro,test_f1,test_recall,test_f1_macro,test_recall_macro,best_cv_score,refit_time,best_params
DUMMY,0.045812,0.434783,0.352738,0.467436,0.075567,0.5,0.355196,0.487705,0.057924,0.665721,{'clf__strategy': 'uniform'}
GB,1.0,1.0,1.0,1.0,0.472727,0.433333,0.725599,0.707725,0.178096,8.862804,"{'col__vectorizer__use_idf': False, 'col__vect..."
RF,0.440735,0.819876,0.705175,0.882863,0.311927,0.566667,0.626961,0.737134,0.362555,2.622577,"{'col__vectorizer__use_idf': True, 'col__vecto..."
IF,0.197183,0.217391,0.585946,0.594627,0.205882,0.233333,0.582701,0.593567,0.254769,2.307955,"{'col__vectorizer__use_idf': False, 'col__vect..."
DIC,0.259056,0.732919,0.598024,0.81054,0.290909,0.8,0.598163,0.817288,0.270514,0.702295,"{'col__vectorizer__vocabulary': {'anus': 0, '-..."


In [70]:
# Save the lyrics + metadata results; the algorithm keys are written as the index column.
# NOTE(review): presumably the 'results' directory must already exist - confirm.
oos_meta_lyrics_metrics.to_csv('results/oos_meta_lyrics_metrics.csv', index=True)