In [1]:
import pandas as pd
import spacy
import numpy as np
import random
import seaborn as sns
import unicodedata

In [2]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier as GB
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import IsolationForest as IF
from sklearn.dummy import DummyClassifier as DC
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from copy import deepcopy

In [4]:
import spacy

In [5]:
from tqdm import tqdm

In [6]:
tqdm.pandas()

In [7]:
oos_dev_set = pd.read_csv("data/dataset/oos_dev_set.csv", index_col=0)

In [8]:
oos_test_set = pd.read_csv("data/dataset/oos_test_set.csv", index_col=0)

In [9]:
target = "explicit"

In [10]:
oos_dev_set.columns

Index(['lyrics', 'artist', 'song', 'album_name', 'popularity', 'danceability',
       'energy', 'key', 'mode', 'valence', 'tempo', 'duration_ms',
       'acousticness', 'liveness', 'loudness', 'speechiness', 'time_signature',
       'explicit', 'nb_genres', 'year'],
      dtype='object')

In [11]:
metadata_cols = ['popularity', 'danceability','energy', 'key', 'mode', 'valence', 'tempo', 'duration_ms',
'acousticness', 'liveness', 'loudness', 'speechiness', 'time_signature', 'nb_genres', 'year']

# Custom Classifiers

In [12]:
class IsolationForestClassifier(ClassifierMixin, BaseEstimator):
    
    def __init__(self, **params):
        self.estimator = IF(**params)

    def fit(self, X, y):

        self.X_ = X
        self.y_ = y
        
        self.estimator.fit(X)
        
        return self
    
    def transform(self, X):
        
        preds = self.estimator.predict(X)

        return np.array(list(map(lambda p: 0 if p==1 else 1, preds)))
    
    def predict(self, X):
        return self.transform(X)
    
    def get_params(self, deep=True):
        return self.estimator.get_params()
    
    def set_params(self, **parameters):
        self.estimator.set_params(**parameters)
        return self

In [13]:
class DictionaryClassifier(ClassifierMixin, BaseEstimator):

    def fit(self, X, y):

        self.X_ = X
        self.y_ = y
                        
        return self
    
    def transform(self, X):
        
        preds = [int(x.sum() > 0) for x in X]

        return np.array(preds)
    
    def predict(self, X):
        return self.transform(X)

In [14]:
CV_NB = 5
COMBINATION_NB = 100
METRICS = ["f1", "recall", "f1_macro", "recall_macro"]
FOCUS_METRIC = "f1"

# Metadata Only

In [15]:
oos_dev_metadata = oos_dev_set[metadata_cols]
oos_dev_target = oos_dev_set[target]

In [16]:
clfs = {
    "DUMMY": {
        'model': DC(),
        'params': {
            'clf__strategy': ['most_frequent', 'uniform', 'stratified']
        }
    },
    "GB": {
        'model': GB(),
        'params': {
            'scaler__unit_variance': [True, False],
            'clf__loss': ['deviance', 'exponential'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__learning_rate': np.geomspace(0.01, 0.2, 10),
            'clf__max_depth': [3, 4, 5],
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12]
        }
    },
    "RF": {
        'model': RF(),
        'params': {
            'scaler__unit_variance': [True, False],
            'clf__criterion': ['gini', 'entropy'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__max_depth': range(3, 9, 1),
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12],
            'clf__class_weight': ['balanced', 'balanced_subsample', None]
        }
    },
    "IF":{
        'model': IsolationForestClassifier(),
        'params': {
            'scaler__unit_variance': [True, False],
            'clf__n_estimators': range(25, 201, 25),
            'clf__contamination': ['auto', *list(np.geomspace(0.01, 0.15, 10))]
        }
    }
}

In [17]:
rcvs = {}

for key in clfs.keys():
    print(key)
    
    np.random.seed(32)
    random.seed(24)
    
    chosen_clf = clfs[key]['model']
    possible_params = clfs[key]['params']
    
    pipe_model = Pipeline(steps=[("scaler", RobustScaler()),
                                ("clf", chosen_clf)]
                )
        
    rcv = RandomizedSearchCV(pipe_model, possible_params,
                           scoring=METRICS,
                           refit=FOCUS_METRIC,
                           n_iter=COMBINATION_NB, cv=CV_NB, n_jobs=-1,
                           return_train_score=True,
                           verbose=1)

    X = oos_dev_metadata.values
    y = oos_dev_target.values
    
    rcv.fit(X=X, y=y)
    
    rcvs[key] = rcv
    
    print("------------------------")

DUMMY
Fitting 5 folds for each of 3 candidates, totalling 15 fits




------------------------
GB
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
RF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
IF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------


In [18]:
oos_test_metadata = oos_test_set[metadata_cols]
oos_test_target = oos_test_set[target]

oos_metadata_metrics = pd.DataFrame(columns=["train_f1", "train_recall", "train_f1_macro", "train_recall_macro",
                                             "test_f1", "test_recall", "test_f1_macro", "test_recall_macro",
                                             "best_cv_score", "refit_time", "best_params"])

for key, best_rcv in tqdm(rcvs.items()):
        
    np.random.seed(32)
    random.seed(24)
    
    y_dev_pred = best_rcv.predict(X=oos_dev_metadata.values)
    train_f1_bin = f1_score(oos_dev_target.values, y_dev_pred)
    train_recall_bin = recall_score(oos_dev_target.values, y_dev_pred)
    train_f1_macro = f1_score(oos_dev_target.values, y_dev_pred, average='macro')
    train_recall_macro = recall_score(oos_dev_target.values, y_dev_pred, average='macro')

    y_test_pred = best_rcv.predict(X=oos_test_metadata.values)
    test_f1_bin = f1_score(oos_test_target.values, y_test_pred)
    test_recall_bin = recall_score(oos_test_target.values, y_test_pred)
    test_f1_macro = f1_score(oos_test_target.values, y_test_pred, average='macro')
    test_recall_macro = recall_score(oos_test_target.values, y_test_pred, average='macro')
    
    oos_metadata_metrics.loc[key] = [train_f1_bin, train_recall_bin, train_f1_macro, train_recall_macro,
                                     test_f1_bin, test_recall_bin, test_f1_macro, test_recall_macro,
                                     best_rcv.best_score_, best_rcv.refit_time_, str(best_rcv.best_params_)]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.02it/s]


In [19]:
oos_metadata_metrics

Unnamed: 0,train_f1,train_recall,train_f1_macro,train_recall_macro,test_f1,test_recall,test_f1_macro,test_recall_macro,best_cv_score,refit_time,best_params
DUMMY,0.057629,0.546584,0.360903,0.525106,0.08596,0.5,0.391508,0.523472,0.052158,0.006168,{'clf__strategy': 'uniform'}
GB,0.707692,0.571429,0.850502,0.785095,0.171429,0.1,0.575107,0.54851,0.120659,0.237554,"{'scaler__unit_variance': False, 'clf__n_estim..."
RF,0.503289,0.950311,0.737937,0.949142,0.330275,0.6,0.636909,0.754545,0.268791,1.136282,"{'scaler__unit_variance': False, 'clf__n_estim..."
IF,0.106643,0.378882,0.504562,0.607863,0.170213,0.4,0.538715,0.62623,0.103646,0.183662,"{'scaler__unit_variance': True, 'clf__n_estima..."


In [20]:
oos_metadata_metrics.to_csv('results/oos_metadata_metrics.csv', index=True)

# Lyrics Only

In [21]:
nlp = spacy.load('pt_core_news_md')

In [22]:
nlp.Defaults.stop_words |= {'pra', 'pro'}

In [23]:
pt_stopwords = set()

for word in nlp.Defaults.stop_words:
    changed_word = word.strip().lower()
    pt_stopwords.add(changed_word)

    decoded_word = unicodedata.normalize('NFKD', changed_word)
    decoded_word = decoded_word.encode('ascii', 'ignore')
    decoded_word = decoded_word.decode("utf-8")
    
    pt_stopwords.add(decoded_word)
    
pt_stopwords = list(pt_stopwords)

In [24]:
with open('data/palavroes.txt', 'r') as f:
    offensive_content = f.readlines()

In [25]:
offensive_words = {}
extra_words = set()

for i, word in enumerate(offensive_content):
    changed_word = word.strip().lower()
    offensive_words[changed_word] = i

    decoded_word = unicodedata.normalize('NFKD', changed_word)
    decoded_word = decoded_word.encode('ascii', 'ignore')
    decoded_word = decoded_word.decode("utf-8")

    if changed_word != decoded_word:
        extra_words.add(decoded_word)
    
for i, extra_word in enumerate(extra_words, len(offensive_words)):
    offensive_words[extra_word] = i

In [26]:
oos_dev_lyrics = oos_dev_set[['lyrics']]
oos_dev_target = oos_dev_set[target]

In [27]:
clfs = {
    "DUMMY": {
        'model': DC(),
        'params': {
            'clf__strategy': ['most_frequent', 'uniform', 'stratified']
        }
    },
    "GB": {
        'model': GB(),
        'params': {
            'txt__vectorizer__lowercase': [True],
            'txt__vectorizer__strip_accents': ['ascii', None],
            'txt__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
            'txt__vectorizer__max_df': np.geomspace(0.8, 1.0, 10),
            'txt__vectorizer__min_df': range(1, 5, 1),
            'txt__vectorizer__max_features': range(100, 1001, 100),
            'txt__vectorizer__binary': [True, False],
            'txt__vectorizer__norm': ['l1', 'l2'],
            'txt__vectorizer__use_idf': [True, False],
            'txt__vectorizer__smooth_idf': [True, False],
            'txt__vectorizer__sublinear_tf': [True, False],
            'clf__loss': ['deviance', 'exponential'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__learning_rate': np.geomspace(0.01, 0.2, 10),
            'clf__max_depth': [3, 4, 5],
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12]
        }
    },
    "RF": {
        'model': RF(),
        'params': {
            'txt__vectorizer__lowercase': [True],
            'txt__vectorizer__strip_accents': ['ascii', None],
            'txt__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
            'txt__vectorizer__max_df': np.geomspace(0.8, 1.0, 10),
            'txt__vectorizer__min_df': range(1, 5, 1),
            'txt__vectorizer__max_features': range(100, 1001, 100),
            'txt__vectorizer__binary': [True, False],
            'txt__vectorizer__norm': ['l1', 'l2'],
            'txt__vectorizer__use_idf': [True, False],
            'txt__vectorizer__smooth_idf': [True, False],
            'txt__vectorizer__sublinear_tf': [True, False],
            'clf__criterion': ['gini', 'entropy'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__max_depth': range(3, 9, 1),
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12],
            'clf__class_weight': ['balanced', 'balanced_subsample', None]
        }
    },
    "IF":{
        'model': IsolationForestClassifier(),
        'params': {
            'txt__vectorizer__lowercase': [True],
            'txt__vectorizer__strip_accents': ['ascii', None],
            'txt__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
            'txt__vectorizer__max_df': np.geomspace(0.8, 1.0, 10),
            'txt__vectorizer__min_df': range(1, 5, 1),
            'txt__vectorizer__max_features': range(100, 1001, 100),
            'txt__vectorizer__binary': [True, False],
            'txt__vectorizer__norm': ['l1', 'l2'],
            'txt__vectorizer__use_idf': [True, False],
            'txt__vectorizer__smooth_idf': [True, False],
            'txt__vectorizer__sublinear_tf': [True, False],
            'clf__n_estimators': range(25, 201, 25),
            'clf__contamination': ['auto', *list(np.geomspace(0.01, 0.15, 10))]
        }
    },
    "DIC": {
        'model': DictionaryClassifier(),
        'params': {
            'txt__vectorizer__lowercase': [True],
            'txt__vectorizer__strip_accents': ['ascii'],
            'txt__vectorizer__ngram_range': [(1,1)],
            'txt__vectorizer__binary': [True],
            'txt__vectorizer__use_idf': [False],
            'txt__vectorizer__smooth_idf': [False],
            'txt__vectorizer__vocabulary': [offensive_words]
        }
    }
}

In [28]:
rcvs = {}

for key in list(clfs.keys()):
    print(key)
    
    np.random.seed(32)
    random.seed(24)
    
    chosen_clf = clfs[key]['model']
    possible_params = clfs[key]['params']
    
    pipe_model = Pipeline(steps=[("txt", ColumnTransformer([("vectorizer", 
                                                             TfidfVectorizer(stop_words=pt_stopwords), 
                                                             0)])),
                                ("clf", chosen_clf)]
                )
        
    rcv = RandomizedSearchCV(pipe_model, possible_params,
                           scoring=METRICS,
                           refit=FOCUS_METRIC,
                           n_iter=COMBINATION_NB, cv=CV_NB, n_jobs=-1,
                           return_train_score=True,
                           verbose=1)

    X = oos_dev_lyrics.values
    y = oos_dev_target.values
    
    rcv.fit(X=X, y=y)
    
    rcvs[key] = rcv
    
    print("------------------------")

DUMMY
Fitting 5 folds for each of 3 candidates, totalling 15 fits




------------------------
GB
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
RF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
IF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
DIC
Fitting 5 folds for each of 1 candidates, totalling 5 fits




------------------------


In [29]:
oos_test_lyrics = oos_test_set[['lyrics']]
oos_test_target = oos_test_set[target]

oos_lyrics_metrics = pd.DataFrame(columns=["train_f1", "train_recall", "train_f1_macro", "train_recall_macro",
                                             "test_f1", "test_recall", "test_f1_macro", "test_recall_macro",
                                             "best_cv_score", "refit_time", "best_params"])

for key, best_rcv in tqdm(rcvs.items()):
        
    np.random.seed(32)
    random.seed(24)
    
    y_dev_pred = best_rcv.predict(X=oos_dev_lyrics.values)
    train_f1_bin = f1_score(oos_dev_target.values, y_dev_pred)
    train_recall_bin = recall_score(oos_dev_target.values, y_dev_pred)
    train_f1_macro = f1_score(oos_dev_target.values, y_dev_pred, average='macro')
    train_recall_macro = recall_score(oos_dev_target.values, y_dev_pred, average='macro')

    y_test_pred = best_rcv.predict(X=oos_test_lyrics.values)
    test_f1_bin = f1_score(oos_test_target.values, y_test_pred)
    test_recall_bin = recall_score(oos_test_target.values, y_test_pred)
    test_f1_macro = f1_score(oos_test_target.values, y_test_pred, average='macro')
    test_recall_macro = recall_score(oos_test_target.values, y_test_pred, average='macro')
    
    oos_lyrics_metrics.loc[key] = [train_f1_bin, train_recall_bin, train_f1_macro, train_recall_macro,
                                     test_f1_bin, test_recall_bin, test_f1_macro, test_recall_macro,
                                     best_rcv.best_score_, best_rcv.refit_time_, str(best_rcv.best_params_)]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.11s/it]


In [31]:
oos_lyrics_metrics

Unnamed: 0,train_f1,train_recall,train_f1_macro,train_recall_macro,test_f1,test_recall,test_f1_macro,test_recall_macro,best_cv_score,refit_time,best_params
DUMMY,0.055728,0.055901,0.514369,0.514413,0.0,0.0,0.484938,0.491803,0.056304,0.619733,{'clf__strategy': 'stratified'}
GB,0.754717,0.621118,0.874497,0.810205,0.272727,0.2,0.624582,0.594039,0.18291,1.822797,"{'txt__vectorizer__use_idf': False, 'txt__vect..."
RF,0.473573,0.695652,0.725622,0.83013,0.263736,0.4,0.606315,0.663487,0.349886,2.619981,"{'txt__vectorizer__use_idf': True, 'txt__vecto..."
IF,0.131148,0.124224,0.553868,0.55114,0.237288,0.233333,0.601891,0.600273,0.189856,1.351412,"{'txt__vectorizer__use_idf': False, 'txt__vect..."
DIC,0.259056,0.732919,0.598024,0.81054,0.290909,0.8,0.598163,0.817288,0.270514,0.655242,"{'txt__vectorizer__vocabulary': {'anus': 0, '-..."


In [32]:
oos_lyrics_metrics.to_csv('results/oos_lyrics_metrics.csv', index=True)

# Lyrics and Metadata

In [38]:
oos_dev_meta_lyrics = oos_dev_set[['lyrics', *metadata_cols]]
oos_dev_target = oos_dev_set[target]

In [39]:
clfs = {
    "DUMMY": {
        'model': DC(),
        'params': {
            'clf__strategy': ['most_frequent', 'uniform', 'stratified']
        }
    },
    "GB": {
        'model': GB(),
        'params': {
            'col__scaler__unit_variance': [True, False],
            'col__vectorizer__lowercase': [True],
            'col__vectorizer__strip_accents': ['ascii'],
            'col__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
            'col__vectorizer__max_df': np.geomspace(0.8, 1.0, 10),
            'col__vectorizer__min_df': range(1, 5, 1),
            'col__vectorizer__max_features': range(100, 1001, 100),
            'col__vectorizer__binary': [True, False],
            'col__vectorizer__norm': ['l1', 'l2'],
            'col__vectorizer__use_idf': [True, False],
            'col__vectorizer__smooth_idf': [True, False],
            'col__vectorizer__sublinear_tf': [True, False],
            'clf__loss': ['deviance', 'exponential'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__learning_rate': np.geomspace(0.01, 0.2, 10),
            'clf__max_depth': [3, 4, 5],
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12]
        }
    },
    "RF": {
        'model': RF(),
        'params': {
            'col__scaler__unit_variance': [True, False],
            'col__vectorizer__lowercase': [True],
            'col__vectorizer__strip_accents': ['ascii'],
            'col__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
            'col__vectorizer__max_df': np.geomspace(0.8, 1.0, 10),
            'col__vectorizer__min_df': range(1, 5, 1),
            'col__vectorizer__max_features': range(100, 1001, 100),
            'col__vectorizer__binary': [True, False],
            'col__vectorizer__norm': ['l1', 'l2'],
            'col__vectorizer__use_idf': [True, False],
            'col__vectorizer__smooth_idf': [True, False],
            'col__vectorizer__sublinear_tf': [True, False],
            'clf__criterion': ['gini', 'entropy'],
            'clf__n_estimators': range(25, 201, 25),
            'clf__max_depth': range(3, 9, 1),
            'clf__max_features': ['sqrt', 'log2', None],
            'clf__min_impurity_decrease': [0.00, 0.03, 0.06, 0.09, 0.12],
            'clf__class_weight': ['balanced', 'balanced_subsample', None]
        }
    },
    "IF":{
        'model': IsolationForestClassifier(),
        'params': {
            'col__scaler__unit_variance': [True, False],
            'col__vectorizer__lowercase': [True],
            'col__vectorizer__strip_accents': ['ascii'],
            'col__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
            'col__vectorizer__max_df': np.geomspace(0.8, 1.0, 10),
            'col__vectorizer__min_df': range(1, 5, 1),
            'col__vectorizer__max_features': range(100, 1001, 100),
            'col__vectorizer__binary': [True, False],
            'col__vectorizer__norm': ['l1', 'l2'],
            'col__vectorizer__use_idf': [True, False],
            'col__vectorizer__smooth_idf': [True, False],
            'col__vectorizer__sublinear_tf': [True, False],
            'clf__n_estimators': range(25, 201, 25),
            'clf__contamination': ['auto', *list(np.geomspace(0.01, 0.15, 10))]
        }
    },
    "DIC": {
        'model': DictionaryClassifier(),
        'params': {
            'col__vectorizer__lowercase': [True],
            'col__vectorizer__strip_accents': ['ascii'],
            'col__vectorizer__ngram_range': [(1,1)],
            'col__vectorizer__binary': [True],
            'col__vectorizer__use_idf': [False],
            'col__vectorizer__smooth_idf': [False],
            'col__vectorizer__vocabulary': [offensive_words]
        }
    }
}

In [40]:
rcvs = {}

for key in clfs.keys():
    print(key)
    
    np.random.seed(32)
    random.seed(24)
    
    chosen_clf = clfs[key]['model']
    possible_params = clfs[key]['params']
    
    pipe_model = Pipeline(steps=[("col", ColumnTransformer([("scaler", 
                                                             RobustScaler(), 
                                                             list(range(1, 16))),
                                                            
                                                            ("vectorizer", 
                                                             TfidfVectorizer(stop_words=pt_stopwords),
                                                             0)])),
                                ("clf", chosen_clf)]
                )
        
    rcv = RandomizedSearchCV(pipe_model, possible_params,
                           scoring=METRICS,
                           refit=FOCUS_METRIC,
                           n_iter=COMBINATION_NB, cv=CV_NB, n_jobs=-1,
                           return_train_score=True,
                           verbose=1)

    X = oos_dev_meta_lyrics.values
    y = oos_dev_target.values
    
    rcv.fit(X=X, y=y)
    
    rcvs[key] = rcv
    
    print("------------------------")

DUMMY
Fitting 5 folds for each of 3 candidates, totalling 15 fits




------------------------
GB
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
RF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
IF
Fitting 5 folds for each of 100 candidates, totalling 500 fits
------------------------
DIC
Fitting 5 folds for each of 1 candidates, totalling 5 fits




------------------------


In [41]:
oos_test_meta_lyrics = oos_test_set[['lyrics', *metadata_cols]]
oos_test_target = oos_test_set[target]

oos_meta_lyrics_metrics = pd.DataFrame(columns=["train_f1", "train_recall", "train_f1_macro", "train_recall_macro",
                                             "test_f1", "test_recall", "test_f1_macro", "test_recall_macro",
                                             "best_cv_score", "refit_time", "best_params"])

for key, best_rcv in tqdm(rcvs.items()):
        
    np.random.seed(32)
    random.seed(24)
    
    y_dev_pred = best_rcv.predict(X=oos_dev_meta_lyrics.values)
    train_f1_bin = f1_score(oos_dev_target.values, y_dev_pred)
    train_recall_bin = recall_score(oos_dev_target.values, y_dev_pred)
    train_f1_macro = f1_score(oos_dev_target.values, y_dev_pred, average='macro')
    train_recall_macro = recall_score(oos_dev_target.values, y_dev_pred, average='macro')

    y_test_pred = best_rcv.predict(X=oos_test_meta_lyrics.values)
    test_f1_bin = f1_score(oos_test_target.values, y_test_pred)
    test_recall_bin = recall_score(oos_test_target.values, y_test_pred)
    test_f1_macro = f1_score(oos_test_target.values, y_test_pred, average='macro')
    test_recall_macro = recall_score(oos_test_target.values, y_test_pred, average='macro')
    
    oos_meta_lyrics_metrics.loc[key] = [train_f1_bin, train_recall_bin, train_f1_macro, train_recall_macro,
                                     test_f1_bin, test_recall_bin, test_f1_macro, test_recall_macro,
                                     best_rcv.best_score_, best_rcv.refit_time_, str(best_rcv.best_params_)]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.29s/it]


In [42]:
oos_meta_lyrics_metrics

Unnamed: 0,train_f1,train_recall,train_f1_macro,train_recall_macro,test_f1,test_recall,test_f1_macro,test_recall_macro,best_cv_score,refit_time,best_params
DUMMY,0.057629,0.546584,0.360903,0.525106,0.08596,0.5,0.391508,0.523472,0.055143,0.648493,{'clf__strategy': 'uniform'}
GB,0.845878,0.732919,0.921044,0.86646,0.176471,0.1,0.578001,0.549255,0.198182,3.604882,"{'col__vectorizer__use_idf': False, 'col__vect..."
RF,0.607059,0.801242,0.796073,0.888676,0.390244,0.533333,0.676183,0.739841,0.370286,2.060044,"{'col__vectorizer__use_idf': False, 'col__vect..."
IF,0.163934,0.15528,0.570703,0.567111,0.2,0.166667,0.585207,0.572156,0.202495,1.832697,"{'col__vectorizer__use_idf': True, 'col__vecto..."
DIC,0.088181,0.89441,0.365938,0.685216,0.112224,0.933333,0.310819,0.638053,0.12939,0.719949,"{'col__vectorizer__vocabulary': {'anus': 0, '-..."


In [43]:
oos_meta_lyrics_metrics.to_csv('results/oos_meta_lyrics_metrics.csv', index=True)