## SVM

In [1]:
#!/usr/bin/env python3.9
# -*- coding: utf-8 -*-
"""
Author: Letícia Tavares
Date: 2024-08-06
Version: 1.0.0

Description:
    This script trains and evaluates a Linear Support Vector Classification (SVC) model using multi-output classification on genre classification data.
    It utilizes the `analysis_functions` module to load and prepare the data, perform hyperparameter tuning with Grid Search,
    and evaluate the model's performance. The results are saved in a CSV file.

    The script performs the following steps:
    1. Loads genre classification data using a custom function.
    2. Prepares dataframes for all genres and Brazilian genres.
    3. Defines and trains a LinearSVC model using multi-output classification with a pipeline that includes scaling.
    4. Performs Grid Search with cross-validation to tune hyperparameters.
       The parameter grid includes:
       - `C`: [0.1, 0.5, 1.0, 2.0, 5.0]
       - `loss`: ['squared_hinge']
    5. Evaluates the model using F1 scores (micro and macro).
    6. Saves the results to a CSV file, including the combination of parameters that generates the best result.


Usage:
    1. Ensure all dependencies are installed and accessible.
    2. Ensure the `functions` directory is in the correct path and contains `analysis_functions.py`.
    3. Run the script: python 02_SVC_F1_all_genres.py

Notes:
    - Adjust paths and filenames as needed.
    - Results are saved to 'svc_results_F1_all_genres.csv' in the specified output directory.
"""

# Standard library imports
import os  # Operating system interface
import sys  # System-specific parameters and functions

# Third-party library imports
import pandas as pd  # Data manipulation and analysis
from loguru import logger  # Logging

from sklearn.model_selection import train_test_split, GridSearchCV, KFold  # Model selection
from sklearn.preprocessing import StandardScaler  # Data preprocessing
from sklearn.pipeline import Pipeline  # Pipeline for combining multiple steps
from sklearn.svm import LinearSVC  # Support Vector Classification
from sklearn.multioutput import MultiOutputClassifier  # Multi-output classification
from sklearn.preprocessing import LabelEncoder

# Local application/library specific imports
notebook_dir = os.getcwd()

# Adiciona o caminho relativo ao diretório 'functions' na lista de caminhos de importação
sys.path.insert(0, os.path.abspath(os.path.join(notebook_dir, '../functions')))
import analysis_functions
from analysis_functions import folder_output


# Load data
logger.info("Loading data...")
df, all_genres, br_genres = analysis_functions.get_data()

# Prepare dataframes for all genres and BR genres
logger.info("Preparing dataframes for all genres and BR genres...")
df_all_genres = analysis_functions.make_df_genres(df, all_genres)
df_br_genres = analysis_functions.make_df_genres(df, br_genres)

# Load feature group model for artist
logger.info("Loading feature group model for artist...")
feat_group_model = analysis_functions.dict_feature_group()

def train_and_evaluate_linear_svc(X, y, feature_group_name, genres):
    logger.info(f"Training and evaluating LinearSVC for feature group: {feature_group_name}")

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearSVC())
    ])

    # Define the hyperparameter grid
    param_grid = {
        'model__C': [0.1, 0.5, 1.0, 2.0, 5.0],
        'model__loss': ['squared_hinge']
    }

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1,
                               scoring=['f1_micro', 'f1_macro'], refit=False,
                               verbose=3, return_train_score=True, 
                               cv=KFold(n_splits=5, shuffle=True, random_state=42))
                                
    grid_search.fit(X_train, y_train)

    # Get the best model from grid search
    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df['feature_group'] = feature_group_name

    # Join genres into a string
    genres = ",".join(str(element) for element in genres)
    results_df['genre_labels'] = genres

    logger.info("Results compiled into dataframe.")
    return results_df

# Execute model training and evaluation for BR genres and all genres
# logger.info("Executing model training and evaluation for All genres...")
df_results_SVC = analysis_functions.exec_model(train_and_evaluate_linear_svc, df, br_genres, feat_group_model, "SVC", False)

# Combine results and save to CSV
df_results_SVC.to_csv(f'{folder_output}/svc_results_F1_br_genres_more_feats.csv', index=False)
logger.success(f"Results saved to '{folder_output}/svc_results_F1_br_genres_more_feats.csv'")

[32m2024-08-06 23:18:08.349[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m60[0m - [1mLoading data...[0m
[32m2024-08-06 23:18:12.103[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m64[0m - [1mPreparing dataframes for all genres and BR genres...[0m
[32m2024-08-06 23:18:12.113[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m69[0m - [1mLoading feature group model for artist...[0m
[32m2024-08-06 23:18:12.146[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: statistical[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:18:19.146[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:19.152[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: statistical_time[0m
[32m2024-08-06 23:18:19.304[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:19.305[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: explicitness[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:18:19.370[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:19.372[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: pronouns[0m
[32m2024-08-06 23:18:19.547[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:19.548[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: postags[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:18:19.999[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:20.003[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: lemma[0m
[32m2024-08-06 23:18:20.110[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:20.112[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: afinn[0m
[32m2024-08-06 23:18:20.170[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:20.171[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:18:20.230[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:20.232[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: rid[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:18:20.709[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:20.717[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: audio[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:18:21.097[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:21.102[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: combined[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:18:37.069[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:37.083[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: combined + audio[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:18:52.388[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:18:52.422[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m115[0m - [32m[1mResults saved to '../02_Output_Kfold_Models/svc_results_F1_br_genres_more_feats.csv'[0m


In [2]:
df_results_SVC = analysis_functions.exec_model(train_and_evaluate_linear_svc, df, all_genres, feat_group_model, "SVC", False)

# Combine results and save to CSV
df_results_SVC.to_csv(f'{folder_output}/svc_results_F1_all_genres_more_feats.csv', index=False)
logger.success(f"Results saved to '{folder_output}/svc_results_F1_all_genres_more_feats.csv'")

[32m2024-08-06 23:18:52.473[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: statistical[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:19:07.512[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:07.515[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: statistical_time[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:19:07.741[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:07.742[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: explicitness[0m
[32m2024-08-06 23:19:07.843[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:07.844[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: pronouns[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:19:08.419[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:08.421[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: postags[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:19:09.803[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:09.808[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: lemma[0m
[32m2024-08-06 23:19:09.967[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:09.969[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: afinn[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:19:10.071[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:10.072[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: vader[0m
[32m2024-08-06 23:19:10.166[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:10.169[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: rid[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:19:11.524[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:11.530[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: audio[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:19:12.794[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:12.813[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: combined[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:19:49.458[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:19:49.481[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: combined + audio[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:20:23.613[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:20:23.655[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [32m[1mResults saved to '../02_Output_Kfold_Models/svc_results_F1_all_genres_more_feats.csv'[0m


In [3]:
feat_group_model = analysis_functions.dict_feature_group_art()

df_results_SVC = analysis_functions.exec_model(train_and_evaluate_linear_svc, df, all_genres, feat_group_model, "SVC", True)

# Combine results and save to CSV
df_results_SVC.to_csv(f'{folder_output}/svc_results_F1_all_genres.csv', index=False)
logger.success(f"Results saved to '{folder_output}/svc_results_F1_all_genres.csv'")

[32m2024-08-06 23:20:23.698[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: statistical[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:20:38.833[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:20:38.837[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: statistical_time[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:20:39.064[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:20:39.065[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: explicitness[0m
[32m2024-08-06 23:20:39.155[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:20:39.157[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: audio[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-06 23:20:40.449[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-06 23:20:43.335[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: tf-idf[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-07 00:32:00.973[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-07 00:32:01.029[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: lda[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-07 00:32:03.852[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-07 00:32:03.892[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: combined[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-07 00:32:24.532[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-07 00:32:25.237[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: combined (tf-idf + lda)[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-07 01:45:57.884[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-07 01:45:57.924[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: combined + audio[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-07 01:46:31.421[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-07 01:46:31.785[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: combined (tf-idf + lda) + audio[0m


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[32m2024-08-07 02:47:33.758[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m106[0m - [1mResults compiled into dataframe.[0m
[32m2024-08-07 02:47:35.101[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [32m[1mResults saved to '../02_Output_Kfold_Models/svc_results_F1_all_genres.csv'[0m


## Rede Neural

In [1]:
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
"""
Author: Letícia Tavares
Date: 2024-08-06
Version: 1.0.0

Description:
    This script trains and evaluates a Neural Network model for genre classification.
    The model is built using TensorFlow and Keras, with multiple dense layers and dropout for regularization.
    It utilizes the `analysis_functions` module to load and prepare the data, performs K-Fold cross-validation,
    and evaluates the model's performance. The results are saved in a CSV file.

    The script performs the following steps:
    1. Loads genre classification data using a custom function.
    2. Prepares dataframes for all genres and Brazilian genres.
    3. Defines and trains a Neural Network model using K-Fold cross-validation with different configurations.
    4. Evaluates the model's performance using accuracy and F1 scores.
    5. Saves the best results to a CSV file, including the combination of parameters that generates the best result.

    Parameters and Combinations:
    - Dense Sizes: (32, 32), (64, 64)
    - Dropout Rates: 0.1
    - Epochs: 50
    - Batch Sizes: 2

Usage:
    1. Ensure all dependencies are installed and accessible.
    2. Ensure the `functions` directory is in the correct path and contains `analysis_functions.py`.
    3. Run the script: python 02_Neural_Network_F1_br_genres_more_feats.py

Notes:
    - Adjust paths and filenames as needed.
    - Results are saved to 'neural_network_results_F1_br_genres_more_feats.csv' in the specified output directory.
"""

# Standard library imports
import os  # Operating system interface
import sys  # System-specific parameters and functions

# Third-party library imports
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
import tensorflow  as tf # Deep learning library
from loguru import logger  # Logging
from sklearn.preprocessing import StandardScaler  # Data preprocessing
from sklearn.model_selection import KFold  # Model selection
from sklearn.metrics import accuracy_score, f1_score  # Performance metrics


# Adiciona o caminho relativo ao diretório 'functions' na lista de caminhos de importação
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../functions')))
import analysis_functions
from analysis_functions import folder_output

# Load data
logger.info("Loading data...")
df, all_genres, br_genres = analysis_functions.get_data()

# Prepare dataframes for all genres and BR genres
logger.info("Preparing dataframes for all genres and BR genres...")
df_all_genres = analysis_functions.make_df_genres(df, all_genres)
df_br_genres = analysis_functions.make_df_genres(df, br_genres)

# Load feature group model for artist
logger.info("Loading feature group model for artist...")
feat_group_model = analysis_functions.dict_feature_group()

def create_nn_model(dense_sizes=(32, 32), dropout_rate=0.1, input_shape=None, output_shape=None):
    inp = tf.keras.layers.Input(shape=(input_shape,))
    
    # Dense layers
    layer = inp
    for size in dense_sizes:
        layer = tf.keras.layers.Dense(size, activation="selu", kernel_initializer="lecun_normal")(layer)
        layer = tf.keras.layers.Dropout(dropout_rate)(layer)

    # Output layer
    out = tf.keras.layers.Dense(output_shape, activation="softmax")(layer)
    
    # Create the model
    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def train_and_evaluate_nn(X, y, feature_group_name, genres):
    logger.info(f"Training and evaluating Neural Network for feature group: {feature_group_name}")

    results = []
    genres = ",".join(str(element) for element in genres)

    # Parameters
    dense_sizes_list = [(32, 32),(64, 64)]
    dropout_rates = [0.1]
    epochs = 50
    batch_size = 2

    # Perform K-Fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Determine the number of classes
    output_shape = np.max(y) + 1
    
    for dense_sizes in dense_sizes_list:
        for dropout_rate in dropout_rates:
            fold_accuracies = []
            fold_f1_micro_scores = []
            fold_f1_macro_scores = []

            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                model = create_nn_model(dense_sizes=dense_sizes, dropout_rate=dropout_rate, input_shape=X.shape[1], output_shape = output_shape)
                
                # Normalize data
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

                # Train the model
                model.fit(X_train_scaled, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

                # Evaluate the model
                y_pred = model.predict(X_test_scaled)
                y_pred = np.argmax(y_pred, axis=1)

                accuracy = accuracy_score(y_test, y_pred)
                f1_micro = f1_score(y_test, y_pred, average='micro')
                f1_macro = f1_score(y_test, y_pred, average='macro')

                fold_accuracies.append(accuracy)
                fold_f1_micro_scores.append(f1_micro)
                fold_f1_macro_scores.append(f1_macro)

            mean_accuracy = np.mean(fold_accuracies)
            mean_f1_micro = np.mean(fold_f1_micro_scores)
            mean_f1_macro = np.mean(fold_f1_macro_scores)
    
            results.append({
                'feature_group': feature_group_name,
                'params': {'dense_sizes': dense_sizes, 'dropout_rate': dropout_rate, 'epochs': epochs, 'batch_size': batch_size},
                'dense_sizes': dense_sizes,
                'dropout_rate': dropout_rate,
                'mean_accuracy': mean_accuracy,
                'mean_test_f1_micro': mean_f1_micro,
                'mean_test_f1_macro': mean_f1_macro,
                'epochs': epochs,
                'batch_size': batch_size,
                'genre_labels': genres,
            })

    results_df = pd.DataFrame(results)
    results_df['genre_labels'] = genres
    
    logger.success("Results compiled into dataframe.")
    
    return results_df

# Execute model training and evaluation for BR genres
logger.info("Executing model training and evaluation for BR genres...")
df_results_NN = analysis_functions.exec_model(train_and_evaluate_nn, df, br_genres, feat_group_model, "Neural Network", False)

# Combine results and save to CSV
df_results_NN.to_csv(f'{folder_output}/neural_network_results_F1_br_genres_more_feats.csv', index=False)
logger.success("Results saved to f'{folder_output}/neural_network_results_F1_br_genres_more_feats_2.csv'")


[32m2024-08-07 11:47:41.879[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m60[0m - [1mLoading data...[0m
[32m2024-08-07 11:47:46.865[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m64[0m - [1mPreparing dataframes for all genres and BR genres...[0m
[32m2024-08-07 11:47:46.885[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m69[0m - [1mLoading feature group model for artist...[0m
[32m2024-08-07 11:47:46.886[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m164[0m - [1mExecuting model training and evaluation for BR genres...[0m
[32m2024-08-07 11:47:46.916[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: statistical[0m


OIEE
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 998us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 971us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 678us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 909us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 737us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 800us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 920us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 807us/step


[32m2024-08-07 12:18:40.044[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m159[0m - [32m[1mResults compiled into dataframe.[0m
[32m2024-08-07 12:18:40.046[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: statistical_time[0m


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 828us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 815us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 902us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 772us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 723us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 841us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 784us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 873us/step


[32m2024-08-07 12:49:24.320[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m159[0m - [32m[1mResults compiled into dataframe.[0m
[32m2024-08-07 12:49:24.321[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: explicitness[0m


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 735us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 679us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 879us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 848us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 997us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 911us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 899us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 850us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 871us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 686us/step


[32m2024-08-07 13:23:59.429[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m159[0m - [32m[1mResults compiled into dataframe.[0m
[32m2024-08-07 13:23:59.460[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: pronouns[0m


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 827us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 929us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 760us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 861us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 998us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 864us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1000us/step


[32m2024-08-07 14:16:49.766[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m159[0m - [32m[1mResults compiled into dataframe.[0m
[32m2024-08-07 14:16:49.768[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: postags[0m


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 912us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 989us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 965us/step


[32m2024-08-07 15:13:18.005[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m159[0m - [32m[1mResults compiled into dataframe.[0m
[32m2024-08-07 15:13:18.006[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: lemma[0m


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 932us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 846us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 791us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 857us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 717us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 877us/step


[32m2024-08-07 16:06:42.223[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m159[0m - [32m[1mResults compiled into dataframe.[0m
[32m2024-08-07 16:06:42.225[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: afinn[0m


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 899us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 746us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 837us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 968us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 964us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 788us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 756us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 812us/step


[32m2024-08-07 16:56:02.070[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m159[0m - [32m[1mResults compiled into dataframe.[0m
[32m2024-08-07 16:56:02.071[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: vader[0m


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 745us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 701us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 809us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 701us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 720us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 716us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 757us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 872us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 857us/step


[32m2024-08-07 17:38:33.508[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m159[0m - [32m[1mResults compiled into dataframe.[0m
[32m2024-08-07 17:38:33.512[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_nn[0m:[36m90[0m - [1mTraining and evaluating Neural Network for feature group: rid[0m


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 730us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 801us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 706us/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


KeyboardInterrupt: 

In [8]:
feat_group_model = analysis_functions.dict_feature_group()

In [10]:
#!/usr/bin/env python3.9
# -*- coding: utf-8 -*-
"""
Author: Letícia Tavares
Date: 2024-08-06
Version: 1.0.0

Description:
    This script trains and evaluates a Linear Support Vector Classification (SVC) model using multi-output classification on genre classification data.
    It utilizes the `analysis_functions` module to load and prepare the data, perform hyperparameter tuning with Grid Search,
    and evaluate the model's performance. The results are saved in a CSV file.

    The script performs the following steps:
    1. Loads genre classification data using a custom function.
    2. Prepares dataframes for all genres and Brazilian genres.
    3. Defines and trains a LinearSVC model using multi-output classification with a pipeline that includes scaling.
    4. Performs Grid Search with cross-validation to tune hyperparameters.
       The parameter grid includes:
       - `C`: [0.1, 0.5, 1.0, 2.0, 5.0]
       - `loss`: ['squared_hinge']
    5. Evaluates the model using F1 scores (micro and macro).
    6. Saves the results to a CSV file, including the combination of parameters that generates the best result.


Usage:
    1. Ensure all dependencies are installed and accessible.
    2. Ensure the `functions` directory is in the correct path and contains `analysis_functions.py`.
    3. Run the script: python 02_SVC_F1_all_genres_more_feats.py

Notes:
    - Adjust paths and filenames as needed.
    - Results are saved to 'svc_results_F1_all_genres_more_feats.csv' in the specified output directory.
"""

# Standard library imports
import os  # Operating system interface
import sys  # System-specific parameters and functions

# Third-party library imports
import pandas as pd  # Data manipulation and analysis
import random # Random number generation and related operations 
from loguru import logger  # Logging

from sklearn.model_selection import train_test_split, GridSearchCV, KFold  # Model selection
from sklearn.preprocessing import StandardScaler  # Data preprocessing
from sklearn.pipeline import Pipeline  # Pipeline for combining multiple steps
from sklearn.svm import LinearSVC  # Support Vector Classification
from sklearn.multioutput import MultiOutputClassifier  # Multi-output classification

# Local application/library specific imports
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../functions')))
import analysis_functions
from analysis_functions import folder_output

# Set random seeds for reproducibility
random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

# Load data
logger.info("Loading data...")
df, all_genres, br_genres = analysis_functions.get_data()

# Prepare dataframes for all genres and BR genres
logger.info("Preparing dataframes for all genres and BR genres...")
df_all_genres = analysis_functions.make_df_genres(df, all_genres)
df_br_genres = analysis_functions.make_df_genres(df, br_genres)

# Load feature group model for artist
logger.info("Loading feature group model for artist...")
feat_group_model = analysis_functions.dict_feature_group()

def train_and_evaluate_linear_svc(X, y, feature_group_name, genres):
    logger.info(f"Training and evaluating LinearSVC for feature group: {feature_group_name}")

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearSVC(random_state=42)),
    ])

    # Define the hyperparameter grid
    param_grid = {
        'model__C': [0.1],
        'model__loss': ['squared_hinge']
    }

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1,
                                scoring=['f1_micro', 'f1_macro'], refit=False,
                                verbose=3, return_train_score=True, 
                                cv=KFold(n_splits=5, shuffle=True, random_state=42))
                                
    grid_search.fit(X_train, y_train)

    # Get the best model from grid search
    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df['feature_group'] = feature_group_name

    # Join genres into a string
    genres = ",".join(str(element) for element in genres)
    results_df['genre_labels'] = genres

    logger.success("Results compiled into dataframe.")
    return results_df


# Execute model training and evaluation for BR genres and all genres
logger.info("Executing model training and evaluation for All genres...")
df_results_SVC = analysis_functions.exec_model(train_and_evaluate_linear_svc, df, all_genres, feat_group_model, "SVC", False)

[32m2024-08-08 07:21:48.945[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m60[0m - [1mLoading data...[0m
[32m2024-08-08 07:21:52.892[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m64[0m - [1mPreparing dataframes for all genres and BR genres...[0m
[32m2024-08-08 07:21:52.901[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m69[0m - [1mLoading feature group model for artist...[0m
[32m2024-08-08 07:21:52.902[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m111[0m - [1mExecuting model training and evaluation for All genres...[0m
[32m2024-08-08 07:21:52.937[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_evaluate_linear_svc[0m:[36m73[0m - [1mTraining and evaluating LinearSVC for feature group: statistical[0m


Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [5]:
df_results_SVC.mean_test_f1_macro

0     0.470489
1     0.472991
2     0.467018
3     0.447505
4     0.389358
5     0.384934
6     0.316465
7     0.326132
8     0.311511
9     0.315592
10    0.281118
11    0.279625
12    0.285363
13    0.279272
14    0.207004
15    0.207004
16    0.230582
17    0.232620
18    0.211914
19    0.211914
20    0.224507
21    0.223958
22    0.040468
23    0.040468
Name: mean_test_f1_macro, dtype: float64

In [5]:
df_results_SVC.mean_test_f1_macro

0     0.470489
1     0.472991
2     0.467018
3     0.447505
4     0.389358
5     0.384934
6     0.316465
7     0.326132
8     0.311511
9     0.315592
10    0.281118
11    0.279625
12    0.285363
13    0.279272
14    0.207004
15    0.207004
16    0.230582
17    0.232620
18    0.211914
19    0.211914
20    0.224507
21    0.223958
22    0.040468
23    0.040468
Name: mean_test_f1_macro, dtype: float64