# Optimizador de Hiperparametros

Optimizador de los modelos de ML para el analisis de sentimientos

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay

import optuna
from gensim.models import CoherenceModel

In [2]:
# Load the dataset and inspect basic information
test_df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', encoding='ISO-8859-1')  
df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='ISO-8859-1')  

In [3]:
# Número total de instancias
num_instancias = len(df)

# Conteo de patrones por clase
conteo_clases = df['sentiment'].value_counts()

# Proporción (frecuencia relativa) por clase
proporcion_clases = df['sentiment'].value_counts(normalize=True)

# Mostrar resultados
print("Número total de instancias:", num_instancias)
print("\nConteo por clase:\n", conteo_clases)
print("\nProporción por clase:\n", proporcion_clases)


Número total de instancias: 27481

Conteo por clase:
 sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

Proporción por clase:
 sentiment
neutral     0.404570
positive    0.312288
negative    0.283141
Name: proportion, dtype: float64


In [4]:
# Número total de instancias
num_instancias = len(test_df)

# Conteo de patrones por clase
conteo_clases = test_df['sentiment'].value_counts()

# Proporción (frecuencia relativa) por clase
proporcion_clases = test_df['sentiment'].value_counts(normalize=True)

# Mostrar resultados
print("Número total de instancias:", num_instancias)
print("\nConteo por clase:\n", conteo_clases)
print("\nProporción por clase:\n", proporcion_clases)

Número total de instancias: 4815

Conteo por clase:
 sentiment
neutral     1430
positive    1103
negative    1001
Name: count, dtype: int64

Proporción por clase:
 sentiment
neutral     0.404641
positive    0.312111
negative    0.283248
Name: proportion, dtype: float64


In [12]:
df['text'] = df['text'].fillna('')
df = df.dropna(subset=['sentiment'])
test_df = test_df.dropna(subset=['sentiment'])
test_df['text'] = test_df['text'].fillna('')


train_label_encoder = LabelEncoder()
df['sentiment_label'] = train_label_encoder.fit_transform(df['sentiment'])
#df = df.sample(frac=0.2, random_state=42)  # 20% del total

test_label_encoder = LabelEncoder()
test_df['sentiment_label'] = test_label_encoder.fit_transform(test_df['sentiment'])

In [15]:
def objective(trial):
    C = trial.suggest_float("C", 0.8, 5.0, log=True)
    kernel = trial.suggest_categorical("kernel", ["rbf"]) #["linear", "rbf", "poly"]
    gamma = trial.suggest_categorical("gamma", ["scale"]) #trial.suggest_float("gamma", 1e-5, 1.0, log=True) if kernel != "linear" else "scale"
    #degree = trial.suggest_int("degree", 2, 5) if kernel == "poly" else 3

    svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("svc", SVC(C=C, kernel=kernel , gamma=gamma))
])
    svm_pipeline.fit(df['text'], df['sentiment_label'])
    score = svm_pipeline.score(test_df['text'], test_df['sentiment_label'])

    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print(study.best_params, study.best_value)


[I 2025-06-15 15:25:34,157] A new study created in memory with name: no-name-e73150e9-74fc-492e-b946-435e7348ae65
[I 2025-06-15 15:33:57,167] Trial 0 finished with value: 0.7136389360498019 and parameters: {'C': 4.4569155779335174, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7136389360498019.
[I 2025-06-15 15:42:05,502] Trial 1 finished with value: 0.7147707979626485 and parameters: {'C': 2.6274165891211854, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 1 with value: 0.7147707979626485.
[I 2025-06-15 15:50:22,944] Trial 2 finished with value: 0.7136389360498019 and parameters: {'C': 3.219840415417644, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 1 with value: 0.7147707979626485.
[I 2025-06-15 15:58:29,199] Trial 3 finished with value: 0.7150537634408602 and parameters: {'C': 2.537412886420518, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 3 with value: 0.7150537634408602.
[I 2025-06-15 16:06:43,110] Trial 4 finished with value: 0.7147707979626485 and 

{'C': 1.6096091371852976, 'kernel': 'rbf', 'gamma': 'scale'} 0.7207130730050934


In [14]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 120)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt"])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
    )

    rf_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("rf", model)
    ])
    rf_pipeline.fit(df['text'], df['sentiment_label'])
    score = rf_pipeline.score(test_df['text'], test_df['sentiment_label'])

    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
print(study.best_params, study.best_value) 


[I 2025-06-17 16:27:01,537] A new study created in memory with name: no-name-3f0a15c3-e978-4e06-8b7c-0aa8577ba1b3
[I 2025-06-17 16:27:09,792] Trial 0 finished with value: 0.6910016977928692 and parameters: {'n_estimators': 115, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6910016977928692.
[I 2025-06-17 16:27:58,688] Trial 1 finished with value: 0.7028862478777589 and parameters: {'n_estimators': 117, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7028862478777589.
[I 2025-06-17 16:28:05,639] Trial 2 finished with value: 0.688737973967176 and parameters: {'n_estimators': 117, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7028862478777589.
[I 2025-06-17 16:29:37,384] Trial 3 finished with value: 0.6901528013582343 and parameters: {'n_estimators': 113, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}.

{'n_estimators': 120, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt'} 0.7034521788341822


In [16]:
def objective(trial):
    C = trial.suggest_float("C", 1e-4, 10.0, log=True)
    penalty = trial.suggest_categorical("penalty", ["l2", "none"])
    solver = trial.suggest_categorical("solver", ["lbfgs", "saga", "newton-cg"])  # compatibles con L2 o none
    max_iter = trial.suggest_int("max_iter", 100, 1000)

    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        max_iter=max_iter,
        random_state=42,
    )

    lr_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("lr", model)
    ])
    lr_pipeline.fit(df['text'], df['sentiment_label'])
    score = lr_pipeline.score(test_df['text'], test_df['sentiment_label'])
    return score
    
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
print(study.best_params, study.best_value)

[I 2025-06-17 17:29:21,505] A new study created in memory with name: no-name-02f30239-3c8a-452f-adef-9a058f5e6c22
[I 2025-06-17 17:29:30,210] Trial 0 finished with value: 0.6148839841539332 and parameters: {'C': 0.0017235671711406438, 'penalty': 'none', 'solver': 'saga', 'max_iter': 348}. Best is trial 0 with value: 0.6148839841539332.
[I 2025-06-17 17:29:30,886] Trial 1 finished with value: 0.4538766270514997 and parameters: {'C': 0.004205101930842154, 'penalty': 'l2', 'solver': 'newton-cg', 'max_iter': 242}. Best is trial 0 with value: 0.6148839841539332.
[I 2025-06-17 17:29:31,904] Trial 2 finished with value: 0.5826259196378042 and parameters: {'C': 0.02161583292570402, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 439}. Best is trial 0 with value: 0.6148839841539332.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the docume

KeyboardInterrupt: 

In [19]:
def objective(trial):
    n_neighbors = trial.suggest_int("n_neighbors", 3, 50)
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    algorithm = trial.suggest_categorical("algorithm", ["auto", "ball_tree", "kd_tree", "brute"])
    p = trial.suggest_int("p", 1, 2)  # 1: manhattan, 2: euclidean
    leaf_size = trial.suggest_int("leaf_size", 10, 100)

    model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm=algorithm,
        p=p,
        leaf_size=leaf_size
    )

    knn_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("knn", model)
    ])
    knn_pipeline.fit(df['text'], df['sentiment_label'])
    score = knn_pipeline.score(test_df['text'], test_df['sentiment_label'])
    return score
    
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
print(study.best_params, study.best_value)

[I 2025-06-17 17:45:13,066] A new study created in memory with name: no-name-123cb054-32da-45a1-9e4a-1f7ba643009b
[I 2025-06-17 17:45:26,708] Trial 0 finished with value: 0.6179966044142614 and parameters: {'n_neighbors': 42, 'weights': 'distance', 'algorithm': 'kd_tree', 'p': 2, 'leaf_size': 40}. Best is trial 0 with value: 0.6179966044142614.
[I 2025-06-17 17:45:40,476] Trial 1 finished with value: 0.5874363327674024 and parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'p': 2, 'leaf_size': 35}. Best is trial 0 with value: 0.6179966044142614.
[I 2025-06-17 17:45:46,259] Trial 2 finished with value: 0.42105263157894735 and parameters: {'n_neighbors': 45, 'weights': 'distance', 'algorithm': 'ball_tree', 'p': 1, 'leaf_size': 58}. Best is trial 0 with value: 0.6179966044142614.
[I 2025-06-17 17:45:59,897] Trial 3 finished with value: 0.619128466327108 and parameters: {'n_neighbors': 48, 'weights': 'distance', 'algorithm': 'auto', 'p': 2, 'leaf_size': 41}. Best i

{'n_neighbors': 49, 'weights': 'distance', 'algorithm': 'auto', 'p': 2, 'leaf_size': 45} 0.6211092246745897


In [1]:
def objective(trial):
    alpha = trial.suggest_float("alpha", 1e-3, 10.0, log=True)
    fit_prior = trial.suggest_categorical("fit_prior", [True, False])

    model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
    
    nb_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("nb", model)
    ])
    nb_pipeline.fit(df['text'], df['sentiment_label'])
    score = nb_pipeline.score(test_df['text'], test_df['sentiment_label'])
    return score
    
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
print(study.best_params, study.best_value)

NameError: name 'fit_prior' is not defined