In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../notebooks/data/imbd_dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
import re
import unicodedata

def limpiar_texto(texto):
    # Pasar a minúsculas
    texto = texto.lower()

    # Eliminar acentos
    texto = unicodedata.normalize('NFKD', texto).encode('ASCII', 'ignore').decode('utf-8')

    # Eliminar caracteres especiales y puntuación
    texto = re.sub(r'[^a-z0-9\s]', '', texto)

    # Eliminar espacios múltiples
    texto = re.sub(r'\s+', ' ', texto).strip()

    return texto


In [48]:
data['cleaned_review'] = data['review'].apply(limpiar_texto)
data.head()
data.drop(columns=['review'], inplace=True)

In [49]:
data.to_csv('../notebooks/data/imbd_dataset_cleaned.csv', index=False)

In [13]:
X = data.drop(columns=['sentiment', 'review'])
y = data['sentiment']

In [14]:
X

Unnamed: 0,cleaned_review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production br br the filmin...
2,i thought this was a wonderful way to spend ti...
3,basically theres a family where a little boy j...
4,petter matteis love in the time of money is a ...
...,...
49995,i thought this movie did a down right good job...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,i am a catholic taught in parochial elementary...
49998,im going to have to disagree with the previous...


In [15]:
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [37]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape  

((40000, 1), (10000, 1), (40000,), (10000,))

In [38]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

In [43]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


models = {
    "LogReg": LogisticRegression(max_iter=1000),
    "NB": MultinomialNB(),
    "RandomForest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, clf in models.items():
    pipe = make_pipeline(TfidfVectorizer(), clf)
    pipe.fit(X_train.cleaned_review, y_train_enc)
    y_pred = pipe.predict(X_test.cleaned_review)
    print(f"\n{name} Results")
    print(classification_report(y_test_enc, y_pred))



LogReg Results
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000


NB Results
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      4961
           1       0.89      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.87      0.87      0.86     10000
weighted avg       0.87      0.86      0.86     10000


RandomForest Results
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      4961
           1       0.85      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Results
              precision    recall  f1-score   support

           0       0.88      0.85      0.86      4961
           1       0.86      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [50]:
X_train.cleaned_review

39087    thats what i kept asking myself during the man...
30893    i did not watch the entire movie i could not w...
45278    a touching love story reminiscent of in the mo...
16398    this latterday fulci schlocker is a totally ab...
13653    first of all i firmly believe that norwegian m...
                               ...                        
11284    shadow magic recaptures the joy and amazement ...
44732    i found this movie to be quite enjoyable and f...
38158    avoid this one it is a terrible movie so what ...
860      this production was quite a surprise for me i ...
15795    this is a decent movie although little bit sho...
Name: cleaned_review, Length: 40000, dtype: object

In [51]:
X_train['cleaned_review']

39087    thats what i kept asking myself during the man...
30893    i did not watch the entire movie i could not w...
45278    a touching love story reminiscent of in the mo...
16398    this latterday fulci schlocker is a totally ab...
13653    first of all i firmly believe that norwegian m...
                               ...                        
11284    shadow magic recaptures the joy and amazement ...
44732    i found this movie to be quite enjoyable and f...
38158    avoid this one it is a terrible movie so what ...
860      this production was quite a surprise for me i ...
15795    this is a decent movie although little bit sho...
Name: cleaned_review, Length: 40000, dtype: object

In [44]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

def objective(trial):
    # Hiperparámetros a optimizar
    c = trial.suggest_loguniform('C', 1e-4, 10.0)
    max_df = trial.suggest_float('max_df', 0.7, 1.0)
    min_df = trial.suggest_int('min_df', 1, 10)
    ngram_range = trial.suggest_categorical('ngram_range', [(1,1), (1,2)])

    # Pipeline con hiperparámetros de Tfidf y LogisticRegression
    pipeline = make_pipeline(
        TfidfVectorizer(
            max_df=max_df,
            min_df=min_df,
            ngram_range=ngram_range
        ),
        LogisticRegression(C=c, max_iter=1000)
    )

    # Validación cruzada (puedes usar más folds si quieres)
    score = cross_val_score(
        pipeline,
        X_train.cleaned_review,
        y_train_enc,
        cv=3,
        scoring='f1',  # usa 'accuracy', 'f1', 'recall', etc. según objetivo
        n_jobs=-1
    ).mean()

    return score


  from .autonotebook import tqdm as notebook_tqdm


In [45]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2025-05-14 15:03:43,852] A new study created in memory with name: no-name-e91c9780-0b4d-4123-ad03-cbd5a115a18a
  c = trial.suggest_loguniform('C', 1e-4, 10.0)
[I 2025-05-14 15:04:04,854] Trial 0 finished with value: 0.8800379122079649 and parameters: {'C': 0.2271340425350872, 'max_df': 0.7217463730690882, 'min_df': 10, 'ngram_range': (1, 2)}. Best is trial 0 with value: 0.8800379122079649.
  c = trial.suggest_loguniform('C', 1e-4, 10.0)
[I 2025-05-14 15:04:11,882] Trial 1 finished with value: 0.894164508467271 and parameters: {'C': 8.650955206441287, 'max_df': 0.9372296122625206, 'min_df': 9, 'ngram_range': (1, 1)}. Best is trial 1 with value: 0.894164508467271.
  c = trial.suggest_loguniform('C', 1e-4, 10.0)
[I 2025-05-14 15:04:29,584] Trial 2 finished with value: 0.7910421007572385 and parameters: {'C': 0.001461109066970884, 'max_df': 0.8112445810016415, 'min_df': 3, 'ngram_range': (1, 2)}. Best is trial 1 with value: 0.894164508467271.
  c = trial.suggest_loguniform('C', 1e-4, 10

In [46]:
best_params = study.best_params
print("Mejores hiperparámetros:", best_params)

# Construir el mejor pipeline
best_pipeline = make_pipeline(
    TfidfVectorizer(
        max_df=best_params['max_df'],
        min_df=best_params['min_df'],
        ngram_range=best_params['ngram_range']
    ),
    LogisticRegression(
        C=best_params['C'],
        max_iter=1000
    )
)

# Entrenar y evaluar
best_pipeline.fit(X_train.cleaned_review, y_train_enc)
y_pred = best_pipeline.predict(X_test.cleaned_review)
print("\nResultados finales con mejores hiperparámetros:")
print(classification_report(y_test_enc, y_pred))


Mejores hiperparámetros: {'C': 9.825403241673984, 'max_df': 0.9203706638974369, 'min_df': 6, 'ngram_range': (1, 2)}

Resultados finales con mejores hiperparámetros:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      4961
           1       0.91      0.93      0.92      5039

    accuracy                           0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000

