<a href="https://colab.research.google.com/github/jarce2388/-/blob/main/Jorge_Arce_rnn_entregable.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import nltk
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 200
import random
import os

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Read the reviews CSV into a DataFrame and preview its first rows.
reviews_csv = 'amazon_reviews_es.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,text,stars
0,television Nevir\n\nNada bueno se me fue ka pantalla en menos de 8 meses y no he recibido respuesta del fabricante,1
1,"Dinero tirado a la basura con esta compra\n\nHorrible, nos tuvimos que comprar otro porque ni nosotros que sabemos inglés, ni un informático, después de una hora fue capaz de instalarlo",1
2,"solo llega una unidad cuando te obligan a comprar dos\n\nTe obligan a comprar dos unidades y te llega solo una y no hay forma de reclamar, una autentica estafa, no compreis!!",1
3,"PRODUCTO NO RECIBIDO.\n\nNo entro en descalificar al vendedor, solo puedo decir que tras dos meses de espera.... sigo sin el producto y tuve que contactar con Amazon para reclamar su reembolso. Am...",1
4,Devuelto\n\nLlega tarde y co la talla equivocada,1


In [None]:
# Download the 'punkt' NLTK resource, then bring word_tokenize into scope.
import nltk
nltk.download('punkt')
from nltk import word_tokenize

In [None]:
# Download additional NLTK resources.
# NOTE(review): no visible cell uses the POS tagger or the tagsets resource —
# confirm whether the last two downloads are still needed.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('tagsets')

In [None]:
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources needed for tokenizing, stopword removal and
# lemmatizing.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers used by the text preprocessing function.
# NOTE(review): WordNetLemmatizer is backed by the English WordNet — confirm
# it is the intended lemmatizer for Spanish reviews.
spanish_stopwords = stopwords.words("spanish")
lemmatizer = WordNetLemmatizer()

In [None]:
# Show how many entries the Spanish stopword list contains.
len(spanish_stopwords)

In [None]:
def preprocess_text(text):
    """Turn a raw review into a list of normalized tokens.

    Pipeline: lowercase -> tokenize (NLTK, Spanish models) -> drop Spanish
    stopwords -> drop tokens shorter than 3 characters -> replace every
    remaining non-alphabetic token with the placeholder 'NO-WORD' ->
    lemmatize the alphabetic tokens.

    Args:
        text: Raw review text. Any non-string value (e.g. NaN for a missing
            review) yields an empty token list instead of raising.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    # Missing values in a pandas column arrive as float NaN, which has no
    # .lower(); treat anything that is not text as "no tokens".
    if not isinstance(text, str):
        return []

    # Set membership is O(1); the stopword list would be scanned per token.
    stopword_set = set(spanish_stopwords)

    # The reviews are Spanish, so use the Spanish sentence model rather than
    # word_tokenize's English default.
    tokens = word_tokenize(text.lower(), language='spanish')

    cleaned = []
    for w in tokens:
        if w in stopword_set or len(w) < 3:
            continue
        if w.isalpha():
            # NOTE(review): `lemmatizer` is a WordNetLemmatizer (English
            # WordNet); confirm it is appropriate for Spanish words.
            cleaned.append(lemmatizer.lemmatize(w))
        else:
            # Placeholder for numbers / mixed tokens; no need to lemmatize it.
            cleaned.append('NO-WORD')
    return cleaned

In [None]:
# Run the preprocessing function over every review and store the resulting
# token lists in a new 'text_cleaned' column.
cleaned_tokens = df['text'].apply(preprocess_text)
df['text_cleaned'] = cleaned_tokens

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed keeps the split
# reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Later cells add a 'stars_PRED' column to both frames. Take explicit copies
# so those assignments write to independent DataFrames instead of slices of
# `df` (avoids pandas' SettingWithCopyWarning).
df_train, df_test = df_train.copy(), df_test.copy()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that ignores the NLTK Spanish stopwords.
# Optional settings kept for reference (currently disabled):
#   max_features=10000   -> limit to the 10k most relevant words
#   min_df=5             -> only words that appear in at least 5 reviews
#   ngram_range=(1, 2)   -> use unigrams and bigrams
tfidf_options = {'stop_words': spanish_stopwords}
tfidf = TfidfVectorizer(**tfidf_options)

In [None]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to encode the test texts.
train_texts = df_train['text']
test_texts = df_test['text']
train_tfidf = tfidf.fit_transform(train_texts)
test_tfidf = tfidf.transform(test_texts)

In [None]:
# Feature names learned by the fitted vectorizer, as a NumPy array so they
# can be indexed with integer index arrays later on.
feature_names = tfidf.get_feature_names_out()
vocabulario = np.array(feature_names)
vocabulario

### 1. MODELOS CLÁSICOS

SVM (Lineal)

In [None]:
from sklearn.svm import SVC, LinearSVC

# Linear SVM trained on the TF-IDF features to predict the star rating.
# A kernel SVC was considered as an alternative:
# svc = SVC()
# LinearSVC's solver may shuffle the data internally; pin the seed (same value
# as the train/test split) so repeated runs give the same model.
model_svc = LinearSVC(random_state=42)
model_svc.fit(train_tfidf, y=df_train['stars'])

In [None]:
# Store the linear SVM predictions for both splits in 'stars_PRED'.
for split_df, split_features in ((df_train, train_tfidf), (df_test, test_tfidf)):
    split_df['stars_PRED'] = model_svc.predict(split_features)

# Accuracy on train and test, in that order.
accuracy_train, accuracy_test = (
    accuracy_score(split_df['stars'], split_df['stars_PRED'])
    for split_df in (df_train, df_test)
)

print(f"<<< TRAIN: {accuracy_train}, TEST: {accuracy_test} >>>")
# Per-class report: train first, then test.
for split_df in (df_train, df_test):
    print(classification_report(split_df['stars'], split_df['stars_PRED']))

R: El modelo SVM no es óptimo para este tipo de problemática; además, demora más de 1 hora en entrenar.

REGRESIÓN LOGÍSTICA

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the 'saga' solver, using all available cores.
# 'saga' shuffles the data while optimizing, so pin the seed (same value as
# the train/test split) to make the fitted coefficients reproducible.
model_lr = LogisticRegression(solver='saga', n_jobs=-1, random_state=42)

In [None]:
# Train the logistic regression on the TF-IDF features of the training split.
train_labels = df_train['stars']
model_lr.fit(train_tfidf, y=train_labels)

In [None]:
# Store the logistic regression predictions for both splits in 'stars_PRED'
# (test first, then train).
for split_df, split_features in ((df_test, test_tfidf), (df_train, train_tfidf)):
    split_df['stars_PRED'] = model_lr.predict(split_features)

In [None]:
# Take the first row of the coefficient matrix, one weight per vocabulary term.
# NOTE(review): row 0 presumably corresponds to the first entry of
# model_lr.classes_ — confirm which star rating that is.
first_class_row = 0
coeficientes = model_lr.coef_[first_class_row]
coeficientes

In [None]:
# argsort is ascending, so the largest coefficients sit at the END of the
# ordering. Take the last 15 positions and reverse them so the strongest term
# comes first. (Slicing with [15:] would instead keep almost the whole
# vocabulary, dropping only the 15 smallest coefficients.)
top_indices = np.argsort(coeficientes)[-15:][::-1]  # top 15 terms by coefficient
print("Palabras más influyentes en la clasificación:")
print(vocabulario[top_indices], coeficientes[top_indices])

In [None]:
# Inspect five randomly chosen rows of the test split.
df_test.sample(5)

In [None]:
# Relative frequency of each 'stars' value in the training split.
df_train['stars'].value_counts(normalize=True)

In [None]:
# Accuracy of the stored predictions on the training split.
accuracy_train = accuracy_score(
    y_true=df_train['stars'],
    y_pred=df_train['stars_PRED'],
)
accuracy_train

In [None]:
# Per-class metrics for the training split.
train_report = classification_report(df_train['stars'], df_train['stars_PRED'])
print(train_report)

In [None]:
# Accuracy of the stored predictions on the test split. Kept under its own
# name so the training accuracy held in `accuracy_train` is not overwritten
# with a test-set value.
accuracy_test = accuracy_score(df_test['stars'], df_test['stars_PRED'])
accuracy_test

In [None]:
# Per-class metrics for the test split.
test_report = classification_report(df_test['stars'], df_test['stars_PRED'])
print(test_report)

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the TF-IDF features; fit() returns the
# estimator itself, so construction and training can be chained.
model_nb = MultinomialNB().fit(train_tfidf, y=df_train['stars'])
model_nb

In [None]:
# Store the Naive Bayes predictions for both splits in 'stars_PRED'.
train_pred = model_nb.predict(train_tfidf)
test_pred = model_nb.predict(test_tfidf)
df_train['stars_PRED'] = train_pred
df_test['stars_PRED'] = test_pred

# Ground truth and stored predictions for each split.
y_train, y_train_pred = df_train['stars'], df_train['stars_PRED']
y_test, y_test_pred = df_test['stars'], df_test['stars_PRED']

accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print(f"<<< TRAIN: {accuracy_train}, TEST: {accuracy_test} >>>")
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
from xgboost import XGBClassifier

# Multi-class gradient-boosted trees over the TF-IDF features.
xgb_params = dict(objective="multi:softmax", num_class=5, n_jobs=-1)
model_xgb = XGBClassifier(**xgb_params)

# Train on the star ratings shifted down by one.
# NOTE(review): presumably 'stars' is 1-5 and the shift gives the 0-4 class
# ids implied by num_class=5 — confirm against the data.
zero_based_stars = df_train['stars'] - 1
model_xgb.fit(train_tfidf, y=zero_based_stars)

In [None]:
# model_xgb was fitted on `stars - 1`, so its raw predictions are shifted down
# by one. Add the offset back so 'stars_PRED' is on the same scale as 'stars'
# (and as the values every other model cell writes to this column), then
# evaluate directly against 'stars'.
df_train['stars_PRED'] = model_xgb.predict(train_tfidf) + 1
df_test['stars_PRED'] = model_xgb.predict(test_tfidf) + 1

accuracy_train = accuracy_score(df_train['stars'], df_train['stars_PRED'])
accuracy_test = accuracy_score(df_test['stars'], df_test['stars_PRED'])

print(f"<<< TRAIN: {accuracy_train}, TEST: {accuracy_test} >>>")
print(classification_report(df_train['stars'], df_train['stars_PRED']))
print(classification_report(df_test['stars'], df_test['stars_PRED']))

RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(
    n_estimators=100,  # number of trees
    max_depth=15,      # maximum depth, to limit overfitting
    n_jobs=-1,         # use all available cores
    random_state=42
)

# Train only; predictions for both splits are computed in the evaluation cell
# that follows, so they are not duplicated here.
model_rf.fit(train_tfidf, df_train["stars"])


In [None]:
# Predict with the random forest on each split, store the result in
# 'stars_PRED', and record the accuracy per split.
splits = {'train': (df_train, train_tfidf), 'test': (df_test, test_tfidf)}
split_accuracy = {}
for split_name, (split_df, split_features) in splits.items():
    split_df['stars_PRED'] = model_rf.predict(split_features)
    split_accuracy[split_name] = accuracy_score(split_df['stars'], split_df['stars_PRED'])

accuracy_train = split_accuracy['train']
accuracy_test = split_accuracy['test']

print(f"<<< TRAIN: {accuracy_train}, TEST: {accuracy_test} >>>")
# Per-class report: train first, then test.
for split_df, _ in splits.values():
    print(classification_report(split_df['stars'], split_df['stars_PRED']))

####  LightGBM (LGBMClassifier)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM multi-class classifier settings.
lgbm_params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 5,        # 5 classes
    "n_estimators": 200,
    "max_depth": -1,       # no depth limit
    "n_jobs": -1,
}
model_lgbm = LGBMClassifier(**lgbm_params)

# Train on the TF-IDF features of the training split.
model_lgbm.fit(train_tfidf, df_train["stars"])


In [None]:
# Store the LightGBM predictions for both splits in 'stars_PRED'.
evaluation_splits = [(df_train, train_tfidf), (df_test, test_tfidf)]
for split_df, split_features in evaluation_splits:
    split_df['stars_PRED'] = model_lgbm.predict(split_features)

# Accuracy per split, in the same order as evaluation_splits (train, test).
accuracy_train, accuracy_test = [
    accuracy_score(split_df['stars'], split_df['stars_PRED'])
    for split_df, _ in evaluation_splits
]

print(f"<<< TRAIN: {accuracy_train}, TEST: {accuracy_test} >>>")
for split_df, _ in evaluation_splits:
    print(classification_report(split_df['stars'], split_df['stars_PRED']))

### 2. FINE TUNING CON TRANSFORMERS

In [None]:
from transformers import pipeline

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TFAutoModel, DataCollatorWithPadding
import tensorflow as tf

import json
from datasets import load_dataset
import random
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TFAutoModel
import tensorflow as tf

In [None]:
# cm = confusion_matrix(df_train['stars']-1, df_train['stars_PRED'])
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
# plt.xlabel("Predicción")
# plt.ylabel("Real")
# plt.title("Matriz de Confusión - XGBoost")
# plt.show()