In [2]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [3]:
import pandas as pd
import numpy as np

In [4]:
reviews = [(list(movie_reviews.words(fileid)), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]
df = pd.DataFrame(reviews, columns=['review', 'sentiment'])

In [5]:
df['review'] = df['review'].apply(lambda x: ' '.join(x))

In [6]:
df['sentiment'] = df['sentiment'].map({'pos': 1, 'neg': 0})

In [7]:
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
def preprocess_text(text):
    # Eliminar caracteres especiales
    text = re.sub(r'\W', ' ', text)
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar palabras de parada
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

df['review'] = df['review'].apply(preprocess_text)



In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=21)

In [19]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_clf = RandomForestClassifier(random_state=21)
rf_clf.fit(X_train_tfidf, y_train)
y_pred_rf = rf_clf.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf}')

Random Forest Accuracy: 0.8025


In [21]:
from sklearn.svm import SVC

svm_clf = SVC(random_state=21)
svm_clf.fit(X_train_tfidf, y_train)
y_pred_svm = svm_clf.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f'SVM Accuracy: {accuracy_svm}')

SVM Accuracy: 0.8275


In [22]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Cargar datos
reviews = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        reviews.append((movie_reviews.raw(fileid), category))

df = pd.DataFrame(reviews, columns=['review', 'sentiment'])
df['sentiment'] = df['sentiment'].map({'pos': 1, 'neg': 0})

# Limpieza de datos
def clean_text(text):
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'[\W]+', ' ', text.lower())
    return text

df['review'] = df['review'].apply(clean_text)

# Vectorización
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X = tfidf.fit_transform(df['review'])
y = df['sentiment']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Entrenar el modelo
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Hacer predicciones
y_pred = rf.predict(X_test)

# Calcular la precisión
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

if accuracy > 0.82:
    print("Nota aprobatoria: La precisión es mayor al 82%")
else:
    print("La precisión es menor al 82%")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Accuracy: 0.76
La precisión es menor al 82%
