### A. Preprocesamiento de Datos

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Descargar recursos necesarios
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Cargar dataset
df = pd.read_csv('movie_data.csv')

def preprocess_text(text):
    # Eliminar HTML
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Convertir a minúsculas
    text = text.lower()

    # Eliminar caracteres especiales
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenización
    tokens = word_tokenize(text)

    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lematización
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

# Aplicar preprocesamiento
df['processed_review'] = df['review'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\velez\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\velez\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\velez\AppData\Roaming\nltk_data...
  text = BeautifulSoup(text, 'html.parser').get_text()


### B. Implementación del modelo Skip-gram

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        out = self.linear(embeds)
        return out

# Crear vocabulario
def create_vocabulary(processed_texts):
    word_counts = Counter()
    for text in processed_texts:
        word_counts.update(text.split())
    return {word: idx for idx, (word, _) in enumerate(word_counts.most_common())}

vocabulary = create_vocabulary(df['processed_review'])
vocab_size = len(vocabulary)
embedding_dim = 100

# Inicializar modelo
skip_gram = SkipGramModel(vocab_size, embedding_dim)

### C. Red Neuronal con Propagación hacia Atrás (NNBP)

In [3]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TextClassifier, self).__init__()
        self.embedding = skip_gram.embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]
        out = self.fc(lstm_out)
        return self.sigmoid(out)

# Parámetros
hidden_dim = 64
model = TextClassifier(vocab_size, embedding_dim, hidden_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

### D. Análisis Comparativo con SVM

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Vectorización TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['processed_review'])

# Dividir datos
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['sentiment'])

# Entrenar SVM
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Evaluación
svm_predictions = svm_classifier.predict(X_test)
print("Reporte de clasificación SVM:")
print(classification_report(y_test, svm_predictions))

Reporte de clasificación SVM:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      6213
           1       0.88      0.89      0.88      6287

    accuracy                           0.88     12500
   macro avg       0.88      0.88      0.88     12500
weighted avg       0.88      0.88      0.88     12500

