# Universidad Autónoma de Aguascalientes
# Departamento: Ciencias de la Computación
# Materia: Machine y Deep Learning
# Profesor: Dr. Francisco Javier Luna Rosas
# Alumnos: 
# Enrique Vélez Durán
# Gabriel Melchor Campos
# Carlos Fernando Nájera Ruiz
# Cristián Israel Donato Flores
#### Semestre: Enero-Junio 2025
## Críticas de Cine
## 

## Importación de Librerías

In [8]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

## Descargar recursos necesarios NLTK
## Intentar utilizar la GPU en caso de existir

In [9]:
# Fetch the NLTK resources requested by this notebook, in the original order.
for nltk_resource in ('punkt_tab', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Use the CUDA device when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\velez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\velez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\velez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\velez\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Carga y Preprocesamiento del Dataset
## Creación de vocabulario a partir de los tokens guardados de las críticas

In [10]:
# Load the movie-review table; later cells read its 'review' and 'sentiment' columns.
df = pd.read_csv('movie_data.csv')

# Patterns are compiled once because they are reused for every review.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_LETTER_RE = re.compile(r'[^a-z\s]')

# Filled on first use so the stop-word list and the lemmatizer are built only
# once instead of once per review.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Turn a raw review into a list of cleaned, lemmatized tokens.

    Steps: replace HTML tags with a space, lowercase, drop every character
    that is not ``a-z`` or whitespace, tokenize, remove English stop words
    and lemmatize the remaining tokens.

    Args:
        text: Raw review text. Any non-string value (for example ``None`` or
            a NaN coming from a missing cell) is treated as an empty review.

    Returns:
        A list of token strings; empty when there is nothing to tokenize.
    """
    if not isinstance(text, str):
        return []
    # Tags become a space (not ''), so words separated only by markup such as
    # "<br />" are not fused into a single token.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _NON_LETTER_RE.sub('', text.lower())
    stop_words, lemmatizer = _text_resources()
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

# Tokenize every review, then keep a space-joined copy of the cleaned tokens.
df['tokens'] = df['review'].apply(preprocess_text)
df['processed_review'] = df['tokens'].apply(' '.join)

def create_vocabulary(token_lists):
    """Map each distinct token to an integer index ordered by frequency.

    Args:
        token_lists: Iterable of token sequences.

    Returns:
        A dict ``{token: index}`` where index 0 belongs to the most frequent
        token, following the order given by ``Counter.most_common()``.
    """
    frequencies = Counter(
        token for tokens in token_lists for token in tokens
    )
    vocabulary = {}
    for rank, (word, _count) in enumerate(frequencies.most_common()):
        vocabulary[word] = rank
    return vocabulary

# Width of the word vectors used by the Skip-Gram model and the classifier input.
embedding_dim = 100
# Token -> index mapping built from all tokenized reviews, and its size.
vocabulary = create_vocabulary(df['tokens'])
vocab_size = len(vocabulary)

## Implementación del Modelo Skip-Gram

In [11]:
class SkipGramDataset(Dataset):
    """Skip-Gram training triples ``(center, other, label)`` with negative sampling.

    For every token and every neighbour inside the window, one positive triple
    ``(center, context, 1)`` is stored, followed by ``negative_samples``
    triples ``(center, negative, 0)`` whose word index is drawn uniformly from
    the vocabulary. A negative is never equal to the context word of the
    positive pair it accompanies, so a pair cannot receive both labels from
    the same draw.

    All triples are generated once, at construction time, and kept in
    ``self.data``.

    Args:
        token_lists: Iterable of token sequences (one per review).
        vocabulary: Mapping from token to integer index; tokens missing from
            it are skipped.
        window_size: Number of neighbours considered on each side.
        negative_samples: Number of negatives generated per positive pair.
    """

    def __init__(self, token_lists, vocabulary, window_size=2, negative_samples=5):
        self.data = []
        self.vocab_size = len(vocabulary)
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.vocabulary = vocabulary
        # A negative different from the context word only exists when the
        # vocabulary holds at least two words.
        can_sample = negative_samples > 0 and self.vocab_size > 1
        for tokens in token_lists:
            # Convert tokens to vocabulary indices.
            token_ids = [vocabulary[token] for token in tokens if token in vocabulary]
            for i, center in enumerate(token_ids):
                start = max(0, i - window_size)
                end = min(len(token_ids), i + window_size + 1)
                for j in range(start, end):
                    if i == j:
                        continue
                    context = token_ids[j]
                    # Positive pair.
                    self.data.append((center, context, 1))
                    if can_sample:
                        self.data.extend(
                            (center, negative, 0)
                            for negative in self._draw_negatives(context)
                        )

    def _draw_negatives(self, context):
        """Draw ``negative_samples`` uniform word indices, none equal to ``context``."""
        negatives = np.random.randint(0, self.vocab_size, size=self.negative_samples)
        clashes = negatives == context
        # Re-draw only the colliding positions until none is left.
        while clashes.any():
            negatives[clashes] = np.random.randint(
                0, self.vocab_size, size=int(clashes.sum())
            )
            clashes = negatives == context
        return negatives.tolist()

    def __len__(self):
        """Return the number of stored triples (positives plus negatives)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return triple ``idx`` as ``(long, long, float)`` scalar tensors."""
        center, context, label = self.data[idx]
        return (torch.tensor(center, dtype=torch.long),
                torch.tensor(context, dtype=torch.long),
                torch.tensor(label, dtype=torch.float))

# Build the Skip-Gram pair dataset and a shuffling DataLoader over it.
batch_size_sg = 256
skipgram_dataset = SkipGramDataset(
    df['tokens'],
    vocabulary,
    window_size=2,
    negative_samples=5,
)
skipgram_loader = DataLoader(skipgram_dataset, shuffle=True, batch_size=batch_size_sg)

## Modelo Skip-Gram: Dos embeddings (entrada y salida)

In [12]:
class SkipGramModel(nn.Module):
    """Skip-Gram scorer built from two embedding tables.

    ``in_embeddings`` is looked up with the center words and
    ``out_embeddings`` with the context words; both tables have
    ``vocab_size`` rows of ``embedding_dim`` values.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words, context_words):
        """Return one raw score per pair: the dot product of the two embeddings.

        The sum runs over dimension 1, so the inputs are expected to be 1-D
        index tensors of equal length.
        """
        center_vectors = self.in_embeddings(center_words)
        context_vectors = self.out_embeddings(context_words)
        pairwise_products = center_vectors * context_vectors
        return pairwise_products.sum(dim=1)

# The loss takes the raw dot-product scores (logits) returned by the model.
skipgram_criterion = nn.BCEWithLogitsLoss()
skipgram_model = SkipGramModel(vocab_size, embedding_dim)
skipgram_model = skipgram_model.to(device)
skipgram_optimizer = optim.Adam(skipgram_model.parameters(), lr=0.001)

## Entrenamiento del modelo Skip-Gram

In [13]:
# Train the Skip-Gram model; the mean batch loss is printed after every epoch.
skipgram_epochs = 5
for epoch in range(skipgram_epochs):
    total_loss = 0
    for batch in skipgram_loader:
        # Each batch holds (center ids, context-or-negative ids, 0/1 labels);
        # move all three to the training device.
        center, context, label = (part.to(device) for part in batch)
        skipgram_optimizer.zero_grad()
        outputs = skipgram_model(center, context)
        loss = skipgram_criterion(outputs, label)
        loss.backward()
        skipgram_optimizer.step()
        total_loss += loss.item()
    print(f"SkipGram Epoch {epoch+1}/{skipgram_epochs}, Loss: {total_loss/len(skipgram_loader):.4f}")

# Pull the trained input-embedding matrix out of the model as a NumPy array.
trained_weights = skipgram_model.in_embeddings.weight.detach()
embeddings = trained_weights.cpu().numpy()

# Represent a review as the average of the embeddings of its known words.
def get_review_embedding(tokens, vocabulary, embeddings):
    """Return the mean embedding of the tokens found in ``vocabulary``.

    Args:
        tokens: Sequence of tokens for one review.
        vocabulary: Mapping from token to row index in ``embeddings``.
        embeddings: 2-D NumPy array with one row per vocabulary entry.

    Returns:
        A 1-D array with ``embeddings.shape[1]`` values. When no token is in
        the vocabulary, a zero vector of that same width and dtype is
        returned, so the result does not depend on any module-level constant
        and all reviews share one dtype.
    """
    token_ids = [vocabulary[token] for token in tokens if token in vocabulary]
    if not token_ids:
        return np.zeros(embeddings.shape[1], dtype=embeddings.dtype)
    return np.mean(embeddings[token_ids], axis=0)

# Compute one averaged embedding per review; the extra arguments are forwarded by apply.
df['review_embedding'] = df['tokens'].apply(
    get_review_embedding, args=(vocabulary, embeddings)
)

KeyboardInterrupt: 

## Modelo de Red Neuronal (NNBP) Para Clasificación de Sentimientos

In [None]:
# Feature matrix: one stacked review embedding per row.
X = np.vstack(df['review_embedding'].values)
# Labels; 'sentiment' is assumed to be 0 (negative) or 1 (positive).
y = df['sentiment'].values

# Hold out 20% of the reviews for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert the splits to float32 tensors on the target device; labels become column vectors.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32).to(device)
    for features in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(labels, dtype=torch.float32).view(-1, 1).to(device)
    for labels in (y_train, y_test)
)

# Dataset y DataLoader para clasificación
class ReviewDataset(Dataset):
    """Pair a feature container ``X`` with a label container ``y`` by position."""

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

batch_size_cls = 32
train_dataset, test_dataset = (
    ReviewDataset(X_train_tensor, y_train_tensor),
    ReviewDataset(X_test_tensor, y_test_tensor),
)
# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size_cls)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size_cls)

## Definición del Modelo de Red Neuronal

In [None]:
class NNBPClassifier(nn.Module):
    """Feed-forward binary classifier with a single hidden layer.

    Layers, in order: Linear(input_dim, hidden_dim), ReLU,
    Linear(hidden_dim, 1), Sigmoid.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network, one value per input row."""
        hidden = self.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return self.sigmoid(logits)

hidden_dim = 64
# BCELoss takes probabilities, which matches the sigmoid output of the classifier.
criterion = nn.BCELoss()
classifier = NNBPClassifier(embedding_dim, hidden_dim)
classifier = classifier.to(device)
optimizer = optim.Adam(classifier.parameters(), lr=0.001)
# Train the neural network with mini-batches; the mean batch loss is printed per epoch.
classifier_epochs = 10
for epoch in range(classifier_epochs):
    # Put the module in training mode at the start of every epoch.
    classifier.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        outputs = classifier(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Classifier Epoch {epoch+1}/{classifier_epochs}, Loss: {total_loss/len(train_loader):.4f}')

# Evaluate the neural classifier on the test batches.
classifier.eval()
all_preds = []
all_true = []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = classifier(batch_X)
        # The classifier outputs sigmoid values; 0.5 is the decision threshold.
        preds = (outputs >= 0.5).float()
        all_preds.append(preds.cpu().numpy())
        all_true.append(batch_y.cpu().numpy())
# Flatten the (N, 1) batches into 1-D label vectors for the scikit-learn metrics.
y_pred_nn = np.vstack(all_preds).ravel()
y_true = np.vstack(all_true).ravel()
accuracy_nn = accuracy_score(y_true, y_pred_nn)
print(f'Neural Network Accuracy: {accuracy_nn:.4f}')
print(classification_report(y_true, y_pred_nn))
# Pin the label order so the matrix is always 2x2; without it the four-way
# unpacking below fails when only one class appears in labels and predictions.
cm_nn = confusion_matrix(y_true, y_pred_nn, labels=[0, 1])
tn_nn, fp_nn, fn_nn, tp_nn = cm_nn.ravel()
# Every ratio falls back to 0 when its denominator is 0; zero_division=0 makes
# precision_score follow the same convention without emitting a warning.
metrics_nn = {
    'Accuracy': accuracy_nn,
    'Specificity': tn_nn / (tn_nn + fp_nn) if (tn_nn+fp_nn)>0 else 0,
    'Sensitivity': tp_nn / (tp_nn + fn_nn) if (tp_nn+fn_nn)>0 else 0,
    'Precision': precision_score(y_true, y_pred_nn, zero_division=0),
    'False Positive Rate': fp_nn / (fp_nn + tn_nn) if (fp_nn+tn_nn)>0 else 0,
    'False Negative Rate': fn_nn / (fn_nn + tp_nn) if (fn_nn+tp_nn)>0 else 0,
    'Positive Predictive Value': tp_nn / (tp_nn + fp_nn) if (tp_nn+fp_nn)>0 else 0,
    'Negative Predictive Value': tn_nn / (tn_nn + fn_nn) if (tn_nn+fn_nn)>0 else 0
}

## Clasificador Tradicional: Regresión Logística

In [None]:
# Baseline: logistic regression trained on the same averaged review embeddings.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Logistic Regression Accuracy: {accuracy_lr:.4f}')
print(classification_report(y_test, y_pred_lr))
# Pin the label order so the matrix is always 2x2; without it the four-way
# unpacking below fails when only one class appears in labels and predictions.
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
tn_lr, fp_lr, fn_lr, tp_lr = cm_lr.ravel()
# Every ratio falls back to 0 when its denominator is 0; zero_division=0 makes
# precision_score follow the same convention without emitting a warning.
metrics_lr = {
    'Accuracy': accuracy_lr,
    'Specificity': tn_lr / (tn_lr + fp_lr) if (tn_lr+fp_lr)>0 else 0,
    'Sensitivity': tp_lr / (tp_lr + fn_lr) if (tp_lr+fn_lr)>0 else 0,
    'Precision': precision_score(y_test, y_pred_lr, zero_division=0),
    'False Positive Rate': fp_lr / (fp_lr + tn_lr) if (fp_lr+tn_lr)>0 else 0,
    'False Negative Rate': fn_lr / (fn_lr + tp_lr) if (fn_lr+tp_lr)>0 else 0,
    'Positive Predictive Value': tp_lr / (tp_lr + fp_lr) if (tp_lr+fp_lr)>0 else 0,
    'Negative Predictive Value': tn_lr / (tn_lr + fn_lr) if (tn_lr+fn_lr)>0 else 0
}

## Visualización Comparativa

In [None]:
# Grouped bar chart comparing both models metric by metric.
metrics_names = list(metrics_nn)
nn_values = [metrics_nn[name] for name in metrics_names]
lr_values = [metrics_lr[name] for name in metrics_names]

x = np.arange(len(metrics_names))
width = 0.35
half_width = width / 2
# (horizontal shift, values, legend label, bar colour) for each model.
bar_series = (
    (-half_width, nn_values, 'Neural Network', 'skyblue'),
    (half_width, lr_values, 'Logistic Regression', 'salmon'),
)

plt.figure(figsize=(12, 6))
for shift, values, legend_label, bar_color in bar_series:
    plt.bar(x + shift, values, width, label=legend_label, color=bar_color)

plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Comparative Performance Metrics')
plt.xticks(x, metrics_names, rotation=45)
plt.legend()

# Write the numeric value on top of every bar.
for shift, values, _, _ in bar_series:
    for i, v in enumerate(values):
        plt.text(i + shift, v, f'{v:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# One annotated confusion-matrix heatmap per model: neural network first,
# then logistic regression.
class_names = ['Negative', 'Positive']
heatmap_specs = (
    (cm_nn, 'Blues', 'Confusion Matrix - Neural Network'),
    (cm_lr, 'Reds', 'Confusion Matrix - Logistic Regression'),
)
for matrix, palette, heading in heatmap_specs:
    plt.figure(figsize=(8,6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette,
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(heading)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

# Print every metric of both models with three decimals.
metric_reports = (
    ("\nDetailed Metrics for Neural Network Model:", metrics_nn),
    ("\nDetailed Metrics for Logistic Regression Model:", metrics_lr),
)
for heading, model_metrics in metric_reports:
    print(heading)
    for metric, value in model_metrics.items():
        print(f"{metric}: {value:.3f}")