# Sentiment Classification

## Dataset Feature Extraction

In [2]:
import torch
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
# print(torch.cuda.get_device_name(0))

Using device: cpu


In [3]:
path = "./"

In [4]:
import pandas as pd

train_data = pd.read_csv(path + "train.csv")
test_data = pd.read_csv(path + "test.csv")

print(train_data.shape)
print(test_data.shape)
train_data.head()

(25000, 2)
(25000, 1)


Unnamed: 0,message,label
0,I saw this movie in NEW York city. I was waiti...,neg
1,This is a German film from 1974 that is someth...,neg
2,I attempted watching this movie twice and even...,neg
3,On his birthday a small boys tells his mother ...,neg
4,"The person who wrote the review ""enough with t...",pos


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha() and token.lower() not in stop_words]
    return ' '.join(tokens)

In [7]:
train_data['tokens'] = train_data['message'].apply(preprocess_text)
test_data['tokens'] = test_data['message'].apply(preprocess_text)

In [8]:
train_data.head()

Unnamed: 0,message,label,tokens
0,I saw this movie in NEW York city. I was waiti...,neg,saw movie new york city waiting bus next morni...
1,This is a German film from 1974 that is someth...,neg,german film something woman come castle beyond...
2,I attempted watching this movie twice and even...,neg,attempted watching movie twice even fast forwa...
3,On his birthday a small boys tells his mother ...,neg,birthday small boy tell mother son want go hom...
4,"The person who wrote the review ""enough with t...",pos,person wrote review enough sweating spitting a...


In [9]:
test_data.head()

Unnamed: 0,message,tokens
0,Acclaimed Argentine horror director Emilio Vie...,acclaimed argentine horror director emilio vie...
1,I don't know if it's fair for me to review thi...,know fair review fan gratuitous violence never...
2,The only good thing about Persepolis is the sh...,good thing persepolis shadow created german an...
3,I completely forgot that I'd seen this within ...,completely forgot seen within couple day prett...
4,B. Kennedy tried to make a sequel by exaggerat...,kennedy tried make sequel exaggerating gargant...


In [24]:
max_len = max(train_data['tokens'].apply(lambda text: len(text.split())))
mean_len = int(round(np.mean(train_data['tokens'].apply(lambda text: len(text.split())))))

print(f'Max length: {max_len}')
print(f'Mean length: {mean_len}')

Max length: 1421
Mean length: 119


In [34]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])

In [35]:
train_data.head()

Unnamed: 0,message,label,tokens,vectors,padded_vectors
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...,"[[-0.19189617, -0.028692013, -0.32174656, -0.2...","[[tensor(-0.1919), tensor(-0.0287), tensor(-0...."
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...,"[[-0.47960114, 0.5583763, -0.1370875, -0.44397...","[[tensor(-0.4796), tensor(0.5584), tensor(-0.1..."
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...,"[[-0.13749087, 0.18824586, 0.024570609, 0.1324...","[[tensor(-0.1375), tensor(0.1882), tensor(0.02..."
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...,"[[0.117996156, 0.49617633, -0.494434, -0.36352...","[[tensor(0.1180), tensor(0.4962), tensor(-0.49..."
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...,"[[0.048615, 0.3482453, -0.36405486, 0.4022355,...","[[tensor(0.0486), tensor(0.3482), tensor(-0.36..."


### Word Embedding (Word2Vec)

In [13]:
from gensim.models import Word2Vec

sentences = train_data['tokens'].apply(lambda x: x.split()).to_list()

word2vec = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=1)

In [15]:
# Guardar el modelo
word2vec.save(path + "word2vec_model")

# Cargar el modelo
# word2vec = Word2Vec.load("word2vec_model")

In [14]:
def sentence_to_vectors(sentence, model, vector_size=100):
    vectors = []
    for word in sentence.split():
        if word in model.wv:
            vectors.append(model.wv[word])
        else:
            vectors.append([0] * vector_size)
    return vectors

In [15]:
train_data['vectors'] = train_data['tokens'].apply(lambda x: sentence_to_vectors(x, word2vec))

In [21]:
train_data['vectors'] = train_data['vectors'].apply(np.array)

In [37]:
train_data.head()

Unnamed: 0,message,label,tokens,vectors,padded_vectors
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...,"[[-0.19189617, -0.028692013, -0.32174656, -0.2...","[[tensor(-0.1919), tensor(-0.0287), tensor(-0...."
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...,"[[-0.47960114, 0.5583763, -0.1370875, -0.44397...","[[tensor(-0.4796), tensor(0.5584), tensor(-0.1..."
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...,"[[-0.13749087, 0.18824586, 0.024570609, 0.1324...","[[tensor(-0.1375), tensor(0.1882), tensor(0.02..."
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...,"[[0.117996156, 0.49617633, -0.494434, -0.36352...","[[tensor(0.1180), tensor(0.4962), tensor(-0.49..."
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...,"[[0.048615, 0.3482453, -0.36405486, 0.4022355,...","[[tensor(0.0486), tensor(0.3482), tensor(-0.36..."


In [23]:
from torch.nn.utils.rnn import pad_sequence
import torch
import numpy as np

def pad_sentences(vectors, max_len, vector_size=100):
    if len(vectors) > max_len:
        vectors = vectors[:max_len]
    else:
        padding = np.zeros((max_len - len(vectors), vector_size))
        vectors = np.vstack([vectors, padding])
    return torch.tensor(vectors, dtype=torch.float32)


In [25]:
train_data['padded_vectors'] = train_data['vectors'].apply(lambda x: pad_sentences(np.array(x), mean_len, word2vec.vector_size))

In [39]:
train_data.head()

Unnamed: 0,message,label,tokens,vectors,padded_vectors
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...,"[[-0.19189617, -0.028692013, -0.32174656, -0.2...","[[tensor(-0.1919), tensor(-0.0287), tensor(-0...."
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...,"[[-0.47960114, 0.5583763, -0.1370875, -0.44397...","[[tensor(-0.4796), tensor(0.5584), tensor(-0.1..."
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...,"[[-0.13749087, 0.18824586, 0.024570609, 0.1324...","[[tensor(-0.1375), tensor(0.1882), tensor(0.02..."
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...,"[[0.117996156, 0.49617633, -0.494434, -0.36352...","[[tensor(0.1180), tensor(0.4962), tensor(-0.49..."
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...,"[[0.048615, 0.3482453, -0.36405486, 0.4022355,...","[[tensor(0.0486), tensor(0.3482), tensor(-0.36..."


In [42]:
len(train_data['tokens'][2].split())

129

In [21]:
train_data.to_csv(path + "train_data_preprocessed.csv", index=False)

### TF-IDF Vectorizer

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), lowercase=True)
X_tfidf = vectorizer.fit_transform(train_data['tokens']).toarray()

print(X_tfidf.shape)

(25000, 5000)


## Models


In [28]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        #x = x.unsqueeze(1)  # (batch_size, 1, input_dim)
        _, (hidden, _) = self.lstm(x)  # Solo usamos el estado oculto final
        hidden = hidden[-1]  # Si hay múltiples capas, tomamos la última
        out = self.fc(hidden)
        return out


In [32]:
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size=3, stride=1, padding=1, layers=1, batch_norm=False, dropout=False, mlp_layers=2, activation='relu', output_type='sigmoid'):
        super(CNN, self).__init__()
        self.layers = layers
        self.conv_blocks = nn.ModuleList()
        in_channels = input_dim
        out_channels = output_dim

        # Seleccionar la función de activación
        if activation == 'relu':
            self.activation_fn = nn.ReLU()
        elif activation == 'leaky_relu':
            self.activation_fn = nn.LeakyReLU(0.1)
        elif activation == 'tanh':
            self.activation_fn = nn.Tanh()
        else:
            raise ValueError(f"Activation {activation} not supported")

        # Seleccionar el tipo de salida
        if output_type == 'sigmoid':
            self.output_fn = nn.Sigmoid()
        elif output_type == 'linear':
            self.output_fn = nn.Identity()  # Salida sin función de activación
        else:
            raise ValueError(f"Output type {output_type} not supported")

        # Capas convolucionales
        for i in range(layers):
            self.conv_blocks.append(
                nn.Sequential(
                    nn.Conv1d(
                        in_channels=in_channels,
                        out_channels=out_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        padding=padding
                    ),
                    self.activation_fn,
                    nn.BatchNorm1d(out_channels) if batch_norm else nn.Identity(),
                    nn.Dropout(0.3) if dropout else nn.Identity(),
                    nn.MaxPool1d(kernel_size=2)
                )
            )
            in_channels = out_channels
            out_channels *= 2

        # Capas MLP configurables
        self.mlp_layers = mlp_layers
        self.fc_layers = nn.ModuleList()

    def forward(self, x):
        x = x.permute(0, 2, 1)  # Reorganiza las dimensiones a [batch_size, input_dim, sequence_length]

        # Pasar por las capas convolucionales
        for block in self.conv_blocks:
            x = block(x)

        # Asegurar que las capas MLP se inicialicen correctamente
        if not self.fc_layers:
            final_output_dim = x.size(1) * x.size(2)  # canales_finales * longitud_final
            input_dim = final_output_dim
            for _ in range(self.mlp_layers - 1):
                self.fc_layers.append(nn.Linear(input_dim, input_dim // 2))
                self.fc_layers.append(self.activation_fn)
                input_dim //= 2
            self.fc_layers.append(nn.Linear(input_dim, 1))  # Última capa
            self.fc_layers = nn.ModuleList(self.fc_layers).to(x.device)

        x = x.view(x.size(0), -1)  # Aplana para la capa completamente conectada

        # Pasar por las capas MLP
        for layer in self.fc_layers:
            x = layer(x)

        x = self.output_fn(x)  # Aplicar la función de salida seleccionada
        return x


### Word2Vec

In [29]:
X = torch.stack(train_data['padded_vectors'].tolist())
y = torch.tensor(train_data['label'].values)

In [36]:
print(X.shape)
print(y.shape)

torch.Size([25000, 119, 100])
torch.Size([25000])


In [31]:
from torch.utils.data import DataLoader, TensorDataset, random_split

# Crear un dataset de PyTorch
dataset = TensorDataset(X, y)

# Dividir en entrenamiento y prueba
train_size = int(0.8 * len(dataset))  # 80% para entrenamiento
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Crear DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Inicializar el modelo, la función de pérdida y el optimizador
input_dim = word2vec.vector_size
hidden_dim = 128
output_dim = 2
model = LSTMClassifier(input_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Entrenamiento
epochs = 20
losses = []
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for inputs, labels in train_loader:
        # inputs, labels = inputs.to(device), labels.to(device)
        labels = labels.long()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

Epoch 1/20, Loss: 0.664803598499298
Epoch 2/20, Loss: 0.6548450751304626
Epoch 3/20, Loss: 0.6547182178020478
Epoch 4/20, Loss: 0.4216099470734596
Epoch 5/20, Loss: 0.33870818737745284
Epoch 6/20, Loss: 0.3217929998636246
Epoch 7/20, Loss: 0.3096813480257988
Epoch 8/20, Loss: 0.3064695837497711
Epoch 9/20, Loss: 0.2974167529821396
Epoch 10/20, Loss: 0.29189534715414045
Epoch 11/20, Loss: 0.2886060152411461
Epoch 12/20, Loss: 0.3598038797259331
Epoch 13/20, Loss: 0.3541577285885811
Epoch 14/20, Loss: 0.26870052126646043
Epoch 15/20, Loss: 0.2629007907927036
Epoch 16/20, Loss: 0.2556140527009964
Epoch 17/20, Loss: 0.2469605266213417
Epoch 18/20, Loss: 0.2379913802653551
Epoch 19/20, Loss: 0.22532076367139817
Epoch 20/20, Loss: 0.21644155465364456


In [None]:
from itertools import product
import torch

# Asegúrate de que estás utilizando la GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Parámetros del experimento
layers_range = range(3, 6)  # De 3 a 6 capas convolucionales
batch_norm_options = [True, False]
dropout_options = [True, False]
mlp_layer_range = range(2, 6)  # De 2 a 3 capas en el MLP final
activations = ['relu', 'leaky_relu', 'tanh']  # Funciones de activación
output_types = ['sigmoid', 'linear']  # Tipos de salida

# Generar todas las combinaciones de parámetros
parameter_combinations = list(product(layers_range, batch_norm_options, dropout_options, mlp_layer_range, activations, output_types))

results = []

# Loop por cada combinación de parámetros
for layers, batch_norm, dropout, mlp_layers, activation, output_type in parameter_combinations:
    print(f"Training with layers={layers}, batch_norm={batch_norm}, dropout={dropout}, mlp_layers={mlp_layers}, activation={activation}, output_type={output_type}")

    # Inicializar modelo
    model = CNN(input_dim=word2vec.vector_size, output_dim=16, layers=layers, batch_norm=batch_norm, dropout=dropout, mlp_layers=mlp_layers, activation=activation, output_type=output_type)
    model = model.to(device)  # Mueve el modelo a la GPU

    # Configurar criterio de pérdida y optimizador
    criterion = nn.MSELoss()  # Pérdida para regresión
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Entrenamiento
    epochs = 20
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for inputs, labels in train_loader:
            labels = labels.float().to(device)  # Mueve las etiquetas a la GPU
            inputs = inputs.float().to(device)  # Mueve los datos a la GPU
            outputs = model(inputs).squeeze()  # Asegúrate de que los outputs sean del mismo tamaño que las etiquetas
            loss = criterion(outputs, labels)  # Calcula la pérdida
            optimizer.zero_grad()
            loss.backward()  # Backward pass
            optimizer.step()  # Actualización del optimizador
            epoch_loss += loss.item()

    # Evaluar el modelo
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.float().to(device)  # Mueve los datos a la GPU
            labels = labels.to(device).float()  # Asegúrate de que las etiquetas sean float
            outputs = model(inputs).squeeze()
            predicted = (outputs >= 0.5).float()  # Predicciones binarizadas
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Guardar resultados
    results.append({
        "layers": layers,
        "batch_norm": batch_norm,
        "dropout": dropout,
        "mlp_layers": mlp_layers,
        "activation": activation,
        "output_type": output_type,
        "accuracy": accuracy
    })

# Ordenar resultados por precisión
sorted_results = sorted(results, key=lambda x: x["accuracy"], reverse=True)

# Imprimir mejores resultados
print("\nTop Results:")
for result in sorted_results[:5]:  # Los 5 mejores
    print(result)


Using device: cuda
Training with layers=3, batch_norm=True, dropout=True, mlp_layers=2, activation=relu, output_type=sigmoid
Accuracy: 86.98%
Training with layers=3, batch_norm=True, dropout=True, mlp_layers=2, activation=relu, output_type=linear
Accuracy: 85.28%
Training with layers=3, batch_norm=True, dropout=True, mlp_layers=2, activation=leaky_relu, output_type=sigmoid
Accuracy: 86.50%
Training with layers=3, batch_norm=True, dropout=True, mlp_layers=2, activation=leaky_relu, output_type=linear
Accuracy: 85.02%
Training with layers=3, batch_norm=True, dropout=True, mlp_layers=2, activation=tanh, output_type=sigmoid
Accuracy: 86.80%
Training with layers=3, batch_norm=True, dropout=True, mlp_layers=2, activation=tanh, output_type=linear
Accuracy: 85.52%
Training with layers=3, batch_norm=True, dropout=True, mlp_layers=3, activation=relu, output_type=sigmoid
Accuracy: 86.38%
Training with layers=3, batch_norm=True, dropout=True, mlp_layers=3, activation=relu, output_type=linear
Accura

In [33]:
# Evaluar el modelo
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 86.04%


### TfidfVectorizer

In [None]:
X_tensor = torch.tensor(X_tfidf, dtype=torch.float32)
y_tensor = torch.tensor(train_data['label'].values)

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split

# Crear un dataset de PyTorch
dataset = TensorDataset(X_tensor, y_tensor)

# Dividir en entrenamiento y prueba
train_size = int(0.8 * len(dataset))  # 80% para entrenamiento
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Crear DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Inicializar el modelo, la función de pérdida y el optimizador
input_dim = 5000  # TF-IDF max_features
hidden_dim = 128  # Número de unidades ocultas
output_dim = 2    # Número de clases
model = LSTMClassifier(input_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Entrenamiento
epochs = 10
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

In [None]:
# Evaluar el modelo
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")
