# Sentiment Classification

## Dataset Feature Extraction

In [2]:
import torch
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# path = "/content/drive/Shareddrives/G5/project-4-sentiment-classification/"
path = "./"

In [39]:
import pandas as pd

train_data = pd.read_csv(path + "train.csv")
test_data = pd.read_csv(path + "test.csv")

print(train_data.shape)
print(test_data.shape)
train_data.head()

(25000, 2)
(25000, 1)


Unnamed: 0,message,label
0,I saw this movie in NEW York city. I was waiti...,neg
1,This is a German film from 1974 that is someth...,neg
2,I attempted watching this movie twice and even...,neg
3,On his birthday a small boys tells his mother ...,neg
4,"The person who wrote the review ""enough with t...",pos


Adding nltk libraries for text processing

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Function that tokenize text

In [40]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha() and token.lower() not in stop_words]
    return ' '.join(tokens)

Add the tokenized text to a new column in train and test dataframes

In [41]:
train_data['tokens'] = train_data['message'].apply(preprocess_text)
test_data['tokens'] = test_data['message'].apply(preprocess_text)

In [42]:
train_data.head()

Unnamed: 0,message,label,tokens
0,I saw this movie in NEW York city. I was waiti...,neg,saw movie new york city waiting bus next morni...
1,This is a German film from 1974 that is someth...,neg,german film something woman come castle beyond...
2,I attempted watching this movie twice and even...,neg,attempted watching movie twice even fast forwa...
3,On his birthday a small boys tells his mother ...,neg,birthday small boy tell mother son want go hom...
4,"The person who wrote the review ""enough with t...",pos,person wrote review enough sweating spitting a...


In [43]:
test_data.head()

Unnamed: 0,message,tokens
0,Acclaimed Argentine horror director Emilio Vie...,acclaimed argentine horror director emilio vie...
1,I don't know if it's fair for me to review thi...,know fair review fan gratuitous violence never...
2,The only good thing about Persepolis is the sh...,good thing persepolis shadow created german an...
3,I completely forgot that I'd seen this within ...,completely forgot seen within couple day prett...
4,B. Kennedy tried to make a sequel by exaggerat...,kennedy tried make sequel exaggerating gargant...


Encoding the labels (0: pos, 1: neg)

In [44]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()  
train_data['label'] = label_encoder.fit_transform(train_data['label'])

In [45]:
train_data.head()

Unnamed: 0,message,label,tokens
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...


### Word Embedding (Word2Vec)

Method to feature text extraction. The class Word2Vec is a neural network that is trained with all the tokens of the text data

In [46]:
from gensim.models import Word2Vec

sentences = train_data['tokens'].apply(lambda x: x.split()).to_list()

word2vec = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=1)

Training the word2vec model for test data

In [47]:
sentences_test = test_data['tokens'].apply(lambda x: x.split()).to_list()

word2vec_test = Word2Vec(sentences=sentences_test, vector_size=100, window=5, min_count=1, sg=1)

Saving the model

In [49]:
# Guardar el modelo
word2vec.save(path + "word2vec_train_model")
word2vec_test.save(path + "word2vec_test_model")

# Cargar el modelo
# word2vec = Word2Vec.load("word2vec_test_model")
# word2vec_test = Word2Vec.load("word2vec_test_model")

Function that maps all the tokens of each row with its corresponding feature vector

In [50]:
def sentence_to_vectors(sentence, model, vector_size=100):
    vectors = []
    for word in sentence.split():
        if word in model.wv:
            vectors.append(model.wv[word])
        else:
            vectors.append([0] * vector_size)
    return vectors

In [21]:
max_len = max(train_data['tokens'].apply(lambda text: len(text.split())))
mean_len = round(np.mean(train_data['tokens'].apply(lambda text: len(text.split()))))

print(f'Max length: {max_len}')
print(f'Mean length: {mean_len}')

Max length: 1421
Mean length: 119


Saving the feature vectors in a new column and normalizing the vectors

In [51]:
train_data['vectors'] = train_data['tokens'].apply(lambda x: sentence_to_vectors(x, word2vec))
test_data['vectors'] = test_data['tokens'].apply(lambda x: sentence_to_vectors(x, word2vec_test))

In [52]:
train_data.head()

Unnamed: 0,message,label,tokens,vectors
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...,"[[0.09326658, 0.10232236, -0.5063842, -0.41179..."
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...,"[[-0.32682985, 0.27852288, -0.18955359, -0.486..."
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...,"[[-0.03599018, -0.006980085, 0.29945117, 0.421..."
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...,"[[0.06317541, 0.4016989, -0.64858943, -0.37955..."
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...,"[[-0.074894525, 0.2501159, -0.14314865, 0.4145..."


Normalize the feature vectors

In [53]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_data['vectors'] = train_data['vectors'].apply(lambda x: scaler.fit_transform(x))
test_data['vectors'] = test_data['vectors'].apply(lambda x: scaler.fit_transform(x))

In [102]:
train_data.head()

Unnamed: 0,message,label,tokens,vectors,tfidf
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...,"[[0.7613983973953181, 0.37214972655208, 0.0183...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...,"[[0.35351079310495453, 0.5796608186564652, 0.3...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...,"[[0.6356779305619685, 0.25589794030384716, 0.8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...,"[[0.5706008552945276, 0.826410954348949, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...,"[[0.7012216743999262, 0.6190995233686329, 0.41...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [56]:
test_data.head()

Unnamed: 0,message,tokens,vectors
0,Acclaimed Argentine horror director Emilio Vie...,acclaimed argentine horror director emilio vie...,"[[0.2517718570337515, 0.4632532813656189, 0.56..."
1,I don't know if it's fair for me to review thi...,know fair review fan gratuitous violence never...,"[[0.414121311590642, 0.19043247456763368, 0.53..."
2,The only good thing about Persepolis is the sh...,good thing persepolis shadow created german an...,"[[0.5413249950897219, 0.661586795968182, 0.496..."
3,I completely forgot that I'd seen this within ...,completely forgot seen within couple day prett...,"[[0.5895263409703918, 0.4695384655903072, 0.61..."
4,B. Kennedy tried to make a sequel by exaggerat...,kennedy tried make sequel exaggerating gargant...,"[[0.6394430032299151, 0.4203580853765238, 0.41..."


### TF-IDF Vectorizer

This method uses the TF-IDF algorithm to extract features from the text data according to the frequency and importance of each word in the row

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), lowercase=True)
X_train_tfidf = vectorizer.fit_transform(train_data['tokens']).toarray()
X_test_tfidf = vectorizer.transform(test_data['tokens']).toarray()

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(25000, 5000)
(25000, 5000)


In [None]:
print(np.unique(X_train_tfidf[4]))

[0.         0.02946132 0.03679143 0.05483793 0.05497426 0.05987872
 0.06179367 0.06449474 0.06589665 0.06796903 0.07032864 0.07417273
 0.07567818 0.07883546 0.07951672 0.08184364 0.08205103 0.08275188
 0.08418937 0.08430638 0.08566853 0.08757607 0.08869587 0.08991665
 0.09313713 0.09370546 0.09634121 0.09914383 0.09950548 0.09982102
 0.10428246 0.11119358 0.1127536  0.11350027 0.11473618 0.11605794
 0.1177484  0.11960372 0.11975421 0.1218273  0.1239646  0.12724801
 0.14180076 0.17514435 0.18733408 0.21296585 0.23041254 0.23861249
 0.31767079 0.32054147 0.44773859]


: 

In [63]:
train_data['tfidf'] = list(X_train_tfidf)
test_data['tfidf'] = list(X_test_tfidf)

In [46]:
train_data.head()

Unnamed: 0,message,label,tokens,vectors,tfidf
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...,"[[0.7635628566251051, 0.43391089419340095, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...,"[[0.2417061183155192, 0.5087659396431822, 0.38...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...,"[[0.6134121699946612, 0.29120593555177465, 0.6...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...,"[[0.6755493421548452, 0.8097028871608277, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...,"[[0.7727563516445829, 0.5337629343724954, 0.40...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [64]:
train_data.to_csv(path + "train_data_preprocessed.csv", index=False)
test_data.to_csv(path + "test_data_preprocessed.csv", index=False)

In [9]:
max_len = max(train_data['tokens'].apply(lambda text: len(text.split())))
mean_len = np.mean(train_data['tokens'].apply(lambda text: len(text.split())))

print(f'Max length: {max_len}')
print(f'Mean length: {round(mean_len)}')

Max length: 1421
Mean length: 119


This function truncate each row that have more than `mean_len` words or adds a zeroes vector for each row that have less than `mean_len` words. Transforms the data into torch


In [33]:
import numpy as np

mean_len = int(mean_len)

def pad_sentences(vectors, max_len, vector_size=100):
    if len(vectors) > max_len:
        vectors = vectors[:max_len]
    else:
        padding = np.zeros((max_len - len(vectors), vector_size))
        vectors = np.vstack([vectors, padding])
    return vectors.tolist()

In [36]:
train_data['tensor'] = train_data['vectors'].apply(lambda x: pad_sentences(x, mean_len, word2vec.vector_size))
test_data['tensor'] = test_data['vectors'].apply(lambda x: pad_sentences(x, mean_len, word2vec_test.vector_size))

## Models


### Word2Vec

In [None]:
X = torch.stack(train_data['padded_vectors'].tolist())
y = torch.tensor(train_data['label'].values)

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split

# Crear un dataset de PyTorch
dataset = TensorDataset(X, y)

# Dividir en entrenamiento y prueba
train_size = int(0.8 * len(dataset))  # 80% para entrenamiento
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Crear DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Inicializar el modelo, la función de pérdida y el optimizador
input_dim = word2vec.vector_size
hidden_dim = 128
output_dim = 2
model = LSTMClassifier(input_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Entrenamiento
epochs = 20
losses = []
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for inputs, labels in train_loader:
        # inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

Epoch 1/20, Loss: 0.6716709010601044
Epoch 2/20, Loss: 0.6755033177375793
Epoch 3/20, Loss: 0.6814942306518554
Epoch 4/20, Loss: 0.5333627909660339
Epoch 5/20, Loss: 0.3804534206867218
Epoch 6/20, Loss: 0.3511510826826096
Epoch 7/20, Loss: 0.33705521540641786
Epoch 8/20, Loss: 0.32991589548587796
Epoch 9/20, Loss: 0.32034432963728904
Epoch 10/20, Loss: 0.31565868777036665
Epoch 11/20, Loss: 0.31078529708385466
Epoch 12/20, Loss: 0.30663199263811114
Epoch 13/20, Loss: 0.29701457860469815
Epoch 14/20, Loss: 0.2948506070137024
Epoch 15/20, Loss: 0.2878551222205162
Epoch 16/20, Loss: 0.281745883500576
Epoch 17/20, Loss: 0.27399428837299344
Epoch 18/20, Loss: 0.266418141579628
Epoch 19/20, Loss: 0.25981831710338593
Epoch 20/20, Loss: 0.24420327085256577


In [None]:
# Evaluar el modelo
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 85.34%


### TfidfVectorizer

In [None]:
X_tensor = torch.tensor(X_tfidf, dtype=torch.float32)
y_tensor = torch.tensor(train_data['label'].values)

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split

# Crear un dataset de PyTorch
dataset = TensorDataset(X_tensor, y_tensor)

# Dividir en entrenamiento y prueba
train_size = int(0.8 * len(dataset))  # 80% para entrenamiento
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Crear DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Inicializar el modelo, la función de pérdida y el optimizador
input_dim = 5000  # TF-IDF max_features
hidden_dim = 128  # Número de unidades ocultas
output_dim = 2    # Número de clases
model = LSTMClassifier(input_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Entrenamiento
epochs = 10
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

Epoch 1/10, Loss: 0.2088390737771988
Epoch 2/10, Loss: 0.3516410291194916
Epoch 3/10, Loss: 0.29308614134788513
Epoch 4/10, Loss: 0.1689564734697342
Epoch 5/10, Loss: 0.14249277114868164
Epoch 6/10, Loss: 0.06334441900253296
Epoch 7/10, Loss: 0.08491527289152145
Epoch 8/10, Loss: 0.062416449189186096
Epoch 9/10, Loss: 0.07067599147558212
Epoch 10/10, Loss: 0.03959621489048004


In [None]:
# Evaluar el modelo
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 83.72%
