# Sentiment Classification

## Dataset Feature Extraction

In [1]:
import torch
import numpy as np

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


In [None]:
# Mount Google Drive only when the notebook is running inside Colab.
# On a local kernel `google.colab` is not installed; skipping the mount there
# lets "Restart & Run All" proceed and read the data from the local `path`.
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available; skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

In [5]:
# Directory prefix for the input CSVs (train.csv / test.csv) and for the
# preprocessed output. It is concatenated directly with the file name, so it
# must end with a path separator.
# Colab alternative (requires Google Drive to be mounted):
# path = "/content/drive/Shareddrives/G5/project-4-sentiment-classification/"
path = "./"

In [6]:
import pandas as pd

# Load both splits from `path`.
train_data = pd.read_csv(path + "train.csv")
test_data = pd.read_csv(path + "test.csv")

# Quick sanity check on row/column counts (train first, then test), followed by a preview.
print(train_data.shape, test_data.shape, sep="\n")
train_data.head()

(25000, 2)
(25000, 1)


Unnamed: 0,message,label
0,I saw this movie in NEW York city. I was waiti...,neg
1,This is a German film from 1974 that is someth...,neg
2,I attempted watching this movie twice and even...,neg
3,On his birthday a small boys tells his mother ...,neg
4,"The person who wrote the review ""enough with t...",pos


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources that preprocess_text relies on. Packages that are
# already present locally are reported as up-to-date rather than re-downloaded.
nltk.download('stopwords')  # word list behind stopwords.words('english')
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
nltk.download('punkt_tab')  # tokenizer data; presumably what word_tokenize loads -- confirm for your NLTK version


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [8]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocessing_resources():
    """Build the lemmatizer and the English stop-word set once and reuse them.

    Creating these per call means re-reading the stop-word corpus and
    re-instantiating the lemmatizer for every row of the dataset.
    """
    return WordNetLemmatizer(), frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one raw review into a space-joined string of lemmas.

    Steps: lowercase the text, tokenise it with NLTK, keep only purely
    alphabetic tokens that are not English stop words, and lemmatise each
    remaining token.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        A single string with the surviving lemmas separated by one space.
        The string is empty when no token survives the filters.
    """
    lemmatizer, stop_words = _preprocessing_resources()
    # The text is lowercased before tokenising, so the tokens need no
    # further case folding when filtering or lemmatising.
    tokens = word_tokenize(text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

In [9]:
# Clean every training review; each row of `tokens` is one space-joined string of lemmas.
train_data['tokens'] = train_data['message'].map(preprocess_text)
# test_data_preprocessed = test_data['message'].apply(preprocess_text)

In [10]:
# Preview the frame to confirm the new `tokens` column sits next to the raw text.
train_data.head()

Unnamed: 0,message,label,tokens
0,I saw this movie in NEW York city. I was waiti...,neg,saw movie new york city waiting bus next morni...
1,This is a German film from 1974 that is someth...,neg,german film something woman come castle beyond...
2,I attempted watching this movie twice and even...,neg,attempted watching movie twice even fast forwa...
3,On his birthday a small boys tells his mother ...,neg,birthday small boy tell mother son want go hom...
4,"The person who wrote the review ""enough with t...",pos,person wrote review enough sweating spitting a...


In [11]:
# Length, in tokens, of the longest preprocessed review; used later as the padding length.
max_len = int(train_data['tokens'].str.split().str.len().max())
print(max_len)

1421


In [12]:
from sklearn.preprocessing import LabelEncoder

# Map the string sentiment labels to integer class ids ('neg' -> 0, 'pos' -> 1 for this dataset).
# NOTE: the `label` column is overwritten in place, so re-running this cell refits the
# encoder on the integer ids and label_encoder.classes_ no longer holds the original names.
label_encoder = LabelEncoder().fit(train_data['label'])
train_data['label'] = label_encoder.transform(train_data['label'])

In [13]:
# Preview the frame to confirm `label` now holds the integer ids produced by the encoder.
train_data.head()

Unnamed: 0,message,label,tokens
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...


In [None]:
# train_data_preprocessed = pd.concat([train_data_preprocessed, train_data['label']], axis=1)

### Word Embedding

In [15]:
from gensim.models import Word2Vec

# Word2Vec expects an iterable of token lists, so split each cleaned review back into words.
sentences = [review.split() for review in train_data['tokens']]

# 100-dimensional skip-gram (sg=1) embeddings with a 5-word context window.
# min_count=1 keeps every token seen in training in the vocabulary.
# NOTE(review): no explicit seed/worker settings are passed, so embeddings may
# differ between runs -- confirm whether reproducibility matters here.
word2vec = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=1)

In [16]:
def sentence_to_vectors(sentence, model, vector_size=100):
    """Look up an embedding for every whitespace-separated word in ``sentence``.

    Args:
        sentence: Space-joined token string.
        model: Object exposing ``.wv``, a container that supports
            ``word in model.wv`` and ``model.wv[word]`` (e.g. a trained
            gensim ``Word2Vec`` model).
        vector_size: Length of the zero vector substituted for words that
            are missing from ``model.wv``. It should equal the model's
            embedding dimensionality so all rows have the same width.

    Returns:
        A list with one vector per word, in sentence order. Words missing
        from the model are represented by a float32 NumPy zero vector, so
        the placeholder is an array like the looked-up embeddings rather
        than a Python list of ints. An empty sentence yields an empty list.
    """
    keyed_vectors = model.wv
    oov_vector = np.zeros(vector_size, dtype=np.float32)
    return [
        # Copy the placeholder so callers never share one mutable zero array.
        keyed_vectors[word] if word in keyed_vectors else oov_vector.copy()
        for word in sentence.split()
    ]

In [17]:
# Replace each token string with the list of its per-word embedding vectors.
train_data['vectors'] = train_data['tokens'].map(lambda tokens: sentence_to_vectors(tokens, word2vec))

In [18]:
# Convert each row's list of word vectors into a single NumPy array
# (one row per word for non-empty reviews).
train_data['vectors'] = train_data['vectors'].apply(np.array)

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch
import numpy as np

# Reduce max_len
def pad_sentences(vectors, max_len, vector_size=100):
    """Truncate or zero-pad a sequence of word vectors to exactly ``max_len`` rows.

    Args:
        vectors: Array-like of shape ``(n_words, vector_size)``. An empty
            input (a review with no surviving tokens) is accepted and
            produces an all-zero result.
        max_len: Number of rows in the returned tensor.
        vector_size: Width of each word vector; used for the padding rows
            and to give an empty input the right shape.

    Returns:
        A ``torch.float32`` tensor of shape ``(max_len, vector_size)``
        holding the first ``max_len`` vectors followed by zero rows.
    """
    vectors = np.asarray(vectors, dtype=np.float32)
    if vectors.size == 0:
        # An empty review arrives as a 1-D array of shape (0,), which
        # np.vstack cannot combine with (max_len, vector_size) padding.
        vectors = vectors.reshape(0, vector_size)
    vectors = vectors[:max_len]
    padding = np.zeros((max_len - len(vectors), vector_size), dtype=np.float32)
    return torch.tensor(np.vstack([vectors, padding]), dtype=torch.float32)


In [20]:
# Bring every review to the same (max_len, vector_size) shape so the rows can be stacked.
# NOTE(review): with the values printed in this notebook (25,000 rows, max_len = 1421,
# 100-dim float32) the padded tensors total roughly 14 GB -- consider capping max_len.
train_data['padded_vectors'] = train_data['vectors'].map(
    lambda word_vectors: pad_sentences(np.array(word_vectors), max_len, word2vec.vector_size)
)

In [26]:
# Persist only the text/label columns. `vectors` and `padded_vectors` hold NumPy
# arrays and torch tensors, which CSV would store as summarised string reprs
# (large arrays are printed with "..." elisions) that cannot be parsed back.
# Rebuild them from `tokens` with the Word2Vec model instead.
# errors='ignore' keeps this cell runnable before those columns exist.
train_data.drop(columns=['vectors', 'padded_vectors'], errors='ignore').to_csv(
    path + 'train_preprocessed.csv', index=False
)

In [25]:
# Stack the per-review padded tensors into one batch tensor (one entry per review).
X = torch.stack(list(train_data['padded_vectors']))
# Integer class ids as a tensor, aligned row-for-row with X.
y = torch.tensor(train_data['label'].to_numpy())

### TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 5,000 features. The vectorizer
# is fitted on the raw `message` text because it lowercases and removes
# English stop words itself.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), lowercase=True)
# The training frame in this notebook is `train_data`; the previous name `train`
# is never defined, so the cell raised NameError on a fresh kernel.
X_train = vectorizer.fit_transform(train_data['message'])

print(X_train.shape)

(25000, 5000)


In [None]:
# Inspect the matrix: each printed line is "(row, column)  weight" for one stored entry.
print(X_train)

  (0, 3867)	0.24998872545018597
  (0, 2912)	0.11242199684392602
  (0, 3060)	0.11927197024321219
  (0, 4985)	0.18355799824437657
  (0, 823)	0.16809344373899543
  (0, 4786)	0.18993407919273902
  (0, 628)	0.2422105409252342
  (0, 2899)	0.4412603841129476
  (0, 4802)	0.11037667839688935
  (0, 4784)	0.17858376429652753
  (0, 4867)	0.15373758799338144
  (0, 4280)	0.17687764294688155
  (0, 4940)	0.13281789522993748
  (0, 1631)	0.06024820855862076
  (0, 2542)	0.10119673959126436
  (0, 288)	0.1921119299981152
  (0, 801)	0.22849793750841113
  (0, 4227)	0.16953796180828518
  (0, 4974)	0.10351508061438297
  (0, 1028)	0.18627142632603821
  (0, 4229)	0.23101054228387058
  (0, 3869)	0.19210182023899758
  (0, 3061)	0.18640875225904535
  (0, 4986)	0.242499998479016
  (0, 4941)	0.20947163233508553
  :	:
  (24998, 3241)	0.22383527690408916
  (24998, 3048)	0.20681969959007346
  (24998, 3049)	0.21747283241125134
  (24998, 3686)	0.2001612762260745
  (24998, 3820)	0.20659747344733098
  (24998, 4343)	0.201089