In [18]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

UNKNOWN_TOKEN='<UNK>'
PAD_TOKEN='<PAD>'
START_TOKEN='<START>'
END_TOKEN='<END>'


In [2]:
train_data=pd.read_csv('./train.csv')
test_data=pd.read_csv('./test.csv')

In [3]:
def preprocess_text(data,type='train'):
    sentences=[]
    vocab=set()
    vocab.add(PAD_TOKEN)
    vocab.add(UNKNOWN_TOKEN)

    frequency=dict()
    for text in data:
        text = re.sub(r'[^\w\s\n]', ' ', str(text).lower())
        words = word_tokenize(text)
        words=[START_TOKEN]+words+[END_TOKEN]
        sentences.append(words)
        for word in words:
            frequency[word]=frequency.get(word,0)+1
    
    if type=='train':
        frequency_threshold=3
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                if frequency[sentences[i][j]]<frequency_threshold:
                    sentences[i][j]=UNKNOWN_TOKEN

    for sentence in sentences:
        for word in sentence:
            vocab.add(word)

    return sentences,vocab

In [4]:
def build_co_occurrence_matrix(tokens_list, window_size=5):
    co_occurrence_matrix = defaultdict(int)
    for tokens in tokens_list:
        for i in range(len(tokens)):
            for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
                if i != j:
                    co_occurrence_matrix[(tokens[i], tokens[j])] += 1
    return co_occurrence_matrix

In [5]:
def apply_svd(co_matrix,vocab,vector_size=300):
    words = vocab
    word_index = {word: i for i, word in enumerate(words)}    
    rows, cols, data = [], [], []
    for (word1, word2), count in co_matrix.items():
        rows.append(word_index[word1])
        cols.append(word_index[word2])
        data.append(count)
    co_occurrence_matrix = csr_matrix((data, (rows, cols)), shape=(len(words), len(words)))
    svd = TruncatedSVD(n_components=vector_size)
    word_vectors_svd = svd.fit_transform(co_occurrence_matrix)
    return word_vectors_svd

In [6]:
sentences_train,vocab = preprocess_text(train_data['Description'])
sentences_test,_ = preprocess_text(test_data['Description'],'test')
co_occurrence_matrix = build_co_occurrence_matrix(sentences_train)
word_vectors_svd = apply_svd(co_occurrence_matrix,vocab)

In [7]:
sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]

In [8]:
word_index = {word: i for i, word in enumerate(vocab)}    

length_sentence=length_95th_percentile

X_train = []
for sentence in sentences_train:
    sentence_embedding = [word_vectors_svd[word_index[word]] for word in sentence]
    if len(sentence)<length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index[PAD_TOKEN]]])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

In [9]:
X_test = []
for sentence in sentences_test:
    sentence_embedding = [word_vectors_svd[word_index.get(word,word_index[UNKNOWN_TOKEN])] for word in sentence]
    if len(sentence)<length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index[PAD_TOKEN]]])
    if sentence_embedding:
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values


In [10]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :]) 
        return output

In [11]:
input_size = 300  
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)


In [12]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")


Epoch 1/10, Loss: 0.37698838114738464
Epoch 2/10, Loss: 0.335050106048584
Epoch 3/10, Loss: 0.3184419870376587
Epoch 4/10, Loss: 0.22597351670265198
Epoch 5/10, Loss: 0.43549844622612
Epoch 6/10, Loss: 0.3450637757778168
Epoch 7/10, Loss: 0.34369587898254395
Epoch 8/10, Loss: 0.09720759838819504
Epoch 9/10, Loss: 0.4278640151023865
Epoch 10/10, Loss: 0.11683081090450287


In [13]:
print(X_train_tensor.size())
print(y_train_tensor.size())
print(X_test_tensor.size())
print(y_test_tensor.size())


torch.Size([120000, 53, 300])
torch.Size([120000, 4])
torch.Size([7600, 53, 300])
torch.Size([7600, 4])


In [14]:
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(torch.argmax(y_test_tensor, dim=1).numpy(), predicted.numpy())
    print("Accuracy:", accuracy)

Accuracy: 0.8888157894736842


In [3]:
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

PAD_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNKNOWN>'
START_TOKEN = '<START>'
END_TOKEN = '<END>'

class SVDWordEmbeddings:
    def __init__(self, data, frequency_threshold=3, window_size=5, vector_size=300):
        self.data = data
        self.frequency_threshold = frequency_threshold
        self.window_size = window_size
        self.vector_size = vector_size
        self.vocab = set()
        self.vocab.add(PAD_TOKEN)
        self.vocab.add(UNKNOWN_TOKEN)
        self.sentences = []
        self.frequency = defaultdict(int)
        self.co_occurrence_matrix = defaultdict(int)
        self.word_vectors = None

    def preprocess_text(self, type='train'):
        for text in self.data:
            text = re.sub(r'[^\w\s\n]', ' ', str(text).lower())
            words = word_tokenize(text)
            words = [START_TOKEN] + words + [END_TOKEN]
            self.sentences.append(words)
            for word in words:
                self.frequency[word] += 1

        if(type == 'train'):
            for i in range(len(self.sentences)):
                for j in range(len(self.sentences[i])):
                    if self.frequency[self.sentences[i][j]] < self.frequency_threshold:
                        self.sentences[i][j] = UNKNOWN_TOKEN

        for sentence in self.sentences:
            for word in sentence:
                self.vocab.add(word)

    def build_co_occurrence_matrix(self):
        for tokens in self.sentences:
            for i in range(len(tokens)):
                for j in range(max(0, i - self.window_size), min(len(tokens), i + self.window_size + 1)):
                    if i != j:
                        self.co_occurrence_matrix[(tokens[i], tokens[j])] += 1

    def apply_svd(self):
        words = list(self.vocab)
        self.word_index = {word: i for i, word in enumerate(words)}
        rows, cols, data = [], [], []
        for (word1, word2), count in self.co_occurrence_matrix.items():
            rows.append(self.word_index[word1])
            cols.append(self.word_index[word2])
            data.append(count)
        co_occurrence_matrix = csr_matrix((data, (rows, cols)), shape=(len(words), len(words)))
        svd = TruncatedSVD(n_components=self.vector_size)
        self.word_vectors = svd.fit_transform(co_occurrence_matrix)

    def fit(self):
        self.preprocess_text()
        self.build_co_occurrence_matrix()
        self.apply_svd()


In [6]:
embeddings_model_train = SVDWordEmbeddings(train_data['Description'])
embeddings_model_train.fit()
sentences_train=embeddings_model_train.sentences
word_index=embeddings_model_train.word_index
word_vectors_svd=embeddings_model_train.word_vectors

In [7]:
embeddings_model_test = SVDWordEmbeddings(test_data['Description'])
embeddings_model_test.preprocess_text('test')
sentences_test=embeddings_model_test.sentences


In [8]:
sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]
length_sentence=length_95th_percentile

In [9]:
X_train = []
for sentence in sentences_train:
    sentence_embedding = [word_vectors_svd[word_index[word]] for word in sentence]
    if len(sentence)<length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index[PAD_TOKEN]]])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

In [10]:
X_test = []
for sentence in sentences_test:
    sentence_embedding = [word_vectors_svd[word_index.get(word,word_index[UNKNOWN_TOKEN])] for word in sentence]
    if len(sentence)<length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index[PAD_TOKEN]]])
    if sentence_embedding:
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values


In [11]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :]) 
        return output

In [12]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)


In [13]:
input_size = 300
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

Epoch 1/10, Loss: 0.20513449609279633
Epoch 2/10, Loss: 0.3971790075302124
Epoch 3/10, Loss: 0.23672860860824585
Epoch 4/10, Loss: 0.32392191886901855
Epoch 5/10, Loss: 0.6454268097877502
Epoch 6/10, Loss: 0.19698625802993774
Epoch 7/10, Loss: 0.22427421808242798
Epoch 8/10, Loss: 0.4747859835624695
Epoch 9/10, Loss: 0.27186471223831177
Epoch 10/10, Loss: 0.4595867693424225


In [14]:
model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    correct_samples=0
    total_samples=0
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(torch.argmax(batch_y, dim=1).numpy())
        y_pred.extend(predicted.numpy())
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("Train Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print()

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("Test Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)

Train Set:
Accuracy: 0.9077416666666667
Precision: 0.9079916119093905
Recall: 0.9077416666666667
F1 Score: 0.907734608689379

Test Set:
Accuracy: 0.8886842105263157
Precision: 0.8888566575936085
Recall: 0.8886842105263157
F1 Score: 0.8886744455785405


In [None]:
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("Test Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)


0.90515


In [25]:
class SVDWordEmbeddings(nn.Module):
    def __init__(self, data, frequency_threshold=3, window_size=5, vector_size=300):
        self.data = data
        self.frequency_threshold = frequency_threshold
        self.window_size = window_size
        self.vector_size = vector_size
        self.vocab = set()
        self.vocab.add(PAD_TOKEN)
        self.vocab.add(UNKNOWN_TOKEN)
        self.sentences = []
        self.frequency = defaultdict(int)
        self.co_occurrence_matrix = defaultdict(int)
        self.word_vectors = None

    def preprocess_text(self, type='train'):
        for text in self.data:
            text = re.sub(r'[^\w\s\n]', ' ', str(text).lower())
            words = word_tokenize(text)
            words = [START_TOKEN] + words + [END_TOKEN]
            self.sentences.append(words)
            for word in words:
                self.frequency[word] += 1

        if (type == 'train'):
            for i in range(len(self.sentences)):
                for j in range(len(self.sentences[i])):
                    if self.frequency[self.sentences[i][j]] < self.frequency_threshold:
                        self.sentences[i][j] = UNKNOWN_TOKEN

        for sentence in self.sentences:
            for word in sentence:
                self.vocab.add(word)

    def build_co_occurrence_matrix(self):
        for tokens in self.sentences:
            for i in range(len(tokens)):
                for j in range(max(0, i - self.window_size), min(len(tokens), i + self.window_size + 1)):
                    if i != j:
                        self.co_occurrence_matrix[(tokens[i], tokens[j])] += 1

    def apply_svd(self):
        words = list(self.vocab)
        words.sort()
        self.word_index = {word: i for i, word in enumerate(words)}
        rows, cols, data = [], [], []
        for (word1, word2), count in self.co_occurrence_matrix.items():
            rows.append(self.word_index[word1])
            cols.append(self.word_index[word2])
            data.append(count)
        co_occurrence_matrix = csr_matrix(
            (data, (rows, cols)), shape=(len(words), len(words)))
        svd = TruncatedSVD(n_components=self.vector_size)
        self.word_vectors = svd.fit_transform(co_occurrence_matrix)

    def fit(self):
        self.preprocess_text()
        self.build_co_occurrence_matrix()
        self.apply_svd()


In [26]:
from prettytable import PrettyTable

table = PrettyTable()
table.field_names = ["Context Window Size", "Accuracy", "Precision", "Recall", "F1_Score","Confusion Matrix"]


In [27]:
import warnings
warnings.filterwarnings("ignore")

In [28]:
embeddings_model_train = SVDWordEmbeddings(train_data['Description'],window_size=2)
embeddings_model_train.fit()
sentences_train = embeddings_model_train.sentences

word_vectors_svd=embeddings_model_train.word_vectors
word_index_svd=embeddings_model_train.word_index

embeddings_model_test = SVDWordEmbeddings(test_data['Description'],window_size=2)
embeddings_model_test.preprocess_text('test')
sentences_test = embeddings_model_test.sentences

sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]
length_sentence = length_95th_percentile

X_train = []
for sentence in sentences_train:
    sentence_embedding = [word_vectors_svd[word_index_svd[word]] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index_svd[PAD_TOKEN]]])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

X_test = []
for sentence in sentences_test:
    sentence_embedding = [word_vectors_svd[word_index_svd.get(word, word_index_svd[UNKNOWN_TOKEN])] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index_svd[PAD_TOKEN]]])
    if sentence_embedding:
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :])
        return output

input_size = 300
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    correct_samples=0
    total_samples=0
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(torch.argmax(batch_y, dim=1).numpy())
        y_pred.extend(predicted.numpy())
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Train Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)
    print()

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Test Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)

table.add_row([2,accuracy,precision,recall,f1,cm])


Epoch 1/10, Loss: 0.37380844354629517
Epoch 2/10, Loss: 0.3816951513290405
Epoch 3/10, Loss: 0.21368353068828583
Epoch 4/10, Loss: 0.3267323970794678
Epoch 5/10, Loss: 0.27925753593444824
Epoch 6/10, Loss: 0.4329754710197449
Epoch 7/10, Loss: 0.3113632798194885
Epoch 8/10, Loss: 0.2926386296749115
Epoch 9/10, Loss: 0.089718759059906
Epoch 10/10, Loss: 0.20039130747318268
Train Set:
Accuracy: 0.8882083333333334
Precision: 0.8913247614082769
Recall: 0.8882083333333334
F1 Score: 0.8883534000112535
Confusion Matrix: [[25969   920  1395  1716]
 [  423 28716   322   539]
 [  909   295 24467  4329]
 [  786   215  1566 27433]]

Test Set:
Accuracy: 0.8697368421052631
Precision: 0.8722722760788111
Recall: 0.8697368421052631
F1 Score: 0.86983440982138
Confusion Matrix: [[1608   78  107  107]
 [  37 1791   29   43]
 [  69   25 1517  289]
 [  61   16  129 1694]]


In [29]:
import warnings
warnings.filterwarnings("ignore")

In [30]:
embeddings_model_train = SVDWordEmbeddings(train_data['Description'],window_size=5)
embeddings_model_train.fit()
sentences_train = embeddings_model_train.sentences

word_vectors_svd=embeddings_model_train.word_vectors
word_index_svd=embeddings_model_train.word_index

embeddings_model_test = SVDWordEmbeddings(test_data['Description'],window_size=5)
embeddings_model_test.preprocess_text('test')
sentences_test = embeddings_model_test.sentences

sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]
length_sentence = length_95th_percentile

X_train = []
for sentence in sentences_train:
    sentence_embedding = [word_vectors_svd[word_index_svd[word]] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index_svd[PAD_TOKEN]]])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

X_test = []
for sentence in sentences_test:
    sentence_embedding = [word_vectors_svd[word_index_svd.get(word, word_index_svd[UNKNOWN_TOKEN])] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index_svd[PAD_TOKEN]]])
    if sentence_embedding:
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :])
        return output

input_size = 300
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    correct_samples=0
    total_samples=0
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(torch.argmax(batch_y, dim=1).numpy())
        y_pred.extend(predicted.numpy())
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Train Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)
    print()

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Test Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)

table.add_row([5,accuracy,precision,recall,f1,cm])


Epoch 1/10, Loss: 0.1591673493385315
Epoch 2/10, Loss: 0.22575244307518005
Epoch 3/10, Loss: 0.3940945267677307
Epoch 4/10, Loss: 0.058712538331747055
Epoch 5/10, Loss: 0.36005276441574097
Epoch 6/10, Loss: 0.5769515037536621
Epoch 7/10, Loss: 0.3973618745803833
Epoch 8/10, Loss: 0.1815149337053299
Epoch 9/10, Loss: 0.29045355319976807
Epoch 10/10, Loss: 0.1860528588294983
Train Set:
Accuracy: 0.9055083333333334
Precision: 0.906025028929943
Recall: 0.9055083333333334
F1 Score: 0.9053426428469515
Confusion Matrix: [[26899   750  1314  1037]
 [  460 29134   182   224]
 [ 1016   299 25332  3353]
 [  921   244  1539 27296]]

Test Set:
Accuracy: 0.8877631578947368
Precision: 0.8882956075505102
Recall: 0.8877631578947368
F1 Score: 0.8875985193227585
Confusion Matrix: [[1679   51   98   72]
 [  33 1818   30   19]
 [  72   28 1557  243]
 [  78   21  108 1693]]


In [36]:
embeddings_model_train = SVDWordEmbeddings(train_data['Description'],window_size=10)
embeddings_model_train.fit()
sentences_train = embeddings_model_train.sentences

word_vectors_svd=embeddings_model_train.word_vectors
word_index_svd=embeddings_model_train.word_index

embeddings_model_test = SVDWordEmbeddings(test_data['Description'],window_size=10)
embeddings_model_test.preprocess_text('test')
sentences_test = embeddings_model_test.sentences

sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]
length_sentence = length_95th_percentile

X_train = []
for sentence in sentences_train:
    sentence_embedding = [word_vectors_svd[word_index_svd[word]] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index_svd[PAD_TOKEN]]])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

X_test = []
for sentence in sentences_test:
    sentence_embedding = [word_vectors_svd[word_index_svd.get(word, word_index_svd[UNKNOWN_TOKEN])] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_svd[word_index_svd[PAD_TOKEN]]])
    if sentence_embedding:
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :])
        return output

input_size = 300
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    correct_samples=0
    total_samples=0
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(torch.argmax(batch_y, dim=1).numpy())
        y_pred.extend(predicted.numpy())
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Train Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)
    print()

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Test Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)

table.add_row([10,accuracy,precision,recall,f1,cm])


Epoch 1/10, Loss: 0.37004002928733826
Epoch 2/10, Loss: 0.4647955596446991
Epoch 3/10, Loss: 0.2004903256893158
Epoch 4/10, Loss: 0.14736691117286682
Epoch 5/10, Loss: 0.1522027999162674
Epoch 6/10, Loss: 0.4400908648967743
Epoch 7/10, Loss: 0.34749236702919006
Epoch 8/10, Loss: 0.25368332862854004
Epoch 9/10, Loss: 0.35846662521362305
Epoch 10/10, Loss: 0.29595857858657837
Train Set:
Accuracy: 0.9116333333333333
Precision: 0.9126153228840616
Recall: 0.9116333333333333
F1 Score: 0.9116187184646952
Confusion Matrix: [[26348   917  1729  1006]
 [  161 29510   211   118]
 [  573   216 26881  2330]
 [  635   269  2439 26657]]

Test Set:
Accuracy: 0.8957894736842106
Precision: 0.8970712150879531
Recall: 0.8957894736842106
F1 Score: 0.8959381792567103
Confusion Matrix: [[1647   59  116   78]
 [  17 1837   29   17]
 [  42   19 1669  170]
 [  49   23  173 1655]]


In [37]:
print(table)

+---------------------+--------------------+--------------------+--------------------+--------------------+-------------------------+
| Context Window Size |      Accuracy      |     Precision      |       Recall       |      F1_Score      |     Confusion Matrix    |
+---------------------+--------------------+--------------------+--------------------+--------------------+-------------------------+
|          2          | 0.8697368421052631 | 0.8722722760788111 | 0.8697368421052631 |  0.86983440982138  |  [[1608   78  107  107] |
|                     |                    |                    |                    |                    |   [  37 1791   29   43] |
|                     |                    |                    |                    |                    |   [  69   25 1517  289] |
|                     |                    |                    |                    |                    |  [  61   16  129 1694]] |
|          5          | 0.8877631578947368 | 0.888295607550510