In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

UNKNOWN_TOKEN='UNK'
PAD_TOKEN='PAD'
START_TOKEN = '<START>'
END_TOKEN = '<END>'


In [2]:
train_data=pd.read_csv('./train.csv')
test_data=pd.read_csv('./test.csv')


In [39]:
def preprocess_text(data,type='train'):
    sentences=[]
    vocab=set()
    vocab.add(PAD_TOKEN)
    vocab.add(UNKNOWN_TOKEN)
    total=0

    frequency=dict()
    for text in data:
        text = re.sub(r'[^\w\s\n]', ' ', str(text).lower())
        words = word_tokenize(text)
        words = [START_TOKEN] + words + [END_TOKEN]
        sentences.append(words)
        for word in words:
            frequency[word]=frequency.get(word,0)+1
            total+=1
    
    if type=='train':
        frequency_threshold=3
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                if frequency[sentences[i][j]]<frequency_threshold:
                    sentences[i][j]=UNKNOWN_TOKEN

    for sentence in sentences:
        for word in sentence:
            vocab.add(word)
    vocab=list(vocab)
    return sentences,vocab

In [40]:
sentences_train,vocab = preprocess_text(train_data['Description'])
sentences_test,_ = preprocess_text(test_data['Description'],'test')

In [41]:
print(len(vocab))

32009


In [42]:
# word2ind = {}
# ind2word = {}

# for ind, word in enumerate(vocab):
#     word2ind[word] = ind
#     ind2word[ind] = word

# for i in range(len(sentences_train)):
#     sentences_train[i]=[word2ind[word] for word in sentences_train[i]]

In [43]:
# def get_pairs(corpus,window_size=5):
#     word_pairs=[]
#     for sentence in corpus:
#         for i in range(len(sentence)):
#             for j in range(max(0,i-window_size),min(i+window_size+1,len(sentence))):
#                 if(i!=j):
#                     word_pairs.append((sentence[i],sentence[j]))
#     return word_pairs

In [44]:
import numpy as np
from collections import defaultdict
import random
    
class SkipGramNegativeSampling:
    def __init__(self, corpus, vector_size=300, window_size=5, negative_samples=5, learning_rate=0.025):
        self.corpus = corpus
        self.vector_size = vector_size
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.learning_rate = learning_rate
        self.word2id = {}
        self.id2word = {}
        self.vocab_size = 0
        self.word_count = defaultdict(int)
        self.word_pairs = []
        self.initialize()

    def initialize(self):
        for sentence in self.corpus:
            for word in sentence:
                self.word_count[word] += 1
        sorted_vocab = sorted(self.word_count, key=self.word_count.get, reverse=True)
        for i, word in enumerate(sorted_vocab):
            self.word2id[word] = i
            self.id2word[i] = word
        self.vocab_size = len(self.word2id)

        threshold=1e-5
        negative_subsamples=[]
        for word, freq in self.word_count.items():
            keep_prob = (np.sqrt(freq / threshold) + 1) * (threshold / freq)
            if random.random() < keep_prob:
                negative_subsamples.extend([word] * self.word_count[word])

    def similarity(self, vec1, vec2):
        dot_product = np.dot(vec1, vec2)
        return dot_product
    

    def generate_word_pairs(self):
        word_pairs=set()
        for sentence in self.corpus:
            for i, target_word in enumerate(sentence):
                target_word_id = self.word2id[target_word]
                for j in range(i - self.window_size, i + self.window_size + 1):
                    if j != i and j >= 0 and j < len(sentence):
                        context_word = sentence[j]
                        context_word_id = self.word2id[context_word]
                        word_pairs.add((target_word_id, context_word_id))
        self.word_pairs=list(word_pairs)
        random.shuffle(self.word_pairs)

    def train(self, epochs=5):
        self.generate_word_pairs()
        self.W = np.random.uniform(-0.5/self.vector_size, 0.5/self.vector_size, (self.vocab_size, self.vector_size))
        self.C = np.random.uniform(-0.5/self.vector_size, 0.5/self.vector_size, (self.vocab_size, self.vector_size))

        for epoch in range(epochs):
            loss = 0
            for target_word_id, context_word_id in self.word_pairs:
                loss += self.train_pair(target_word_id, context_word_id)
            print(f"Epoch {epoch + 1}: Loss = {loss / len(self.word_pairs)}")
    
    def train_pair(self, target_word_id, context_word_id):
        target_vector = self.W[target_word_id] # w
        context_vector = self.C[context_word_id] # c_pos

        negative_samples = random.choices(range(self.vocab_size), k=self.negative_samples)

        similarity_score = self.similarity(target_vector, context_vector) # w.c_pos
        sigmoid_score = self.sigmoid(similarity_score) # sigmoid(w.c_pos)
        gradients = (sigmoid_score - 1) * context_vector 
        self.C[context_word_id]-=self.learning_rate *(sigmoid_score - 1) * target_vector # C^(t+1)_pos=C^(t)_pos-lr*(sigmoid(c_pos.w)-1)*w
        loss=-np.log(sigmoid_score+1e-10) # log(sigmoid(c_pos.w))

        for sample_word_id in negative_samples :
            sample_vector = self.C[sample_word_id] # c_neg
            similarity_score = self.similarity(target_vector, sample_vector) # w.c_neg
            sigmoid_score = self.sigmoid(similarity_score) # sigmoid(w.c_neg)
            gradients += sigmoid_score * sample_vector
            self.C[sample_word_id]-=self.learning_rate*sigmoid_score*target_vector # C^(t+1)_neg=C^(t)_neg-lr*sigmoid(c_neg.w)*w
            loss+=-np.log(1-sigmoid_score+1e-10) # log(1-sigmoid(c_neg.w))
        
        self.W[target_word_id] -= self.learning_rate * gradients
        return loss
    
    def get_word_vector(self, word):
        if word in self.word2id:
            return self.W[self.word2id[word]]
        else:
            return self.W[self.word2id[UNKNOWN_TOKEN]]

    def sigmoid(self,z):
        return 1 / (1 + np.exp(-z))

corpus=sentences_train
sg_model = SkipGramNegativeSampling(corpus)
sg_model.train(epochs=10)


Epoch 1: Loss = 2.5585875701939442
Epoch 2: Loss = 1.8237238593426548
Epoch 3: Loss = 1.786290263232018
Epoch 4: Loss = 1.7433882371003326
Epoch 5: Loss = 1.7090182398404352
Epoch 6: Loss = 1.6788035031284985
Epoch 7: Loss = 1.646608303452957
Epoch 8: Loss = 1.608922379434993
Epoch 9: Loss = 1.5642176757106727
Epoch 10: Loss = 1.514336836986116


In [45]:
sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]

In [46]:
length_sentence=length_95th_percentile

X_train = []
for sentence in sentences_train:
    sentence_embedding = [sg_model.get_word_vector(word) for word in sentence]
    if len(sentence)<length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[sg_model.get_word_vector(PAD_TOKEN)])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

In [47]:
X_test = []
for sentence in sentences_test:
    sentence_embedding = [sg_model.get_word_vector(word) for word in sentence]
    if len(sentence)<length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[sg_model.get_word_vector(PAD_TOKEN)])
    if sentence_embedding:
        # sentence_embedding = np.array(sentence_embedding[:length_sentence]) 
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values


In [48]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :]) 
        return output

In [49]:
input_size = 300  
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)


In [50]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")


Epoch 1/10, Loss: 0.27671459317207336
Epoch 2/10, Loss: 0.16225211322307587
Epoch 3/10, Loss: 0.14152804017066956
Epoch 4/10, Loss: 0.26288023591041565
Epoch 5/10, Loss: 0.06347747147083282
Epoch 6/10, Loss: 0.1854410469532013
Epoch 7/10, Loss: 0.025034695863723755
Epoch 8/10, Loss: 0.08606728911399841
Epoch 9/10, Loss: 0.10629647970199585
Epoch 10/10, Loss: 0.1343078911304474


In [62]:
from sklearn.metrics import precision_score, recall_score, f1_score

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)

Accuracy: 0.9055263157894737
Precision: 0.9057442508427023
Recall: 0.9055263157894737
F1 Score: 0.9055254208469927


In [58]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_true = []
y_pred = []
correct_samples=0
total_samples=0
for batch_X, batch_y in train_loader:
    outputs = model(batch_X)
    _, predicted = torch.max(outputs, 1)
    y_true.extend(torch.argmax(batch_y, dim=1).numpy())
    y_pred.extend(predicted.numpy())

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print("Accuracy:",accuracy)
print("Precision:", precision)
print("Recall:",recall)
print("F1 Score:",f1)


In [3]:
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn
from collections import defaultdict
import random
import re
from nltk.tokenize import word_tokenize

PAD_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNKNOWN>'
START_TOKEN = '<START>'
END_TOKEN = '<END>'

def preprocess_text(data,type='train'):
    sentences=[]
    frequency=dict()
    for text in data:
        text = re.sub(r'[^\w\s\n]', ' ', str(text).lower())
        words = word_tokenize(text)
        words = [START_TOKEN] + words + [END_TOKEN]
        sentences.append(words)
        for word in words:
            frequency[word]=frequency.get(word,0)+1
    
    if type=='train':
        frequency_threshold=3
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                if frequency[sentences[i][j]]<frequency_threshold:
                    sentences[i][j]=UNKNOWN_TOKEN
    return sentences

class SkipGramNegativeSampling(nn.Module):
    def __init__(self, corpus, vector_size=300, window_size=5, negative_samples=5, learning_rate=0.025):
        self.corpus = corpus
        self.vector_size = vector_size
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.learning_rate = learning_rate
        self.word2id = {}
        self.id2word = {}
        self.vocab_size = 0
        self.word_count = defaultdict(int)
        self.word_pairs = []
        self.vocab=set()
        self.initialize()

    def initialize(self):
        for sentence in self.corpus:
            for word in sentence:
                self.word_count[word] += 1
                self.vocab.add(word)
        self.vocab.add(PAD_TOKEN)
        self.word_count[PAD_TOKEN]=1
        words = list(self.vocab)
        words.sort()
        for i, word in enumerate(words):
            self.word2id[word] = i
            self.id2word[i] = word
        self.vocab_size = len(self.vocab)

    def similarity(self, vec1, vec2):
        dot_product = np.dot(vec1, vec2)
        return dot_product
    
    def sigmoid(self,z):
        return 1 / (1 + np.exp(-z))

    def generate_word_pairs(self):
        word_pairs=set()
        for sentence in self.corpus:
            for i, target_word in enumerate(sentence):
                target_word_id = self.word2id[target_word]
                for j in range(i - self.window_size, i + self.window_size + 1):
                    if j != i and j >= 0 and j < len(sentence):
                        context_word = sentence[j]
                        context_word_id = self.word2id[context_word]
                        word_pairs.add((target_word_id, context_word_id))
        self.word_pairs=list(word_pairs)
        random.shuffle(self.word_pairs)

    def train(self, epochs=5):
        self.generate_word_pairs()
        self.W = np.random.uniform(-0.5/self.vector_size, 0.5/self.vector_size, (self.vocab_size, self.vector_size))
        self.C = np.random.uniform(-0.5/self.vector_size, 0.5/self.vector_size, (self.vocab_size, self.vector_size))
        for epoch in range(epochs):
            loss = 0
            for target_word_id, context_word_id in self.word_pairs:
                loss += self.train_pair(target_word_id, context_word_id)
            print(f"Epoch {epoch + 1}: Loss = {loss / len(self.word_pairs)}")

    def train_pair(self, target_word_id, context_word_id):
        target_vector = self.W[target_word_id] # w
        context_vector = self.C[context_word_id] # c_pos

        negative_samples = random.choices(range(self.vocab_size), k=self.negative_samples)

        similarity_score = self.similarity(target_vector, context_vector) # w.c_pos
        sigmoid_score = self.sigmoid(similarity_score) # sigmoid(w.c_pos)
        gradients = (sigmoid_score - 1) * context_vector 
        self.C[context_word_id]-=self.learning_rate *(sigmoid_score - 1) * target_vector # C^(t+1)_pos=C^(t)_pos-lr*(sigmoid(c_pos.w)-1)*w
        loss=-np.log(sigmoid_score+1e-10) # log(sigmoid(c_pos.w))

        for sample_word_id in negative_samples :
            sample_vector = self.C[sample_word_id] # c_neg
            similarity_score = self.similarity(target_vector, sample_vector) # w.c_neg
            sigmoid_score = self.sigmoid(similarity_score) # sigmoid(w.c_neg)
            gradients += sigmoid_score * sample_vector
            self.C[sample_word_id]-=self.learning_rate*sigmoid_score*target_vector # C^(t+1)_neg=C^(t)_neg-lr*sigmoid(c_neg.w)*w
            loss+=-np.log(1-sigmoid_score+1e-10) # log(1-sigmoid(c_neg.w))
        
        self.W[target_word_id] -= self.learning_rate * gradients
        return loss

In [4]:
from prettytable import PrettyTable

table = PrettyTable()
table.field_names = ["Context Window Size", "Accuracy", "Precision", "Recall", "F1_Score","Confusion Matrix"]

import warnings
warnings.filterwarnings("ignore")

In [5]:
train_data=pd.read_csv('./train.csv')
sentences_train = preprocess_text(train_data['Description'])
sg_model = SkipGramNegativeSampling(sentences_train,window_size=2)
sg_model.train(epochs=10)
word_vectors_sg=sg_model.W
word_index_sg=sg_model.word2id

test_data = pd.read_csv('./test.csv')

sentences_train= preprocess_text(train_data['Description'])
sentences_test = preprocess_text(test_data['Description'],'test')

sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]
length_sentence = length_95th_percentile

X_train = []
for sentence in sentences_train:
    sentence_embedding = [word_vectors_sg[word_index_sg[word]] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_sg[word_index_sg[PAD_TOKEN]]])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

X_test = []
for sentence in sentences_test:
    sentence_embedding = [word_vectors_sg[word_index_sg.get(word, word_index_sg[UNKNOWN_TOKEN])] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_sg[word_index_sg[PAD_TOKEN]]])
    if sentence_embedding:
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :]) 
        return output
    
input_size = 300  
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    correct_samples=0
    total_samples=0
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(torch.argmax(batch_y, dim=1).numpy())
        y_pred.extend(predicted.numpy())
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Train Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)
    print()

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Test Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)

table.add_row([2,accuracy,precision,recall,f1,cm])

Epoch 1: Loss = 3.0768120968836263
Epoch 2: Loss = 1.846027502287442
Epoch 3: Loss = 1.7637964924873073
Epoch 4: Loss = 1.7382735792856723
Epoch 5: Loss = 1.706115747239165
Epoch 6: Loss = 1.6677122976852459
Epoch 7: Loss = 1.6317361879417591
Epoch 8: Loss = 1.5953168564744111
Epoch 9: Loss = 1.5557750605671734
Epoch 10: Loss = 1.511397127347483
Epoch 1/10, Loss: 0.661766529083252
Epoch 2/10, Loss: 0.1255021095275879
Epoch 3/10, Loss: 0.1881602704524994
Epoch 4/10, Loss: 0.3539079427719116
Epoch 5/10, Loss: 0.17351451516151428
Epoch 6/10, Loss: 0.3443686068058014
Epoch 7/10, Loss: 0.16960179805755615
Epoch 8/10, Loss: 0.07058196514844894
Epoch 9/10, Loss: 0.06119907274842262
Epoch 10/10, Loss: 0.17157040536403656
Train Set:
Accuracy: 0.9497416666666667
Precision: 0.9508833645462933
Recall: 0.9497416666666667
F1 Score: 0.9496591164105659
Confusion Matrix: [[28438   304   355   903]
 [  103 29787    20    90]
 [  699   229 26778  2294]
 [  267    74   693 28966]]

Test Set:
Accuracy: 0.8

In [6]:
train_data=pd.read_csv('./train.csv')
sentences_train = preprocess_text(train_data['Description'])
sg_model = SkipGramNegativeSampling(sentences_train,window_size=5)
sg_model.train(epochs=10)
word_vectors_sg=sg_model.W
word_index_sg=sg_model.word2id

test_data = pd.read_csv('./test.csv')

sentences_train= preprocess_text(train_data['Description'])
sentences_test = preprocess_text(test_data['Description'],'test')

sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]
length_sentence = length_95th_percentile

X_train = []
for sentence in sentences_train:
    sentence_embedding = [word_vectors_sg[word_index_sg[word]] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_sg[word_index_sg[PAD_TOKEN]]])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

X_test = []
for sentence in sentences_test:
    sentence_embedding = [word_vectors_sg[word_index_sg.get(word, word_index_sg[UNKNOWN_TOKEN])] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_sg[word_index_sg[PAD_TOKEN]]])
    if sentence_embedding:
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :]) 
        return output
    
input_size = 300  
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    correct_samples=0
    total_samples=0
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(torch.argmax(batch_y, dim=1).numpy())
        y_pred.extend(predicted.numpy())
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Train Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)
    print()

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Test Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)

table.add_row([10,accuracy,precision,recall,f1,cm])

Epoch 1: Loss = 2.5658881703760787
Epoch 2: Loss = 1.8236889297626786
Epoch 3: Loss = 1.786828297251044
Epoch 4: Loss = 1.7432736420894277
Epoch 5: Loss = 1.7096412387247542
Epoch 6: Loss = 1.679041613918702
Epoch 7: Loss = 1.6467564590376735
Epoch 8: Loss = 1.6090533694581557
Epoch 9: Loss = 1.5642193514899343
Epoch 10: Loss = 1.5135694702942455
Epoch 1/10, Loss: 0.15511079132556915
Epoch 2/10, Loss: 0.1891205757856369
Epoch 3/10, Loss: 0.06072907894849777
Epoch 4/10, Loss: 0.1574212610721588
Epoch 5/10, Loss: 0.19283118844032288
Epoch 6/10, Loss: 0.15719085931777954
Epoch 7/10, Loss: 0.12699761986732483
Epoch 8/10, Loss: 0.08932410925626755
Epoch 9/10, Loss: 0.032409679144620895
Epoch 10/10, Loss: 0.21677207946777344
Train Set:
Accuracy: 0.9677916666666667
Precision: 0.9691314022270019
Recall: 0.9677916666666667
F1 Score: 0.967825186834218
Confusion Matrix: [[29103   136   271   490]
 [   80 29842    22    56]
 [  232    36 27540  2192]
 [  128    23   199 29650]]

Test Set:
Accuracy

In [7]:
train_data=pd.read_csv('./train.csv')
sentences_train = preprocess_text(train_data['Description'])
sg_model = SkipGramNegativeSampling(sentences_train,window_size=10)
sg_model.train(epochs=10)
word_vectors_sg=sg_model.W
word_index_sg=sg_model.word2id

test_data = pd.read_csv('./test.csv')

sentences_train= preprocess_text(train_data['Description'])
sentences_test = preprocess_text(test_data['Description'],'test')

sentence_lengths = [len(sentence) for sentence in sentences_train]
sorted_lengths = sorted(sentence_lengths)
index_95th_percentile = int(np.percentile(range(len(sorted_lengths)), 95))
length_95th_percentile = sorted_lengths[index_95th_percentile]
length_sentence = length_95th_percentile

X_train = []
for sentence in sentences_train:
    sentence_embedding = [word_vectors_sg[word_index_sg[word]] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_sg[word_index_sg[PAD_TOKEN]]])
    if sentence_embedding:
        X_train.append((sentence_embedding[:length_sentence]))
y_train = pd.get_dummies(train_data['Class Index'], prefix='value', dtype=int).values

X_test = []
for sentence in sentences_test:
    sentence_embedding = [word_vectors_sg[word_index_sg.get(word, word_index_sg[UNKNOWN_TOKEN])] for word in sentence]
    if len(sentence) < length_sentence:
        padding_needed = length_sentence - len(sentence)
        sentence_embedding.extend(padding_needed*[word_vectors_sg[word_index_sg[PAD_TOKEN]]])
    if sentence_embedding:
        X_test.append((sentence_embedding[:length_sentence]))
y_test = pd.get_dummies(test_data['Class Index'], prefix='value', dtype=int).values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output[:, -1, :]) 
        return output
    
input_size = 300  
hidden_size = 128
output_size = 4
model = RNNClassifier(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    correct_samples=0
    total_samples=0
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(torch.argmax(batch_y, dim=1).numpy())
        y_pred.extend(predicted.numpy())
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Train Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)
    print()

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    y_true=torch.argmax(y_test_tensor, dim=1).numpy()
    y_pred=predicted.numpy()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print("Test Set:")
    print("Accuracy:",accuracy)
    print("Precision:", precision)
    print("Recall:",recall)
    print("F1 Score:",f1)
    print("Confusion Matrix:",cm)

table.add_row([5,accuracy,precision,recall,f1,cm])

Epoch 1: Loss = 2.37532437156803
Epoch 2: Loss = 1.8549118276662764
Epoch 3: Loss = 1.8025203860644239
Epoch 4: Loss = 1.765109287291478
Epoch 5: Loss = 1.734151278843685
Epoch 6: Loss = 1.703118658300197
Epoch 7: Loss = 1.6681055772703846
Epoch 8: Loss = 1.62701956210864
Epoch 9: Loss = 1.5812277136384694
Epoch 10: Loss = 1.5341814429360627
Epoch 1/10, Loss: 0.5076616406440735
Epoch 2/10, Loss: 0.5257648229598999
Epoch 3/10, Loss: 0.29216882586479187
Epoch 4/10, Loss: 0.10397431999444962
Epoch 5/10, Loss: 0.27815452218055725
Epoch 6/10, Loss: 0.0899357795715332
Epoch 7/10, Loss: 0.057357918471097946
Epoch 8/10, Loss: 0.05821150168776512
Epoch 9/10, Loss: 0.11028043925762177
Epoch 10/10, Loss: 0.05545838177204132
Train Set:
Accuracy: 0.9737083333333333
Precision: 0.9742444474359087
Recall: 0.9737083333333333
F1 Score: 0.9737964490106757
Confusion Matrix: [[29020    89   565   326]
 [   97 29754   108    41]
 [   54    18 29441   487]
 [   85    14  1271 28630]]

Test Set:
Accuracy: 0.9

In [39]:
print(table)

+---------------------+--------------------+--------------------+--------------------+--------------------+-------------------------+
| Context Window Size |      Accuracy      |     Precision      |       Recall       |      F1_Score      |     Confusion Matrix    |
+---------------------+--------------------+--------------------+--------------------+--------------------+-------------------------+
|          2          | 0.8953947368421052 | 0.8969339297630881 | 0.8953947368421052 | 0.8952788674747271 |  [[1701   45   54  100] |
|                     |                    |                    |                    |                    |   [  26 1835   14   25] |
|                     |                    |                    |                    |                    |   [  82   21 1560  237] |
|                     |                    |                    |                    |                    |  [  58   28  105 1709]] |
|          5          | 0.901578947368421  | 0.903344678407274