In [193]:
import Instance
import Vocab
import EmbeddingMatrix
import NLPDataset
import Collate
import Frequencies
import torch
from torch import nn
import numpy as np
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

TRAIN_FILE_PATH="data/train_copn.csv"
VALID_FILE_PATH="data/valid_copn.csv"
TEST_FILE_PATH="data/test_copn.csv"
MAX_VOCAB_SIZE=-1
MIN_VOCAB_FREQ=1
EMBEDDINGS_PATH='data/sst_glove_6b_300d.txt'

In [194]:
class BaselineModel(nn.Module):

    def __init__(self, embeddings):

        super().__init__()

        self.embeddings=embeddings

        self.fc1=nn.Linear(300,500, bias=True)
        self.fc2=nn.Linear(500, 150, bias=True)
        self.fc_logits=nn.Linear(150,1, bias=True)

        #self.reset_parameters()

    def reset_parameters(self):
        for m in self.modules():
            if isinstance(m, nn.Linear) and m is not self.fc_logits:
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                nn.init.constant_(m.bias, 0)
        self.fc_logits.reset_parameters()

    def forward(self, x):

        x=self.embeddings(x)
        x=torch.mean(x, dim=1)

        h=self.fc1(x)
        h=torch.relu(h)
        h=self.fc2(h)
        h=torch.relu(h)
        
        return self.fc_logits(h).squeeze()


def train(model, train_dataloader, criterion, optimizer):

    model.train()

    for x, y, _ in train_dataloader:
        model.zero_grad()
        optimizer.zero_grad()

        y=y.float()
        logits=model.forward(x)

        loss=criterion(logits, y)
        loss.backward()
        optimizer.step()


def eval(model, eval_dataloader, criterion):

    model.eval()

    with torch.no_grad():

        predictions=torch.empty(0)
        labels=torch.empty(0)
        for x,y,_ in eval_dataloader:
            y=y.float()
            logits=model.forward(x)
            prediction=torch.where(logits>0, 1, 0)
            predictions=torch.cat((predictions, prediction), 0)

            labels=torch.cat((labels, y), 0)
        
        cm=confusion_matrix(labels, predictions)
        return cm, accuracy_score(labels, predictions), f1_score(labels, predictions, average="macro")

In [195]:
word_vocab, label_vocab=Frequencies.get_frequency_vocab(TRAIN_FILE_PATH)

word_vocab=Vocab.Vocab(word_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ, True)
label_vocab=Vocab.Vocab(label_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ)

train_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TRAIN_FILE_PATH)
train_dataloader=DataLoader(train_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

valid_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, VALID_FILE_PATH)
valid_dataloader=DataLoader(valid_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

test_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TEST_FILE_PATH)
test_dataloader=DataLoader(test_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

In [196]:
SEED=0

torch.manual_seed(SEED)
np.random.seed(SEED)

embeddings=EmbeddingMatrix.embedding_matrix(word_vocab, SEED, EMBEDDINGS_PATH, True)
model=BaselineModel(embeddings)

criterion=nn.BCEWithLogitsLoss()

optimizer=torch.optim.Adam(model.parameters(), lr=1e-4)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

for epoch in range(20):

    train(model, train_dataloader, criterion, optimizer)
    cm, acc, f1=eval(model, valid_dataloader, criterion)
    print(f"Epoch {epoch+1}: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

cm, acc, f1=eval(model, test_dataloader, criterion)
print(f"Test metrics: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

Epoch 1: confusion matrix:
 [[252   0]
 [241   0]]
accuracy: 0.5111561866125761
f1 score: 0.338255033557047


Epoch 2: confusion matrix:
 [[252   0]
 [241   0]]
accuracy: 0.5111561866125761
f1 score: 0.338255033557047


Epoch 3: confusion matrix:
 [[252   0]
 [241   0]]
accuracy: 0.5111561866125761
f1 score: 0.338255033557047


Epoch 4: confusion matrix:
 [[252   0]
 [241   0]]
accuracy: 0.5111561866125761
f1 score: 0.338255033557047


Epoch 5: confusion matrix:
 [[245   7]
 [224  17]]
accuracy: 0.5314401622718052
f1 score: 0.4039567686389448


Epoch 6: confusion matrix:
 [[243   9]
 [209  32]]
accuracy: 0.5578093306288032
f1 score: 0.4586456318504191


Epoch 7: confusion matrix:
 [[247   5]
 [234   7]]
accuracy: 0.5152129817444219
f1 score: 0.3646393348036387


Epoch 8: confusion matrix:
 [[243   9]
 [216  25]]
accuracy: 0.5436105476673428
f1 score: 0.43268124280782505


Epoch 9: confusion matrix:
 [[209  43]
 [151  90]]
accuracy: 0.6064908722109533
f1 score: 0.5821449792038027


Epoc

In [197]:
TRAIN_FILE_PATH="data/train_cagr.csv"
VALID_FILE_PATH="data/valid_cagr.csv"
TEST_FILE_PATH="data/test_cagr.csv"
MAX_VOCAB_SIZE=-1
MIN_VOCAB_FREQ=1
EMBEDDINGS_PATH='data/sst_glove_6b_300d.txt'


word_vocab, label_vocab=Frequencies.get_frequency_vocab(TRAIN_FILE_PATH)

word_vocab=Vocab.Vocab(word_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ, True)
label_vocab=Vocab.Vocab(label_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ)

train_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TRAIN_FILE_PATH)
train_dataloader=DataLoader(train_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

valid_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, VALID_FILE_PATH)
valid_dataloader=DataLoader(valid_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

test_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TEST_FILE_PATH)
test_dataloader=DataLoader(test_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

torch.manual_seed(SEED)
np.random.seed(SEED)

embeddings=EmbeddingMatrix.embedding_matrix(word_vocab, SEED, EMBEDDINGS_PATH, True)
model=BaselineModel(embeddings)

criterion=nn.BCEWithLogitsLoss()

optimizer=torch.optim.Adam(model.parameters(), lr=1e-3)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

for epoch in range(20):

    train(model, train_dataloader, criterion, optimizer)
    cm, acc, f1=eval(model, valid_dataloader, criterion)
    print(f"Epoch {epoch+1}: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

cm, acc, f1=eval(model, test_dataloader, criterion)
print(f"Test metrics: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

Epoch 1: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.35131578947368425


Epoch 2: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.35131578947368425


Epoch 3: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.35131578947368425


Epoch 4: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.35131578947368425


Epoch 5: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.35131578947368425


Epoch 6: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.35131578947368425


Epoch 7: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.35131578947368425


Epoch 8: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.35131578947368425


Epoch 9: confusion matrix:
 [[267   0]
 [226   0]]
accuracy: 0.5415821501014199
f1 score: 0.351315789473

In [198]:
TRAIN_FILE_PATH="data/train_ccon.csv"
VALID_FILE_PATH="data/valid_ccon.csv"
TEST_FILE_PATH="data/test_ccon.csv"
MAX_VOCAB_SIZE=-1
MIN_VOCAB_FREQ=1
EMBEDDINGS_PATH='data/sst_glove_6b_300d.txt'


word_vocab, label_vocab=Frequencies.get_frequency_vocab(TRAIN_FILE_PATH)

word_vocab=Vocab.Vocab(word_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ, True)
label_vocab=Vocab.Vocab(label_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ)

train_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TRAIN_FILE_PATH)
train_dataloader=DataLoader(train_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

valid_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, VALID_FILE_PATH)
valid_dataloader=DataLoader(valid_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

test_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TEST_FILE_PATH)
test_dataloader=DataLoader(test_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

torch.manual_seed(SEED)
np.random.seed(SEED)

embeddings=EmbeddingMatrix.embedding_matrix(word_vocab, SEED, EMBEDDINGS_PATH, True)
model=BaselineModel(embeddings)

criterion=nn.BCEWithLogitsLoss()

optimizer=torch.optim.Adam(model.parameters(), lr=1e-3)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

for epoch in range(20):

    train(model, train_dataloader, criterion, optimizer)
    cm, acc, f1=eval(model, valid_dataloader, criterion)
    print(f"Epoch {epoch+1}: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

cm, acc, f1=eval(model, test_dataloader, criterion)
print(f"Test metrics: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

Epoch 1: confusion matrix:
 [[241   0]
 [252   0]]
accuracy: 0.48884381338742394
f1 score: 0.3283378746594005


Epoch 2: confusion matrix:
 [[241   0]
 [252   0]]
accuracy: 0.48884381338742394
f1 score: 0.3283378746594005


Epoch 3: confusion matrix:
 [[241   0]
 [252   0]]
accuracy: 0.48884381338742394
f1 score: 0.3283378746594005


Epoch 4: confusion matrix:
 [[240   1]
 [250   2]]
accuracy: 0.4908722109533469
f1 score: 0.33616051071591424


Epoch 5: confusion matrix:
 [[212  29]
 [203  49]]
accuracy: 0.5294117647058824
f1 score: 0.4716555801921656


Epoch 6: confusion matrix:
 [[230  11]
 [231  21]]
accuracy: 0.5091277890466531
f1 score: 0.4015789896071586


Epoch 7: confusion matrix:
 [[153  88]
 [144 108]]
accuracy: 0.5294117647058824
f1 score: 0.5254580456718003


Epoch 8: confusion matrix:
 [[186  55]
 [181  71]]
accuracy: 0.5212981744421906
f1 score: 0.4937517404622668


Epoch 9: confusion matrix:
 [[209  32]
 [198  54]]
accuracy: 0.5334685598377282
f1 score: 0.4822941778069982

In [199]:
TRAIN_FILE_PATH="data/train_cext.csv"
VALID_FILE_PATH="data/valid_cext.csv"
TEST_FILE_PATH="data/test_cext.csv"
MAX_VOCAB_SIZE=-1
MIN_VOCAB_FREQ=1
EMBEDDINGS_PATH='data/sst_glove_6b_300d.txt'


word_vocab, label_vocab=Frequencies.get_frequency_vocab(TRAIN_FILE_PATH)

word_vocab=Vocab.Vocab(word_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ, True)
label_vocab=Vocab.Vocab(label_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ)

train_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TRAIN_FILE_PATH)
train_dataloader=DataLoader(train_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

valid_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, VALID_FILE_PATH)
valid_dataloader=DataLoader(valid_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

test_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TEST_FILE_PATH)
test_dataloader=DataLoader(test_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

torch.manual_seed(SEED)
np.random.seed(SEED)

embeddings=EmbeddingMatrix.embedding_matrix(word_vocab, SEED, EMBEDDINGS_PATH, True)
model=BaselineModel(embeddings)

criterion=nn.BCEWithLogitsLoss()

optimizer=torch.optim.Adam(model.parameters(), lr=1e-3)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

for epoch in range(20):

    train(model, train_dataloader, criterion, optimizer)
    cm, acc, f1=eval(model, valid_dataloader, criterion)
    print(f"Epoch {epoch+1}: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

cm, acc, f1=eval(model, test_dataloader, criterion)
print(f"Test metrics: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

Epoch 1: confusion matrix:
 [[251   0]
 [242   0]]
accuracy: 0.5091277890466531
f1 score: 0.33736559139784944


Epoch 2: confusion matrix:
 [[251   0]
 [242   0]]
accuracy: 0.5091277890466531
f1 score: 0.33736559139784944


Epoch 3: confusion matrix:
 [[251   0]
 [242   0]]
accuracy: 0.5091277890466531
f1 score: 0.33736559139784944


Epoch 4: confusion matrix:
 [[251   0]
 [242   0]]
accuracy: 0.5091277890466531
f1 score: 0.33736559139784944


Epoch 5: confusion matrix:
 [[251   0]
 [242   0]]
accuracy: 0.5091277890466531
f1 score: 0.33736559139784944


Epoch 6: confusion matrix:
 [[251   0]
 [242   0]]
accuracy: 0.5091277890466531
f1 score: 0.33736559139784944


Epoch 7: confusion matrix:
 [[251   0]
 [242   0]]
accuracy: 0.5091277890466531
f1 score: 0.33736559139784944


Epoch 8: confusion matrix:
 [[244   7]
 [234   8]]
accuracy: 0.5111561866125761
f1 score: 0.3658334801150769


Epoch 9: confusion matrix:
 [[246   5]
 [239   3]]
accuracy: 0.5050709939148073
f1 score: 0.3462391304347

In [200]:
TRAIN_FILE_PATH="data/train_cneu.csv"
VALID_FILE_PATH="data/valid_cneu.csv"
TEST_FILE_PATH="data/test_cneu.csv"
MAX_VOCAB_SIZE=-1
MIN_VOCAB_FREQ=1
EMBEDDINGS_PATH='data/sst_glove_6b_300d.txt'


word_vocab, label_vocab=Frequencies.get_frequency_vocab(TRAIN_FILE_PATH)

word_vocab=Vocab.Vocab(word_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ, True)
label_vocab=Vocab.Vocab(label_vocab, MAX_VOCAB_SIZE, MIN_VOCAB_FREQ)

train_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TRAIN_FILE_PATH)
train_dataloader=DataLoader(train_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

valid_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, VALID_FILE_PATH)
valid_dataloader=DataLoader(valid_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

test_dataset=NLPDataset.NLPDataset(word_vocab, label_vocab, TEST_FILE_PATH)
test_dataloader=DataLoader(test_dataset, shuffle=True, batch_size=32, collate_fn=Collate.pad_collate_fn)

torch.manual_seed(SEED)
np.random.seed(SEED)

embeddings=EmbeddingMatrix.embedding_matrix(word_vocab, SEED, EMBEDDINGS_PATH, True)
model=BaselineModel(embeddings)

criterion=nn.BCEWithLogitsLoss()

optimizer=torch.optim.Adam(model.parameters(), lr=1e-4)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

for epoch in range(20):

    train(model, train_dataloader, criterion, optimizer)
    cm, acc, f1=eval(model, valid_dataloader, criterion)
    print(f"Epoch {epoch+1}: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

cm, acc, f1=eval(model, test_dataloader, criterion)
print(f"Test metrics: confusion matrix:\n {cm}\naccuracy: {acc}\nf1 score: {f1}\n\n")

Epoch 1: confusion matrix:
 [[238   0]
 [255   0]]
accuracy: 0.4827586206896552
f1 score: 0.32558139534883723


Epoch 2: confusion matrix:
 [[238   0]
 [255   0]]
accuracy: 0.4827586206896552
f1 score: 0.32558139534883723


Epoch 3: confusion matrix:
 [[238   0]
 [255   0]]
accuracy: 0.4827586206896552
f1 score: 0.32558139534883723


Epoch 4: confusion matrix:
 [[238   0]
 [255   0]]
accuracy: 0.4827586206896552
f1 score: 0.32558139534883723


Epoch 5: confusion matrix:
 [[230   8]
 [241  14]]
accuracy: 0.4949290060851927
f1 score: 0.3749420804203816


Epoch 6: confusion matrix:
 [[224  14]
 [238  17]]
accuracy: 0.48884381338742394
f1 score: 0.37944055944055943


Epoch 7: confusion matrix:
 [[224  14]
 [241  14]]
accuracy: 0.4827586206896552
f1 score: 0.3681043885618928


Epoch 8: confusion matrix:
 [[206  32]
 [210  45]]
accuracy: 0.5091277890466531
f1 score: 0.4505268781548212


Epoch 9: confusion matrix:
 [[218  20]
 [221  34]]
accuracy: 0.5111561866125761
f1 score: 0.43204122508879