In [None]:
# bow

# # Part 1: Environment Setup

import os
os.getcwd()

# Import general_module, which lives in a sibling directory (a child of the parent directory)
import sys
sys.path.append("..")
from general_module.evaluation import *
from general_module.training import *
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pickle

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4') 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sklearn
import random


In [None]:
import re

def clearing(data):
    """Normalize raw text to lowercase alphanumerics separated by spaces.

    Each sentence is lowercased, every URL (starting with ``http://``,
    ``https://`` or ``www.``) is replaced by a single space, and every
    remaining character outside ``0-9`` / ``a-z`` is replaced by a space.

    Args:
        data: Iterable of strings (for example a list or a pandas Series).

    Returns:
        list: One cleaned string per input sentence, in input order.
    """
    # Raw strings keep `\s` and `\.` as regex escapes; in a plain string literal
    # they are invalid Python escape sequences and trigger a warning.
    link_pattern = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    symbol_pattern = re.compile(r'[^0-9a-z]')

    cleaned_text = []
    for sentence in data:
        sentence = sentence.lower()
        # removing links from text data
        sentence = link_pattern.sub(' ', sentence)
        # removing other symbols
        sentence = symbol_pattern.sub(' ', sentence)
        cleaned_text.append(sentence)
    return cleaned_text

def expand(np_data):
    """Split an indexable collection into a list of per-row NumPy arrays.

    Args:
        np_data: Sized container supporting integer indexing (the notebook
            passes the dense 2-D array produced by the vectorizer).

    Returns:
        list: ``np.array(np_data[i])`` for every ``i`` in ``range(len(np_data))``.
    """
    return [np.array(np_data[row_idx]) for row_idx in range(len(np_data))]

class Lemmatizer(object):
    """Callable tokenizer: split on whitespace, drop short words, lemmatize the rest."""

    def __init__(self):
        # One WordNetLemmatizer is created per instance and reused for every call.
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, sentence):
        """Return the lemmas of all words in `sentence` longer than two characters."""
        lemmas = []
        for word in sentence.split():
            # Words of one or two characters are discarded.
            if len(word) <= 2:
                continue
            lemmas.append(self.lemmatizer.lemmatize(word))
        return lemmas


In [None]:

class CustomDataset(Dataset):
    """Dataset yielding ``(features, label)`` float32 tensor pairs from a DataFrame.

    ``features`` is the row's ``content`` vector concatenated with the row's
    ``O``, ``C``, ``E`` and ``A`` column values; ``label`` is the row's
    ``label`` value converted to a float.

    The column arrays are read once at construction time, so changes made to
    the DataFrame afterwards are not guaranteed to be reflected in the items.
    """

    def __init__(self, dataframe):
        """Store the frame and extract the arrays needed by ``__getitem__``.

        Args:
            dataframe: pandas DataFrame with columns ``content``, ``O``, ``C``,
                ``E``, ``A`` and ``label``.
        """
        self.dataframe = dataframe

        # Extract the column arrays once. Calling `dataframe[[...]].values`
        # inside __getitem__ rebuilt an (n_rows, 4) array for every sample.
        self._contents = dataframe["content"].values
        self._traits = np.asarray(dataframe[["O", "C", "E", "A"]].values, dtype=np.float64)
        self._labels = dataframe["label"].values

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at position ``index``."""
        # Positional lookup: `index` addresses the underlying arrays, not the
        # DataFrame's index labels.
        fea = np.concatenate((self._contents[index], self._traits[index]), axis=0)
        fea = torch.tensor(fea, dtype=torch.float32)

        # Going through str() first lets labels stored as numeric strings or
        # NumPy scalars both be parsed by float().
        label = torch.tensor(float(str(self._labels[index])), dtype=torch.float32)

        return fea, label

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

In [None]:

# # Part 3 Model Training
class CustomNetwork(nn.Module):
    """Fully connected binary classifier: two ReLU hidden layers and a sigmoid output.

    Args:
        input_dim: Number of input features. The default of 5004 corresponds to
            a 5000-term TF-IDF vector plus the four trait columns appended by
            ``CustomDataset``; pass the real width when the vocabulary is smaller.
        hidden_dim: Width of the first hidden layer (default 100).
        bottleneck_dim: Width of the second hidden layer (default 5).
    """

    def __init__(self, input_dim=5004, hidden_dim=100, bottleneck_dim=5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, bottleneck_dim)
        self.fc3 = nn.Linear(bottleneck_dim, 1)

    def forward(self, x):
        """Map a ``(..., input_dim)`` tensor to ``(..., 1)`` values in ``[0, 1]``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # Sigmoid keeps the output in [0, 1].
        x = torch.sigmoid(self.fc3(x))
        return x


def _evaluate(network, loader, loss_function):
    """Run `network` over `loader` without gradient tracking and total the results.

    Args:
        network: Module to evaluate; the caller is responsible for having
            switched it out of training mode.
        loader: Iterable of ``(inputs, labels)`` batches.
        loss_function: Callable ``(outputs, labels) -> loss tensor``.

    Returns:
        dict: ``'loss'`` (sum over batches of batch loss times batch size),
        ``'size'`` (number of samples seen) and ``'TP'``, ``'FP'``, ``'TN'``,
        ``'FN'`` (sums of the four values `e_confusion_matrix` returns per batch).
    """
    totals = {'loss': 0, 'size': 0, 'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}

    # Evaluation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = best_device(inputs, labels)

            # forward propagation
            outs = network(inputs).view(-1)

            # The loss is computed for this batch; nothing is reused from an
            # earlier loop.
            loss = loss_function(outs, labels)
            totals['loss'] += loss.item() * labels.size(0)
            totals['size'] += labels.size(0)

            # Threshold the network output at 0.5 to get hard 0/1 predictions.
            # NOTE(review): `.type(torch.FloatTensor)` yields a CPU tensor while
            # `labels` went through best_device — confirm e_confusion_matrix
            # copes with that when a GPU is in use.
            preds = (outs > 0.5).type(torch.FloatTensor)
            tp, fp, tn, fn = e_confusion_matrix(preds, labels)
            totals['TP'] += tp
            totals['FP'] += fp
            totals['TN'] += tn
            totals['FN'] += fn

    return totals


def train(epochs, trainloader, validationloader, testloader):
    """Train a `CustomNetwork`, restore the selected epoch, and score it on the test set.

    For each epoch the network is trained with Adam (lr=0.0005) and BCE loss on
    `trainloader`, then evaluated on `validationloader`; the metrics are printed
    and passed to `model.update`, and the network/optimizer state dicts are
    handed to the checkpoint. After the last epoch the state returned by
    `checkpoint.get(model.getOptEpoch())` is loaded, the network is evaluated on
    `testloader`, and the test metrics are passed to `model.override`.

    Args:
        epochs: Number of training epochs.
        trainloader: Iterable of ``(inputs, labels)`` training batches.
        validationloader: Iterable of ``(inputs, labels)`` validation batches.
        testloader: Iterable of ``(inputs, labels)`` test batches.

    Returns:
        The `CustomModel` instance created here, after `override` was called
        with the test-set accuracies.

    Raises:
        ZeroDivisionError: If `trainloader` or `validationloader` yields no samples.
    """
    checkpoint = Checkpoint()

    network = CustomNetwork()
    network = best_device(network)
    loss_function = nn.BCELoss()
    model = CustomModel(network)

    optimizer = torch.optim.Adam(network.parameters(), lr=0.0005)

    for e in range(epochs):
        # running totals for the size-weighted training loss of this epoch
        train_loss_sum = 0
        train_size = 0

        # set to training mode
        network.train(True)

        # per epoch training activity
        for inputs, labels in trainloader:
            inputs, labels = best_device(inputs, labels)

            # clear all the gradient to 0
            optimizer.zero_grad()

            # forward propagation
            outs = network(inputs)
            outs = outs.view(-1)

            # compute loss
            loss = loss_function(outs, labels)

            # backpropagation
            loss.backward()

            # update w
            optimizer.step()

            train_loss_sum += loss.item() * labels.size(0)
            train_size += labels.size(0)

        # Turn off training mode for reporting validation loss
        network.train(False)

        # per epoch validation activity
        validation = _evaluate(network, validationloader, loss_function)

        train_loss = train_loss_sum / train_size
        validation_loss = validation['loss'] / validation['size']
        confusion_matrix = (validation['TP'], validation['FP'], validation['TN'], validation['FN'])
        regular_accuracy, balanced_accuracy = e_accuracy(confusion_matrix)

        print(f'[Epoch {e + 1:2d}/{epochs:d}]: train_loss = {train_loss:.4f}, validation_loss = {validation_loss:.4f}, RA = {regular_accuracy:.4f}, BA: {balanced_accuracy:.4f}, CM:{confusion_matrix}')

        model.update(network, epochs = e+1, ba = balanced_accuracy, ra=regular_accuracy)

        # NOTE(review): state_dict() returns references to the live parameter
        # tensors; if Checkpoint.add does not copy them, every stored epoch
        # would alias the final weights — confirm against general_module.
        checkpoint.add(network.state_dict(), optimizer.state_dict())

    m_dict = checkpoint.get(model.getOptEpoch())
    network.load_state_dict(m_dict)

    network.eval()

    # Test pass. The loss is now computed from the test batches themselves; the
    # previous code reused the last validation batch's loss here.
    test = _evaluate(network, testloader, loss_function)
    test_loss = test['loss'] / test['size'] if test['size'] else float('nan')

    confusion_matrix = (test['TP'], test['FP'], test['TN'], test['FN'])
    regular_accuracy, balanced_accuracy = e_accuracy(confusion_matrix)
    print(f'test_loss = {test_loss:.4f}')
    print(confusion_matrix)

    model.override(network=network, ba = balanced_accuracy, ra=regular_accuracy)

    return model

In [None]:

# Load the corpus and hold out 20% of it as a label-stratified test set.
# NOTE(review): `extract` comes from general_module and is given a .pickle path;
# if it unpickles the file, only point it at trusted files — confirm.
dataset = extract("../../corpus/personality-aware-sentiment/bow-movie-review.pickle")
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset.label)

# Work on explicit copies so the column assignments below modify independent
# frames instead of slices derived from `dataset`.
trainset = trainset.copy()
testset = testset.copy()

# Apply the same text cleaning to both splits. Previously only the training
# text was cleaned, so the test text was vectorized with URLs and punctuation
# still attached to its tokens.
trainset.content = clearing(trainset.content)
testset.content = clearing(testset.content)

# Fit the TF-IDF vocabulary on the training text only.
vectorizer=TfidfVectorizer(max_features=5000,stop_words='english',tokenizer=Lemmatizer())
vectorizer.fit(trainset.content)

# Replace each document's text with its dense TF-IDF row vector.
trainset.content=expand(vectorizer.transform(trainset.content).toarray())
testset.content=expand(vectorizer.transform(testset.content).toarray())

# Carve a label-stratified validation set (20%) out of the training data.
trainset, validationset = train_test_split(trainset, test_size=0.2, random_state=42, stratify=trainset.label)


In [None]:
# Fix the PyTorch RNG seed before the network is created inside train().
torch.manual_seed(42)
batch_size = 32

# Wrap each split in a CustomDataset (train, validation, test order).
custom_trainset, custom_validationset, custom_textset = (
    CustomDataset(dataframe=frame) for frame in (trainset, validationset, testset)
)

# All three loaders iterate in fixed order (shuffle=False), including training.
trainloader, validationloader, testloader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (custom_trainset, custom_validationset, custom_textset)
)

model = train(
    epochs=20,
    trainloader=trainloader,
    testloader=testloader,
    validationloader=validationloader,
)

In [None]:
# Re-seed PyTorch so this run starts from the same RNG state.
torch.manual_seed(42)

# One CustomDataset per split, built in train / validation / test order.
custom_trainset, custom_validationset, custom_textset = map(
    CustomDataset, (trainset, validationset, testset)
)

batch_size = 32
# Loaders keep the dataset order (no shuffling) for every split.
trainloader, validationloader, testloader = [
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (custom_trainset, custom_validationset, custom_textset)
]

# Train for 20 epochs and keep the returned model wrapper.
model = train(epochs=20, trainloader=trainloader, validationloader=validationloader, testloader=testloader)

In [None]:
# Seed PyTorch's RNG ahead of model construction.
torch.manual_seed(42)

batch_size = 32

# Training split
custom_trainset = CustomDataset(dataframe=trainset)
trainloader = DataLoader(
    custom_trainset,
    batch_size=batch_size,
    shuffle=False,
)

# Validation split
custom_validationset = CustomDataset(dataframe=validationset)
validationloader = DataLoader(
    custom_validationset,
    batch_size=batch_size,
    shuffle=False,
)

# Test split
custom_textset = CustomDataset(dataframe=testset)
testloader = DataLoader(
    custom_textset,
    batch_size=batch_size,
    shuffle=False,
)

# 20-epoch training run over the three loaders.
model = train(
    epochs=20,
    trainloader=trainloader,
    testloader=testloader,
    validationloader=validationloader,
)