In [None]:
import pandas as pd
import numpy as np

from dotenv import load_dotenv

from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn

import os

import math
from sklearn.metrics import mean_squared_error, f1_score

In [None]:
# Read DATA_PATH / MODEL_PATH from the environment (optionally populated from a
# local .env file by load_dotenv).
load_dotenv()
data_path = os.getenv("DATA_PATH")
model_root = os.getenv("MODEL_PATH")

# Fail early with a readable message: an unset MODEL_PATH would otherwise raise
# "TypeError: NoneType + str" below, and an unset DATA_PATH would silently turn
# into the literal path 'None/processed/...'.
missing_vars = [
    name
    for name, value in (("DATA_PATH", data_path), ("MODEL_PATH", model_root))
    if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

# All checkpoints of this notebook are written below <MODEL_PATH>/RNN.
model_path = model_root + '/RNN'

In [None]:
# Load the processed CSV from <DATA_PATH>/processed and preview its first ten rows.
data = pd.read_csv(
    f'{data_path}/processed/Final_processed_data.csv'
)
data.head(n=10)

In [None]:
# Give the verbose label column the short name 'label' used by the later cells.
label_column_map = {'label(CyberBullying,Normal)': 'label'}
data = data.rename(columns=label_column_map)
data.head()

In [None]:
# Index of every row whose 's.no.' is 1 (the first row of a thread), followed by
# a sentinel equal to the number of rows so that consecutive entries delimit
# one thread each.
is_thread_start = data['s.no.'] == 1
parentIdx = data.index[is_thread_start].tolist() + [len(data)]
len(parentIdx)

In [None]:
# Take one label per thread (from the row with s.no. == 1) and normalise it:
# lower-case with every space character removed.
first_rows = data['s.no.'] == 1
raw_labels = data.loc[first_rows, 'label'].to_numpy()
labels = ["".join(raw.lower().split(" ")) for raw in raw_labels]

In [None]:
# Drop the identifier / text / label columns (columns that are already absent
# are ignored) and expose 'negative_prob' under the name 'sentiment'.
unused_columns = ['s.no.','isParent','authorName','text','likeCount','repliedTo','label']
data = (
    data
    .drop(columns=unused_columns, errors='ignore')
    .rename(columns={'negative_prob':'sentiment'})
)
data.head(5)

In [None]:
# Separating different threads
# parentIdx ends with the sentinel len(data), so each pair of consecutive
# entries is the [start, stop) row range of one thread. The former special
# case for the "last" index could never trigger (the loop stopped one short of
# it) and has been removed.
threads = [
    data.iloc[start:stop].copy()
    for start, stop in zip(parentIdx[:-1], parentIdx[1:])
]

In [None]:
# One NumPy array per thread (rows of the thread x remaining feature columns).
x_data = [thread_frame.to_numpy() for thread_frame in threads]

In [None]:
# Map the normalised label strings to integer class ids, then one-hot encode them.
label_dict = {'normal':0, 'cyberbullying':1}
labels = list(map(label_dict.__getitem__, labels))

y_data = to_categorical(labels)
y_data.shape

In [None]:
# Hold out 30% of the threads for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Parameters of the model
# num_layers / hidden_size configure the stacked RNN; batch_size is 1 because
# each thread is fed to the network on its own.
num_layers, hidden_size = 2, 256

batch_size = 1

In [None]:
class RNN_model(nn.Module):
    """Multi-layer ``nn.RNN`` followed by a linear layer that scores two classes.

    Inputs are sequence-first, i.e. ``(seq_len, batch, input_dim)``, because the
    ``nn.RNN`` is created with its default ``batch_first=False``.

    Args:
        input_dim: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_dim , hidden_size , num_layers):
        super(RNN_model, self).__init__()
        self.num_layers = num_layers
        self.input_size = input_dim
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size=input_dim, hidden_size=hidden_size, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, 2)
        # Kept as an attribute for backward compatibility; it is no longer
        # applied inside forward() (see the note there).
        self.sig = nn.Sigmoid()

    def forward(self, x, hn):
        """Run the sequence through the RNN and score its last time step.

        Args:
            x: Input tensor of shape ``(seq_len, batch, input_dim)``.
            hn: Initial hidden state of shape ``(num_layers, batch, hidden_size)``.

        Returns:
            Tuple ``(logits, hn)`` where ``logits`` has shape ``(batch, 2)`` and
            ``hn`` is the final hidden state returned by the RNN.
        """
        out, hn = self.rnn(x, hn)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself, so
        # squashing the scores through a sigmoid first caps how confident the
        # model can become and flattens the gradients. argmax is unaffected
        # because the sigmoid is monotonic.
        final_out = self.fc(out[-1])
        return final_out, hn

    def predict(self, x):
        """Score ``x`` starting from a fresh zero hidden state.

        Args:
            x: Input tensor of shape ``(seq_len, batch, input_dim)``.

        Returns:
            Logits of shape ``(batch, 2)``.
        """
        hn = self.init(x.shape[1])
        # The recurrent pass was missing here, which left `out` undefined.
        out, _ = self.rnn(x, hn)
        final_out = self.fc(out[-1])
        return final_out

    def init(self, batch_size=1):
        """Return a zero hidden state on the same device/dtype as the model.

        Args:
            batch_size: Batch dimension of the hidden state (default 1, which is
                what the training and evaluation loops use).

        Returns:
            Tensor of zeros with shape ``(num_layers, batch_size, hidden_size)``.
        """
        # Derive device and dtype from the parameters instead of relying on
        # notebook globals that are only defined in a later cell.
        reference = next(self.parameters())
        h0 = torch.zeros(
            self.num_layers,
            batch_size,
            self.hidden_size,
            device=reference.device,
            dtype=reference.dtype,
        )
        return h0

In [None]:
# w1 = number of threads labelled 1, w2 = number of threads labelled 0.
n_threads = len(labels)
w1 = np.count_nonzero(labels)
w2 = n_threads - w1
print(w1, " ", w2)

In [None]:
# Number of features per time step, taken from the first training thread.
input_dim = x_train[0].shape[1]

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs end to end.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The class defined above is RNN_model; the previous name `Lstm_model` does not
# exist in this notebook.
model = RNN_model(input_dim , hidden_size , num_layers).to(device)

# Class 0 is weighted by the number of class-1 threads and class 1 by the number
# of class-0 threads, so the rarer class gets the larger weight. The weights
# must be floating point to match the dtype of the logits.
class_weights = torch.tensor([w1, w2], dtype=torch.float32).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [None]:
def Train(epoch, X, Y):
    """Run one training epoch, one thread (sequence) per optimisation step.

    Uses the notebook-level ``model``, ``loss_fn``, ``optimizer`` and ``device``.

    Args:
        epoch: Zero-based epoch index; only used in the progress message.
        X: Sequence of 2-D NumPy arrays, each of shape ``(seq_len, n_features)``.
        Y: Per-thread targets; ``Y[i]`` is the NumPy target vector for ``X[i]``.
    """
    model.train()
    avg_loss = 0
    for i, x in enumerate(X):
        # Threads are independent samples, so each one starts from a fresh zero
        # hidden state. This matches Test(), which also evaluates every thread
        # from zeros; previously the state of one thread leaked into the next.
        h = model.init().float()

        # Target as a (1, n_classes) float tensor on the model's device.
        y = torch.from_numpy(Y[i]).unsqueeze(0).float().to(device)

        # (seq_len, n_features) -> (seq_len, batch=1, n_features)
        seq_len, n_features = x.shape[0], x.shape[1]
        x_batch = torch.from_numpy(x.reshape(seq_len, 1, n_features)).float().to(device)

        out, _ = model(x_batch, h)
        loss = loss_fn(out, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError when X is empty.
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, avg_loss / max(len(X), 1)))

def Test(epoch, X, Y):
    """Evaluate the notebook-level ``model`` on ``X``/``Y`` and report the F1 score.

    Args:
        epoch: Zero-based epoch index; only used in the progress message.
        X: Sequence of 2-D NumPy arrays, each of shape ``(seq_len, n_features)``.
        Y: Per-thread one-hot targets; ``Y[i]`` belongs to ``X[i]``.

    Returns:
        The F1 score computed by ``sklearn.metrics.f1_score`` on the arg-max
        classes of the targets and predictions.
    """
    model.eval()
    pred_arr = []
    y_arr = []

    with torch.no_grad():
        # The same zero state is passed for every thread; the state returned by
        # the model is discarded, so threads never influence each other.
        h = model.init()

        # Iterate over the arguments rather than the global x_test / y_test so
        # the function evaluates whatever split the caller passes in.
        for i, x in enumerate(X):
            # (seq_len, n_features) -> (seq_len, batch=1, n_features)
            seq_len, n_features = x.shape[0], x.shape[1]
            x_batch = torch.from_numpy(x.reshape(seq_len, 1, n_features)).float().to(device)

            pred = model(x_batch, h)[0]
            pred_arr.extend(pred.detach().cpu().numpy())
            y_arr.append(Y[i])

    pred_vals = np.argmax(pred_arr, axis=1)
    y_vals = np.argmax(y_arr, axis=1)
    f_score = f1_score(y_vals, pred_vals)

    print('Epoch: {} \ttest F1 Score: {:.6f}'.format(epoch+1, f_score))

    return f_score

In [None]:
epoch = 100

os.makedirs(model_path, exist_ok=True)
os.makedirs(model_path+'/best', exist_ok=True)

max_f = 0
max_i = 0
# In-memory copy of the weights from the best epoch so far. Without it, the
# "best" checkpoint written after the loop would contain the weights of the
# LAST epoch while being named after the best one.
best_state = None
for i in range(epoch):
    print(f'Epoch {i+1}: ')
    Train(i, x_train, y_train)
    f_score = Test(i, x_test, y_test)
    f_score = round(f_score, 5)

    improved = f_score > max_f
    if improved:
        max_f = f_score
        max_i = i
    # Also snapshot the very first epoch so a checkpoint matching
    # (max_i, max_f) exists even if the score never rises above 0.
    if improved or best_state is None:
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

    print('Max F1 Score: {:.6f} on epoch {}'.format(max_f, max_i+1))
    filename = "epoch_" + str(i+1)+ "_fscore_" + str(f_score) + '.tar.pth'

    # Per-epoch checkpoint.
    torch.save(model.state_dict(), os.path.join(model_path+'/', filename))

# Saving the best model
filename = "epoch_" + str(max_i+1)+ "_fscore_" + str(max_f) + '.tar.pth'
# best_state is only None when the loop did not run at all (epoch == 0).
torch.save(
    best_state if best_state is not None else model.state_dict(),
    os.path.join(model_path+'/best/', filename),
)

In [None]:
# Evaluating the best Model
path = model_path+'/best/'
filename = "epoch_" + str(max_i+1)+ "_fscore_" + str(max_f) + '.tar.pth'

# map_location keeps the loaded tensors on the CPU; load_state_dict then copies
# them into the parameters of the existing model.
model.load_state_dict(torch.load(os.path.join(path,filename), map_location=lambda storage, loc: storage))

model.eval()
pred_arr = []
y_arr = []
with torch.no_grad():
    # The same zero hidden state is passed for every thread; the state returned
    # by the model is discarded.
    h = model.init()

    for i, x in enumerate(x_test):
        # (seq_len, n_features) -> (seq_len, batch=1, n_features)
        first = x.shape[0]
        second = x.shape[1]
        x_batch = torch.from_numpy(x.reshape(first, 1, second)).float().to(device)

        pred = model(x_batch, h)[0]
        # extend() appends in place instead of rebuilding the list every step.
        pred_arr.extend(pred.detach().cpu().numpy())
        y_arr.append(y_test[i])

# pred_vals / y_vals are reused by the confusion-matrix cell below.
pred_vals = np.argmax(pred_arr, axis=1)
y_vals = np.argmax(y_arr, axis=1)
f_score = f1_score(y_vals, pred_vals)

print('test F1 Score: {:.3f}'.format(f_score*100))

In [None]:
def confusion(y, pred):
    """Count binary confusion-matrix cells for labels in {0, 1}.

    Args:
        y: Indexable sequence of true labels.
        pred: Indexable sequence of predicted labels, at least as long as ``y``.

    Returns:
        Tuple ``(tp, fp, tn, fn)``. Pairs in which either value is neither 0
        nor 1 are not counted.
    """
    tp = fp = tn = fn = 0
    for idx in range(len(y)):
        actual, guess = y[idx], pred[idx]
        if guess == 1:
            if actual == 1:
                tp += 1
            elif actual == 0:
                fp += 1
        elif guess == 0:
            if actual == 0:
                tn += 1
            elif actual == 1:
                fn += 1
    return tp, fp, tn, fn
# Confusion counts for the test split, printed as two rows: "tp fp" then "fn tn".
(tp, fp, tn, fn) = confusion(y_vals, pred_vals)
print(tp, fp, '\n', fn, tn, sep=' ')

In [None]:
# Each ratio is guarded against an empty denominator and reported as 0 in that
# case: a model that never predicts the positive class has tp + fp == 0, a test
# split without positive threads has tp + fn == 0.
predicted_positive = tp + fp
prec = tp / predicted_positive if predicted_positive else 0.0
print('Precision: {:.3f}%'.format(prec*100))

actual_positive = tp + fn
rec = tp / actual_positive if actual_positive else 0.0
print('Recall: {:.3f}%'.format(rec*100))

total = tp + tn + fp + fn
acc = (tp + tn) / total if total else 0.0
print('Accuracy: {:.3f}%'.format(acc*100))