In [None]:
import pandas as pd
import numpy as np
import nltk
import math
import time
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from ranger import Ranger
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

In [None]:
nltk.download('punkt')

In [None]:
nltk.download('stopwords')

In [None]:
symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"

def prep_stent(sentence):    
    tokens = word_tokenize(sentence)
    
    stops = set(stopwords.words('english'))
    clean_tokens = [x for x in tokens if ((x not in stops) and (x not in symbols))]
    
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(x) for x in clean_tokens]
    return stemmed_tokens

In [None]:
local_corpus = {}

def tf_and_learn_df(tokens):
    uniques, count = np.unique(tokens, return_counts=True)
    u_dict = dict(zip(uniques, count))
    for x in uniques:
        if x not in local_corpus:
            local_corpus.update({x: 1})
        else:
            local_corpus[x] += 1
    tf_s = [(u_dict[x] / len(tokens)) for x in tokens]
    return tf_s

In [None]:
def apply_tf_idf(row):
    idf = [math.log(data_len / (local_corpus[x] + 1)) for x in row['prep_text']]
    tf_idf = [(x * y) for x,y in zip(row['tf'], idf)]
    return tf_idf
    

In [None]:
def get_len(tokens):
    return len(tokens)

In [None]:
onot = pd.read_csv("onion-or-not.csv")
print(onot)

In [None]:
onot['prep_text'] = onot['text'].apply(prep_stent)
print(onot['prep_text'])

In [None]:
onot['tf'] = onot['prep_text'].apply(tf_and_learn_df)
print(onot['tf'])

In [None]:
data_len = len(onot)

onot['tf_idf'] = onot.apply(apply_tf_idf, axis=1)
print(onot['tf_idf'])

In [None]:
onot['len'] = onot['tf_idf'].apply(get_len)
print(onot['len'])

In [None]:
temp_len = onot['len'].sort_values()
temp_ind = int(np.floor(len(temp_len) * 0.9))
max_len = temp_len.iloc[temp_ind]
print(f"Top 90% sent.length: {max_len}")

In [47]:
class onion_words(Dataset):
    def __init__(self, dataframe, max_len):
        self.data = dataframe
        self.total = len(dataframe)
        self.max_len = max_len
    def __len__(self):
        return self.total
    def __getitem__(self, ind):
        label = torch.FloatTensor(1).zero_()
        if self.data.iloc[ind]['label'] == 1:
            label += 1
        x = torch.FloatTensor(self.data.iloc[ind]['tf_idf'])
        if int(x.size()[0]) > self.max_len:
            x = torch.narrow(x, 0, 0, self.max_len)
        else:
            s_pad = int(self.max_len - x.size())
            x = F.pad(x, (0, s_pad))
        return x.view(1, -1), label
        

In [48]:
def weights_init(m):
    if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear):
        torch.nn.init.kaiming_uniform_(m.weight)
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)


class Mish(nn.Module):
    # a self regulating activation function
    # credits:
    # Diganta Misra - https://arxiv.org/ftp/arxiv/papers/1908/1908.08681.pdf
    # Less Wright - https://github.com/lessw2020/mish
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x * (torch.tanh(F.softplus(x)))

class onion_net(nn.Module):
    def __init__(self, max_len, num_channels=24, depth=4, kernel_s=3, num_linear=64):
        super().__init__()
        self.max_len = max_len
        self.num_ch = num_channels
        
        layers = []
        norms = []
        self.initial_conv = nn.Conv1d(1, self.num_ch, 1)
        
        for i in range(depth):
            layer = nn.Conv1d(self.num_ch, self.num_ch,kernel_s)
            layers.append(layer)
            
            norm = nn.BatchNorm1d(self.num_ch)
            norms.append(norm)
            
        self.layers = nn.ModuleList(layers)
        self.norms = nn.ModuleList(norms)
        
        self._to_linear = 0
        x_temp = torch.randn(self.max_len * self.num_ch).view(-1, self.num_ch, self.max_len)
        x_temp = self.conv_forward(x_temp)
        self._to_linear = x_temp[0].shape[0] * x_temp[0].shape[1]
        
        self.activ_0 = Mish()
        self.fc_1 = nn.Linear(self._to_linear, num_linear)
        self.activ_1 = Mish()
        self.drop = nn.Dropout(0.2)
        self.fc_out = nn.Linear(num_linear, 1)
        
        
    def conv_forward(self, x):
        for layer, norm in zip(self.layers, self.norms):
            x = layer(x)
            x = norm(x)
        return x
    
    def forward(self,x):
        x = self.initial_conv(x)
        x = self.conv_forward(x)
        x = x.view(-1, self._to_linear)
        
        x = self.activ_0(x)
        x = self.activ_1(self.fc_1(x))
        x = self.drop(x)
        x = self.fc_out(x)
        
        return F.sigmoid(x)

In [49]:
ran_seed = (int(round(time.time()))) % 5000
np.random.seed(ran_seed)

index = onot.index.values
np.random.shuffle(index)

cut = int(np.floor(len(onot) * 0.25))
train_index, test_index = index[cut:], index[:cut]
train = onot.iloc[train_index]
test = onot.iloc[test_index]

In [50]:
learning_rate = 1e-4
batch_size = 64
scale = 1
lr = learning_rate * scale
bat_s = batch_size * scale
epochs = 10

In [51]:
train_set = onion_words(onot[['tf_idf', 'label']], max_len)
test_set = onion_words(onot[['tf_idf', 'label']], max_len)

In [52]:
train_loader = DataLoader(train_set, batch_size=bat_s)
test_loader = DataLoader(test_set, batch_size=bat_s)

In [53]:
channels = 32
depth = 5
kernel = 3

network = onion_net(max_len, num_channels=channels, depth=depth, kernel_s=kernel)
network.apply(weights_init)

onion_net(
  (initial_conv): Conv1d(1, 32, kernel_size=(1,), stride=(1,))
  (layers): ModuleList(
    (0): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
    (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
    (2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
    (3): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
    (4): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  )
  (norms): ModuleList(
    (0): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (activ_0): Mish()
  (fc_1): Linear(in_features=256, out_features=64, bias=True)
  (activ_1): Mish()
  (drop): Dropout(p=0.2, inplace=False)
  (fc_out

In [54]:
loss_f = nn.BCELoss()
optim = Ranger(network.parameters(), lr=lr, betas=(0.95, 0.999))
target_dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


In [55]:
# training loop
network.to(target_dev)
for i in range(epochs):
    network.train()
    for data in train_loader:
        sample, label = data
        sample, label = sample.to(target_dev), label.to(target_dev)
        
        optim.zero_grad()
        output = network(sample)
        loss = loss_f(output, label)
        loss.backward()
        optim.step()
    print(f"Epoch: {i} - done")

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value)


Epoch: 0 - done
Epoch: 1 - done
Epoch: 2 - done
Epoch: 3 - done
Epoch: 4 - done
Epoch: 5 - done
Epoch: 6 - done
Epoch: 7 - done
Epoch: 8 - done
Epoch: 9 - done


In [56]:
results = pd.DataFrame(columns=['pred', 'truth'])
network.eval()
network.to(target_dev)
for data in test_loader:
    sample, label = data
    sample, label = sample.to(target_dev), label.to(target_dev)
    
    output = network(sample)
    matches = [[float(i), float(j)] for i,j, in zip(output, label)]
    
    temp_res = pd.DataFrame(matches, columns=list(results.columns.values))
    results = results.append(temp_res, ignore_index=True)
    
print("Pass finished")

Pass finished


In [57]:
acc = len(results[(((results['pred'] >= 0.5) & (results['truth'] == 1)) | ((results['pred'] < 0.5) & (results['truth'] == 0)))]) / len(results)
precision = len(results[((results['pred'] >= 0.5) & (results['truth'] ==1))]) / len(results[results['pred'] >= 0.5])
recall = len(results[((results['pred'] >= 0.5) & (results['truth'] ==1))]) / len(results[results['truth'] == 1])
f_one = ((precision * recall) / (precision + recall)) * 2

print(f"Accuracy: {round(acc * 100, 2)}%")
print(f"Precision: {round(precision, 3)}")
print(f"Recall: {round(recall, 3)}")
print(f"F1: {round(f_one, 4)}")

Accuracy: 75.27%
Precision: 0.734
Recall: 0.535
F1: 0.6185
