In [1]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn

## load data

In [2]:
data = pd.read_csv('data/train-balanced-sarcasm.csv')
data = data.dropna().reset_index(drop=True)
y = np.asarray(data['label'])

In [5]:
print(y.shape)

(1010773,)


In [6]:
del data

In [7]:
np.random.seed(1001)
mask = np.random.rand((len(y)))

In [8]:
with open("master_data_all",'rb') as df:
    master_data = pickle.load(df)

In [9]:
# this cell is used for subsets
# master_data = np.hstack((master_data[:, 10:19], master_data[:, 100:]))

In [10]:
df = torch.from_numpy(master_data[mask <= 0.7])
df_valid = torch.from_numpy(master_data[(mask > 0.7) & (mask <= 0.85)])
df_test = torch.from_numpy(master_data[mask > 0.85])
del master_data

In [11]:
type(df)

torch.Tensor

In [12]:
print(df_valid.shape, df_test.shape, df.shape)

torch.Size([151379, 401]) torch.Size([151640, 401]) torch.Size([707754, 401])


In [13]:
print(sum(y[mask <= 0.70])/(len(y)*0.70), 
      sum(y[(mask > 0.7) & (mask <= 0.85)])/(len(y)*0.15),
     sum(y[mask > 0.85])/(len(y)*0.15))

0.5001320771330457 0.49935379490086634 0.4999078263203839


In [14]:
Y = torch.from_numpy(y[mask <= 0.70])
Y_valid = torch.from_numpy(y[(mask > 0.7) & (mask <= 0.85)])
Y_test = torch.from_numpy(y[mask > 0.85])

In [15]:
print(Y_valid.shape, Y_test.shape, Y.shape)

torch.Size([151379]) torch.Size([151640]) torch.Size([707754])


## pytorch net

In [16]:
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import MultiStepLR

In [22]:
torch.manual_seed(1001)

input_dim = df.shape[1]
output_dim = 1
train_len = df.shape[0]
device = torch.device("cuda:0" if torch.cuda.is_available()
                      else "cpu")

train = TensorDataset(df, Y)
trainloader = DataLoader(train,
                         batch_size=500,
                         shuffle=True,
                         num_workers=2)

net = torch.nn.Sequential(
    torch.nn.Linear(input_dim, 256),
    torch.nn.PReLU(),
    torch.nn.Dropout(0.3),
    torch.nn.Linear(256, 128),
    torch.nn.PReLU(),
    torch.nn.Dropout(0.2),
    torch.nn.Linear(128, 64),
    torch.nn.PReLU(),
    torch.nn.Dropout(0.2),
    torch.nn.Linear(64, 2),
    torch.nn.Sigmoid()
)

loss_fn = torch.nn.CrossEntropyLoss()
lr = 1e-3
optimizer = torch.optim.Adam(net.parameters(), lr=lr,
                            weight_decay=1e-3/3)
scheduler = MultiStepLR(optimizer, milestones=[50, 80], gamma=0.1)
epochs = 100

In [None]:
for m in net.modules():
    if type(m) in [nn.Linear]:
        nn.init.kaiming_normal_(m.weight.data, a=0.1,
                                mode='fan_in', nonlinearity='leaky_relu')

In [None]:
for epoch in range(epochs):
    scheduler.step()
    running_loss = 0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        net.to(device)
        inputs, labels = inputs.float(), labels.long()
        
        #clear grads
        optimizer.zero_grad()
        
        #forward to get predicted values
        outputs = net.forward(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        
        # back props 
        optimizer.step()
        # update the parameters
        running_loss += loss.item()
        
    print('epoch {}, loss {}' \
          .format(epoch + 1, running_loss / (train_len / 5000))) 

In [30]:
net.eval()

Sequential(
  (0): Linear(in_features=401, out_features=256, bias=True)
  (1): PReLU(num_parameters=1)
  (2): Dropout(p=0.3)
  (3): Linear(in_features=256, out_features=128, bias=True)
  (4): PReLU(num_parameters=1)
  (5): Dropout(p=0.2)
  (6): Linear(in_features=128, out_features=64, bias=True)
  (7): PReLU(num_parameters=1)
  (8): Dropout(p=0.2)
  (9): Linear(in_features=64, out_features=2, bias=True)
  (10): Sigmoid()
)

In [31]:
# TRAINING ACCURACY

with torch.no_grad():
    correct = 0
    total = 0
    tp, fp, fn, tn = 0, 0, 0, 0
    for i, data in enumerate(trainloader, 0): 
        inputs, labels = data 
        net.to(device) 
        inputs, labels = inputs.float(), labels.long()
        
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        tp += (predicted[labels == 1] == labels[labels == 1]).sum().item()
        fn += (predicted[labels == 1] != labels[labels == 1]).sum().item()
        fp += (predicted[labels == 0] != labels[labels == 0]).sum().item()
        tn += (predicted[labels == 0] == labels[labels == 0]).sum().item()
        precision, recall = (tp / (tp+fp)), (tp / (tp+fn))
        f1 = 2 * (precision * recall) / (precision + recall)

print('Accuracy of the network on the ~750000 train comments: {} %' \
      .format(100 * correct / total), \
      'and F1 score is: {} '.format(f1), 'and the CM values are:' \
      '\n true positive: {} '.format(tp), \
     '\n false positive: {} '.format(fp), \
     '\n true negative: {} '.format(tn), \
     '\n false negative: {} '.format(fn))

Accuracy of the network on the ~750000 train comments: 73.46238947430886 % and F1 score is: 0.7277060714176982  and the CM values are:
 true positive: 250976  
 false positive: 84933  
 true negative: 268957  
 false negative: 102888 


In [32]:
# VALIDATION ACCURACY

valid = TensorDataset(df_valid, Y_valid)
validloader = DataLoader(valid,
                        batch_size=500) 

with torch.no_grad():
    correct = 0
    total = 0
    tp, fp, fn, tn = 0, 0, 0, 0
    for i, data in enumerate(validloader, 0): 
        inputs, labels = data 
        net.to(device) 
        inputs, labels = inputs.float(), labels.long()
        
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        tp += (predicted[labels == 1] == labels[labels == 1]).sum().item()
        fn += (predicted[labels == 1] != labels[labels == 1]).sum().item()
        fp += (predicted[labels == 0] != labels[labels == 0]).sum().item()
        tn += (predicted[labels == 0] == labels[labels == 0]).sum().item()
        precision, recall = (tp / (tp+fp)), (tp / (tp+fn))
        f1 = 2 * (precision * recall) / (precision + recall)

print('Accuracy of the network on the ~150000 validaton comments: {} %' \
      .format(100 * correct / total), \
      'and F1 score is: {} '.format(f1), 'and the CM values are:' \
      '\n true positive: {} '.format(tp), \
     '\n false positive: {} '.format(fp), \
     '\n true negative: {} '.format(tn), \
     '\n false negative: {} '.format(fn))

Accuracy of the network on the ~150000 validaton comments: 69.89542803162922 % and F1 score is: 0.691914548404543  and the CM values are:
 true positive: 51174  
 false positive: 21036  
 true negative: 54633  
 false negative: 24536 


In [33]:
# TESTING ACCURACY

test = TensorDataset(df_test, Y_test)
testloader = DataLoader(test,
                        batch_size=500) 

with torch.no_grad():
    correct = 0
    total = 0
    tp, fp, fn, tn = 0, 0, 0, 0
    for i, data in enumerate(testloader, 0): 
        inputs, labels = data 
        net.to(device) 
        inputs, labels = inputs.float(), labels.long()
        
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        tp += (predicted[labels == 1] == labels[labels == 1]).sum().item()
        fn += (predicted[labels == 1] != labels[labels == 1]).sum().item()
        fp += (predicted[labels == 0] != labels[labels == 0]).sum().item()
        tn += (predicted[labels == 0] == labels[labels == 0]).sum().item()
        precision, recall = (tp / (tp+fp)), (tp / (tp+fn))
        f1 = 2 * (precision * recall) / (precision + recall)

print('Accuracy of the network on the ~150000 test comments: {} %' \
      .format(100 * correct / total), \
      'and F1 score is: {} '.format(f1), 'and the CM values are:' \
      '\n true positive: {} '.format(tp), \
     '\n false positive: {} '.format(fp), \
     '\n true negative: {} '.format(tn), \
     '\n false negative: {} '.format(fn))

Accuracy of the network on the ~150000 test comments: 69.49024004220523 % and F1 score is: 0.6869438711641912  and the CM values are:
 true positive: 50760  
 false positive: 21231  
 true negative: 54615  
 false negative: 25034 


In [None]:
torch.save(net.state_dict(), 
           './models/NN_weights_100e_decay_50_80_w_decay_lrelu.pt')