In [1]:
%matplotlib inline
import torch
import pandas as pd
import numpy as np
import csv
import json
import time
import random
import simplejson

import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from torch.nn import functional as F
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset

import data_loader
from sentence_bert import sentbert
from gan_model import Generator, Discriminator


In [2]:
# load all new types of hate data
all = []
with open('./data/train_new.json') as f:
    for line in f:
        all.append(json.loads(line))
print('we have total', len(all), 'samples')

we have total 770 samples


In [3]:
# for save the training time, we generate the sentence embedding before train the GAN model
for d in tqdm(all):
  d['sent_emb'] = sentbert(d['tweet']).numpy().tolist()

  0%|          | 0/770 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
# load the data with dataloader
'''
    dataloader: 
        input args: 'dataset', 'batch_size', 'num_worker'
        return: 'data_loader', 'dataset'(normalized)
'''
data = data_loader.get_data_loaders(all, 16, 2)
train_loader = data['data_loader']
train_dataset = data['dataset']


In [5]:
# Training Process
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

input_len = 6 # attribute length
input_dim = 768 # feature dimension
epochs = 3000

In [6]:
# Models
generator = Generator(input_len).to(device)
discriminator = Discriminator(input_dim).to(device)

# Optimizers
generator_optimizer = torch.optim.SGD(generator.parameters(), lr=1e-3, momentum=0.9)
discriminator_optimizer = torch.optim.SGD(discriminator.parameters(), lr=1e-3, momentum=0.9)

# loss
loss = nn.BCELoss().to(device)
gener_Loss = []
discr_Loss = []

In [7]:
generator.train()
discriminator.train()

for epoch in tqdm(range(epochs)):
  for i, batch in tqdm(enumerate(train_loader)):
    '''
      keys:
      'text': text, 'category': category, 'label':label, 'attributes': attributes, 'vector': sam_vector
    '''

    # Generate the data
    text = batch['text']
    category = batch['category']
    true_labels = batch['label'].float().to(device)
    attributes = batch['attributes'].float().to(device)
    sent_emb = batch['sent_emb'].float().to(device)
      # print(true_data)
  
    generator_optimizer.zero_grad()

    # Train the generatorloss
    generated_data = generator(attributes)
    generator_discriminator_out = discriminator(generated_data)
    generator_loss = loss(generator_discriminator_out, true_labels)

    if i % len(train_loader) == 0:
      gener_Loss.append(generator_loss.item())
      print('Generator Loss:', generator_loss.item())

    generator_loss.backward()
    generator_optimizer.step()

    # Train the discriminator on the true and generated data
    discriminator_optimizer.zero_grad()
    true_discriminator_out = discriminator(sent_emb)
    true_discriminator_loss = loss(true_discriminator_out, true_labels)

    # Get the final loss from discriminator
    generator_discriminator_out = discriminator(generated_data.detach())
    generator_discriminator_loss = loss(generator_discriminator_out, true_labels)

    discriminator_loss = (true_discriminator_loss + generator_discriminator_loss) / 2
    
    if i % len(train_loader) == 0:
      discr_Loss.append(discriminator_loss.item())
      print('Discriminator Loss:', discriminator_loss.item())

    # if i % len(train_loader) == 0:
    #   discr_Loss.append(generator_discriminator_loss.item())
    #   print('Discriminator Loss:', generator_discriminator_loss.item())

    discriminator_loss.backward()
    discriminator_optimizer.step()


  0%|          | 0/3000 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.7066943049430847
Discriminator Loss: 0.7070766687393188
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.694818913936615
Discriminator Loss: 0.6945250034332275
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.7026135325431824
Discriminator Loss: 0.702525794506073
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.6799271702766418
Discriminator Loss: 0.6799885630607605
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.6652306914329529
Discriminator Loss: 0.6650593876838684
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.660176157951355
Discriminator Loss: 0.6598796844482422
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.6564423441886902
Discriminator Loss: 0.6563023924827576
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.7181248664855957
Discriminator Loss: 0.7183345556259155
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.6854331493377686
Discriminator Loss: 0.6854116916656494
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.629292368888855
Discriminator Loss: 0.6290034651756287
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.7676782011985779
Discriminator Loss: 0.7679437398910522
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0it [00:00, ?it/s]

Generator Loss: 0.7297016978263855
Discriminator Loss: 0.7296843528747559



KeyboardInterrupt



In [None]:
plt.plot(gener_Loss)

In [None]:
plt.plot(discr_Loss)

In [None]:
# save the models
torch.save(generator.state_dict(), './model/GAN_generator.pt')

torch.save(discriminator.state_dict(), './model/GAN_discriminator.pt')