In [11]:
from wilds import get_dataset
from wilds.common.data_loaders import get_train_loader #shuffles
from wilds.common.data_loaders import get_eval_loader #doesn't shuffle

import torchvision.transforms as transforms

In [12]:
# Fetch the complete CivilComments dataset through WILDS; the files are
# downloaded first if they are not already available locally.
dataset = get_dataset(
    dataset="civilcomments",
    download=True,
)

In [13]:
# Show the mapping from split name to the integer id used for that split.
dataset.split_dict
# Each split name can be passed to dataset.get_subset(...): "train", "val" or "test".

{'train': 0, 'val': 1, 'test': 2}

In [14]:
# Work with the validation split, because it is smaller than the training split.
val_data = dataset.get_subset("val")

# Evaluation loader (the non-shuffling kind) that yields one example per batch.
val_loader = get_eval_loader(
    "standard",
    val_data,
    batch_size=1,
)

# Info on Wilds dataset

size of validation set: 45,180 comments

split into 4 groups
1. train data
2. test data
3. Prod pre-shift data
4. Prod post-shift data

Metadata: The metadata contains information like the domain identity, e.g., which camera a photo was taken from, or which hospital the patient's data came from, etc., as well as other metadata.

"""
    The CivilComments-wilds toxicity classification dataset.
    This is a modified version of the original CivilComments dataset.
    Supported `split_scheme`:
        'official'
    Input (x):
        A comment on an online article, comprising one or more sentences of text.
    Label (y):
        y is binary. It is 1 if the comment has been rated as toxic by a majority of the crowdworkers who saw that comment, and 0 otherwise.
    Metadata:
        Each comment is annotated with the following binary indicators:
        Source: https://github.com/p-lambda/wilds/blob/a7a452c80cad311cf0aabfd59af8348cba1b9861/wilds/datasets/civilcomments_dataset.py#L9
        
            - male
            - female
            - LGBTQ
            - christian
            - muslim
            - other_religions
            - black
            - white
            - identity_any
            
            - severe_toxicity
            - obscene
            - threat
            - insult
            - identity_attack
            - sexual_explicit
            
"Next, for each category of demographics (e.g., race, gender), we construct an auxiliary
    attribute (e.g., `na_race`, `na_gender`) that is 1 if the comment has no identities related to
    that demographic, and is 0 otherwise."


In [110]:
MAX = 1000  # number of texts to load

texts = []    # raw comment strings
y_train = []  # one label per comment, aligned with `texts`

# Walk the loader one example at a time and stop once MAX examples are stored.
# Every batch has three components: (input text, label, metadata).
for n_loaded, (x, y, domain) in enumerate(val_loader, start=1):
    text = x[0]  # x is a one-element tuple: (text,)
    texts.append(text)
    y_train.append(y.numpy()[0])  # y is a one-element tensor, e.g. tensor([1])

    if n_loaded == MAX:
        break

In [111]:
# Sanity check: both lists must contain the same number of entries.
print(f"{len(texts)} {len(y_train)}")

1000 1000


In [112]:
# TensorFlow is imported here because this cell sits above the cell that
# imports it; without this line a fresh top-to-bottom run fails with a
# NameError on `tf`.
import tensorflow as tf

# Convert the collected labels into a TensorFlow tensor for use with Keras.
y_train = tf.convert_to_tensor(y_train)

In [8]:
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizerFast

# Instantiate the DistilBERT tokenizer from the 'distilbert-base-uncased'
# checkpoint, using the FAST variant to optimize run-time.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# BERT

using bert tokenizer:

inputs = tokenizer.tokenize(text)
input_toks = tokenizer.encode(text)
tokenizer.convert_ids_to_tokens(input_toks)


In [113]:
batch_size = len(texts)
# Fixed sequence length; it has to match the width of the model's input layers.
max_length = 264
input_ids = []
attention_mask = []

# Tokenize the input texts.
# Pad AND truncate every text to exactly `max_length` tokens. Padding only to
# the longest text in the batch (padding=True) would make the tensor width
# depend on which texts happen to be loaded.
batch = texts
inputs = tokenizer(batch,
                   padding="max_length",
                   truncation=True,
                   max_length=max_length,
                   return_tensors="pt")
input_ids.extend(inputs['input_ids'])
attention_mask.extend(inputs['attention_mask'])

In [133]:
# Sanity check: number of examples, tokens per example, and the container /
# element types produced by the tokenizer step.
print(
    len(input_ids),
    len(input_ids[0]),
    type(input_ids),
    type(input_ids[0]),
)

1000 264 <class 'list'> <class 'torch.Tensor'>


In [127]:
# The tokenizer was asked for PyTorch tensors, while the model below is a
# TensorFlow one, so the ids and attention mask are converted
# torch -> numpy -> tensorflow. The type is printed after each stage.
ids_torch = inputs['input_ids']
attention_torch = inputs['attention_mask']
print(type(ids_torch))

ids_numpy = ids_torch.numpy()
attention_numpy = attention_torch.numpy()
print(type(ids_numpy))

X_train_ids = tf.convert_to_tensor(ids_numpy)
X_train_attention = tf.convert_to_tensor(attention_numpy)
print(type(X_train_ids))

<class 'torch.Tensor'>
<class 'numpy.ndarray'>
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [115]:
# Number of tokenized examples (length of the first dimension).
len(X_train_ids)

1000

In [60]:
from transformers import TFDistilBertModel, DistilBertConfig

# Dropout rates handed to the DistilBERT configuration.
DISTILBERT_DROPOUT = 0.2      # the dropout we add
DISTILBERT_ATT_DROPOUT = 0.2  # dropout applied to attention

# Configuration used when initializing DistilBERT.
config = DistilBertConfig(
    dropout=DISTILBERT_DROPOUT,
    attention_dropout=DISTILBERT_ATT_DROPOUT,
    output_hidden_states=True,
)

# Bare pre-trained DistilBERT transformer: it outputs raw hidden states and has
# no task-specific head on top. This object is the `transformer` argument that
# build_model receives in a later cell.
distilBERT = TFDistilBertModel.from_pretrained(
    'distilbert-base-uncased',
    config=config,
)

# Freeze every DistilBERT layer so its weights are not updated during training.
for bert_layer in distilBERT.layers:
    bert_layer.trainable = False

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2021-12-21 00:08:17.412688: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the 

In [None]:
MAX_LENGTH = 264      # sequence length expected by the model's input layers
LAYER_DROPOUT = 0.2   # NOTE(review): not used by build_model below
LEARNING_RATE = 5e-5
RANDOM_STATE = 42


def build_model(transformer, max_length=MAX_LENGTH):
    """Build and compile a binary classifier on top of a transformer.

    The model takes token ids and an attention mask, runs them through
    `transformer`, takes the hidden state at the first sequence position and
    feeds it to a single sigmoid unit.

    Args:
        transformer: Callable model that accepts `[input_ids, attention_mask]`
            and returns a sequence whose first element is the last hidden
            state (e.g. the DistilBERT instance created earlier).
        max_length: Number of tokens per input sequence.

    Returns:
        A compiled `tf.keras.Model` with inputs `input_ids` and
        `input_attention` and one sigmoid output per example.
    """
    # Seeded initializer so the classification head starts from the same
    # weights on every run.
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE)

    # Input layers for the tokenized comments and their attention masks.
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,),
                                            name='input_ids',
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,),
                                                  name='input_attention',
                                                  dtype='int32')

    # The first element of the transformer output is the last hidden state.
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]

    # Keep only the vector at sequence position 0 (the [CLS] token).
    cls_token = last_hidden_state[:, 0, :]

    # Custom output layer: one sigmoid unit producing a probability.
    output = tf.keras.layers.Dense(1,
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(cls_token)

    # Define the model.
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)

    # Compile the model. The output is a single sigmoid probability, so the
    # matching loss is binary cross-entropy on probabilities (not logits).
    # With this loss, 'accuracy' is evaluated as binary accuracy.
    model.compile(tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=['accuracy'])

    return model

In [None]:
# Build the classifier on top of the DistilBERT instance loaded earlier.
model_1 = build_model(distilBERT)

In [138]:
# Print the layer-by-layer summary of the assembled model.
model_1.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 264)]        0           []                               
                                                                                                  
 input_attention (InputLayer)   [(None, 264)]        0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  multiple            66362880    ['input_ids[0][0]',              
 BertModel)                                                       'input_attention[0][0]']        
                                                                                                  
 tf.__operators__.getitem_4 (Sl  (None, 768)         0           ['tf_distil_bert_model[4][7

In [129]:
# Only the last bare expression of a cell is displayed, so the first two
# values are printed explicitly; otherwise they would be silently discarded.
print(X_train_ids.shape)
print(type(X_train_ids))
type(y_train)

tensorflow.python.framework.ops.EagerTensor

In [139]:
# Train the classification head for one epoch.
# steps_per_epoch is left out on purpose: with in-memory tensors and a
# batch_size, Keras derives the number of steps from the data, so every
# example is used (including the final partial batch) and nothing has to be
# edited when the number of loaded texts changes.
train_history1 = model_1.fit(x = [X_train_ids, X_train_attention],
                          y = y_train,
                          epochs = 1,
                          batch_size = 32,
                          verbose = 1)

