In [1]:
!pip install transformers
import pandas as pd
import numpy as np
from tqdm.auto import tqdm  # progress bar
import tensorflow as tf
from datetime import datetime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


In [None]:
# Mount Google Drive at /content/drive; the data and output paths configured
# further down live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Tokenize text & create dataset

In [None]:
from transformers import BertTokenizer
# Module-level tokenizer for 'bert-base-uncased'. create_dataset calls
# tokenizer.encode_plus on every text to obtain its input ids and attention mask.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:

def DatasetMapFunction(input_ids, attn_masks, labels):
    """Repack one ``(input_ids, attn_masks, labels)`` element as ``(features, labels)``.

    The feature dict uses the keys 'input_ids' and 'attention_mask', which are
    the names given to the two Keras input layers in create_model.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

def create_dataset(df, label_name = 'ArgumentLevel', label_number = 5, batch_size = 16, max_length = 256):
    """Tokenize ``df['Content']`` and build a shuffled, batched ``tf.data.Dataset``.

    Uses the module-level ``tokenizer``.

    Args:
        df: DataFrame with a 'Content' text column and a ``label_name`` column
            of integer class ids.
        label_name: Name of the label column; its values index the one-hot
            label matrix, so they must lie in ``range(label_number)``.
        label_number: Number of classes (width of the one-hot label vector).
        batch_size: Examples per batch; a trailing partial batch is dropped.
        max_length: Length every text is padded / truncated to.

    Returns:
        A dataset of ``({'input_ids', 'attention_mask'}, one_hot_labels)``
        batches. The examples are shuffled once; every later iteration yields
        the same order, so ``take``/``skip`` splits of it stay disjoint.
    """
    n_rows = len(df)

    # Token ids and masks are integers: allocate them as int32 (the dtype of
    # the model's input layers) rather than numpy's default float64.
    X_input_ids = np.zeros((n_rows, max_length), dtype=np.int32)
    X_attn_masks = np.zeros((n_rows, max_length), dtype=np.int32)

    # Wrap the iterable itself (not enumerate) so tqdm knows the total.
    for i, text in enumerate(tqdm(df['Content'], total=n_rows)):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length = max_length,
            truncation = True,
            padding = 'max_length',
            add_special_tokens = True, # add [CLS] [PAD] [SEP] tokens
            return_tensors = 'tf'
        )

        X_input_ids[i,:] = tokenized_text.input_ids
        X_attn_masks[i,:] = tokenized_text.attention_mask

    # One-hot encode the integer class ids.
    labels = np.zeros((n_rows, label_number))
    labels[np.arange(n_rows), df[label_name].values] = 1

    dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
    dataset = dataset.map(DatasetMapFunction)

    # Shuffle ONCE. With reshuffle_each_iteration=True the order changes on
    # every pass over the dataset, so a later take()/skip() split would hand
    # out different examples each epoch and validation data would leak into
    # training. The buffer covers the whole frame so the shuffle is uniform
    # even for more than 400 rows.
    dataset = dataset.shuffle(max(n_rows, 400), reshuffle_each_iteration=False)

    # drop_remainder: e.g. 179 rows / 16 = 11 full batches, the last 3 rows are dropped
    dataset = dataset.batch(batch_size, drop_remainder=True)

    return dataset

Split training and validation data 

In [None]:
def split_dataset(dataset, p = 0.5, batch_size = 16):
    """Split a batched dataset into leading training and trailing validation batches.

    Args:
        dataset: A batched ``tf.data.Dataset`` with a known, finite cardinality.
            It must yield its elements in the same order on every iteration
            (i.e. not reshuffle between iterations); otherwise the two parts
            overlap across epochs.
        p: Fraction of the batches used for training; the count is truncated
            towards zero.
        batch_size: Unused. The number of batches is read from ``dataset``
            itself; the parameter is kept so existing calls keep working.

    Returns:
        ``(train_dataset, val_dataset)``.

    Raises:
        ValueError: If the dataset's cardinality is unknown or infinite.
    """
    # Count the batches of the dataset that was passed in, instead of deriving
    # the count from a global DataFrame that may not match it.
    n_batches = int(dataset.cardinality())
    if n_batches < 0:
        raise ValueError(
            'split_dataset needs a dataset with a known, finite cardinality'
        )

    train_size = int(n_batches * p)

    train_dataset = dataset.take(train_size)  # first train_size batches
    val_dataset = dataset.skip(train_size)    # all remaining batches

    return train_dataset, val_dataset

Create model

In [None]:
from transformers import TFBertModel
# Module-level pre-trained 'bert-base-uncased' model. create_model wraps its
# `.bert` layer, so every model built by create_model shares this one instance.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [None]:
def create_model(label_number = 5, max_length = 256):
    """Build and compile a BERT-based classifier.

    The model feeds token ids and attention masks through the module-level
    ``bert_model``, then a 512-unit ReLU layer and a softmax output layer.
    Because ``bert_model`` is shared, training a model returned by this
    function also updates the BERT weights used by any later call.

    Args:
        label_number: Number of output classes (units of the softmax layer).
        max_length: Sequence length of both inputs; must match the
            ``max_length`` the dataset was tokenized with.

    Returns:
        A compiled ``tf.keras.Model`` with inputs named 'input_ids' and
        'attention_mask'.
    """
    # Input shapes must be tuples: `(max_length)` is just an int, `(max_length,)`
    # is the 1-tuple Keras documents for `shape`.
    input_ids = tf.keras.layers.Input(shape = (max_length,), name = 'input_ids', dtype = 'int32')
    attention_masks = tf.keras.layers.Input(shape = (max_length,), name = 'attention_mask', dtype = 'int32')

    # Element 1 of the BERT main layer's outputs (the pooled output per the
    # Hugging Face TFBertModel documentation).
    bert_layer = bert_model.bert(input_ids, attention_mask=attention_masks)[1]

    # Intermediate dense layer on top of the BERT output.
    intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name = 'intermediate_layer')(bert_layer)

    # Output layer: one softmax unit per label class.
    output_layer = tf.keras.layers.Dense(label_number, activation='softmax', name = 'output_layer')(intermediate_layer)

    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output_layer)

    # Optimizer, loss and accuracy metric for one-hot labels.
    optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
    loss_func = tf.keras.losses.CategoricalCrossentropy()
    # NOTE(review): 'accuracry' is misspelt (sic). The metric name is presumably
    # what appears as the key in `history.history`, so it is left unchanged to
    # stay compatible with histories that were already saved - confirm before renaming.
    acc = tf.keras.metrics.CategoricalAccuracy('accuracry')

    model.compile(optimizer = optim, loss = loss_func, metrics = [acc])

    return model


========= Divider: experiment configuration and runs below ==========
==========================

In [None]:
# Experiment configuration.
year = 'Y1'  # selects the input file labels_cleaned_{year}.csv and is part of the output path
label_name = 'ArgumentLevel'  # label column passed to create_dataset
label_number = 5  # number of classes; the ArgumentLevel coding has 5 values (ReasoningLevel has 4)

run_num = 5  # number of repeated runs per training ratio
e = 4 # number of epochs passed to model.fit

In [None]:
# Input folder and results folder on the mounted Google Drive.
folderpath = '/content/drive/MyDrive/Imperial/nlp-physicseducation/outputs'
outputpath = '/content/drive/MyDrive/Imperial/BERT-results/20230309'
# Labelled CSV for the selected year, relative to folderpath (leading slash included).
filepath = f'/sections/labels_cleaned_{year}.csv'

Load data

In [None]:
# Read the labelled sections for the configured year.
df = pd.read_csv(folderpath+filepath)

# Encode both label columns as integer class ids. Values that are not keys of
# a mapping are left as they are by `replace`.
df = df.assign(
    ArgumentLevel=df['ArgumentLevel'].replace(
        {'superficial': 0, 'extended': 1, 'deep': 2, 'expert': 3, 'prediction': 4}
    ),
    ReasoningLevel=df['ReasoningLevel'].replace(
        {'bal': 0, 'the': 1, 'exp': 2, 'none': 3}
    ),
)

/Users/jiayangzhang/Library/CloudStorage/GoogleDrive-jiayang.zhang@icloud.com/My Drive/Imperial/nlp-physicseducation/outputs/sections

In [None]:
import gc
import os

# Training ratios in tenths: s = 5 means 50% of the batches are used for training.
train_ratios = [2,3,4,5,6,7,8,9]
for s in train_ratios:
    for i in range(run_num):
        # Start every run from the pre-trained checkpoint. create_model wraps the
        # module-level `bert_model`, so without this reload each run would continue
        # from the weights fine-tuned by all previous runs (and their data splits).
        bert_model = TFBertModel.from_pretrained('bert-base-uncased')

        # Dataset: tokenize, shuffle and batch, then split into train / validation.
        dataset = create_dataset(df, label_name = label_name, label_number = label_number, batch_size = 16, max_length=512)
        train_dataset, val_dataset = split_dataset(dataset, p = (0.1*s), batch_size = 16)

        # Model
        model = create_model(label_number = label_number, max_length = 512)

        # Train & validate (fine-tuning)
        history = model.fit(
            train_dataset,
            validation_data = val_dataset,
            epochs = e
        )

        history_saveto = outputpath + '/{year}/{label_name}/trainsize0{size}/{time}_{year}_{label_name}_{size}trainsize_{e}epochs_16batchsize_1e-5lr.npy'.format(
            time = datetime.now().strftime("%H%M%S"),
            year = year,
            label_name = label_name,
            size = s,
            e = e
            )
        # Create the target folder first so a missing directory cannot throw
        # away a finished training run.
        os.makedirs(os.path.dirname(history_saveto), exist_ok=True)
        # history.history is a dict, so np.save pickles it; load it back with
        # np.load(path, allow_pickle=True).item().
        np.save(history_saveto, history.history)

        # Release the model before the next run: drop the reference first,
        # then reset Keras state and collect garbage.
        del model
        tf.keras.backend.clear_session()
        gc.collect()

Save model

In [None]:
# NOTE(review): `model_saveto` is not used by the save call below, and its
# template has an unmatched '}' so it could not be passed to str.format as is.
model_saveto = './}_{}_trainsize{}'
# NOTE(review): the training loop above runs `del model` at the end of every
# run, so after a top-to-bottom execution `model` is undefined here and this
# line raises NameError. To keep a model, save it inside the loop before it is
# deleted.
model.save('Y1Y2_argument_model')