In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2


In [2]:
import pandas as pd
import numpy as np
import ast
from transformers import AutoTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, classification_report

In [3]:
def extractList(df):
    """Turn the stringified post lists in ``df["Post"]`` into plain text, in place.

    Each cell of the ``Post`` column is expected to hold the textual form of a
    Python list of strings (e.g. ``"['first post', 'second post']"``). Cells
    that do not end with ``]`` are treated as cut off and are patched by
    appending ``']`` before parsing. Every parsed list is then joined into a
    single space-separated string.

    Args:
        df: DataFrame with a string ``Post`` column. It is modified in place.

    Returns:
        The same ``df`` object, so the call can also be chained. Callers that
        ignore the return value keep working because the frame is still
        mutated in place.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when a
            cell is not a valid Python literal even after patching.
    """
    # Build the "missing closing bracket" mask once. na=True leaves missing
    # values unpatched, which matches the former `== False` comparison.
    truncated = ~df["Post"].str.endswith("]", na=True)
    df.loc[truncated, "Post"] = df.loc[truncated, "Post"] + "']"
    # literal_eval only accepts Python literals, so text in the data file
    # cannot execute code while being parsed.
    df["Post"] = df["Post"].apply(ast.literal_eval).apply(" ".join)
    return df

In [4]:
# Risk classes listed from least to most intense; a class's position in the
# list is the integer code it is mapped to.
label_conversion = {
    class_name: code
    for code, class_name in enumerate(
        ["Supportive", "Indicator", "Ideation", "Behavior", "Attempt"]
    )
}

In [5]:
# Download the workshop dataset; a later cell reads it from /content/Workshop_data.
!git clone https://github.com/hrmoradi/Workshop_data # goes to your content folder

Cloning into 'Workshop_data'...
remote: Enumerating objects: 5862, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 5862 (delta 2), reused 21 (delta 2), pack-reused 5840[K
Receiving objects: 100% (5862/5862), 1.13 GiB | 16.75 MiB/s, done.
Resolving deltas: 100% (4/4), done.
Updating files: 100% (6260/6260), done.


In [6]:
# Load the Reddit posts file from the cloned Workshop_data checkout.
reddit_csv = "/content/Workshop_data/500Reddit.txt"
dat = pd.read_csv(reddit_csv)
dat

Unnamed: 0,User,Post,Label
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive
1,user-1,['It can be hard to appreciate the notion that...,Ideation
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior
3,user-3,['I tried to kill my self once and failed badl...,Attempt
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation
...,...,...,...
495,user-495,"['Its not the end, it just feels that way. Or ...",Supportive
496,user-496,"['It was a skype call, but she ended it and Ve...",Indicator
497,user-497,['That sounds really weird.Maybe you were Dist...,Supportive
498,user-498,['Dont know there as dumb as it sounds I feel ...,Attempt


In [7]:
# Swap the textual risk labels for their integer codes (ordered by intensity).
dat = dat.replace(to_replace={"Label": label_conversion})
# Parse every stringified list of posts and merge it into one text per row.
# extractList works in place, so `dat` itself is updated.
extractList(dat)
dat

Unnamed: 0,User,Post,Label
0,user-0,"Its not a viable option, and youll be leaving ...",0
1,user-1,It can be hard to appreciate the notion that y...,2
2,user-2,"Hi, so last night i was sitting on the ledge o...",3
3,user-3,I tried to kill my self once and failed badly ...,4
4,user-4,Hi NEM3030. What sorts of things do you enjoy ...,2
...,...,...,...
495,user-495,"Its not the end, it just feels that way. Or at...",0
496,user-496,"It was a skype call, but she ended it and Vent...",1
497,user-497,That sounds really weird.Maybe you were Distra...,0
498,user-498,Dont know there as dumb as it sounds I feel Hy...,4


In [8]:
# Number of distinct classes; passed to the model constructor as `num_labels`.
num_labels = dat["Label"].nunique(dropna=True)
num_labels

5

In [9]:
# Hold out 33% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dat["Post"],
    dat["Label"],
    test_size=0.33,
    stratify=dat["Label"],
    shuffle=True,
    random_state=42,
)

In [10]:
# Load the tokenizer that matches the "bert-base-uncased" checkpoint
# (its files are downloaded on first use).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
# Encode both splits with identical settings: every post is truncated or
# padded to a fixed length of 512 tokens so all rows share one shape.
encode_opts = dict(max_length=512, truncation="longest_first", padding="max_length")
X_train_input = tokenizer(X_train.values.tolist(), **encode_opts)
X_test_input = tokenizer(X_test.values.tolist(), **encode_opts)

In [12]:
# Pull the token ids and attention masks out of the tokenizer output as
# NumPy arrays, the form handed to Keras for training and prediction.
train_input_ids, train_att_mask = (
    np.asarray(X_train_input[field]) for field in ("input_ids", "attention_mask")
)
test_input_ids, test_att_mask = (
    np.asarray(X_test_input[field]) for field in ("input_ids", "attention_mask")
)

# Labels go from pandas Series to plain arrays as well.
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

# Freezing transformer weights

In [13]:
# The classification head is not part of the checkpoint and is initialised
# randomly. Seed Python, NumPy and TensorFlow first so repeated runs start
# from the same weights (GPU kernels may still add some nondeterminism).
tf.keras.utils.set_random_seed(42)
model_frozen = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Freeze the BERT encoder so that only the classification head is updated
# during training.
model_frozen.bert.trainable = False
# The summary should now list only the classifier's parameters as trainable.
model_frozen.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 109,486,085
Trainable params: 3,845
Non-trainable params: 109,482,240
_________________________________________________________________


In [15]:
# Integer class labels + raw logits from the model -> sparse categorical
# cross-entropy with from_logits=True; accuracy is tracked during training.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

# NOTE(review): with the encoder frozen only the small classifier head is
# trained. A learning rate of 1e-5 may be too low for it to move in 15 epochs;
# the recorded report shows every test sample predicted as class 2.
# TODO confirm and consider a larger rate for the frozen run.
model_frozen.compile(
    loss=loss,
    metrics=metrics,
    optimizer=Adam(learning_rate=1e-5),
)

In [16]:
# Train the frozen-encoder model: 15 passes over the training split in
# mini-batches of 4 examples.
model_frozen.fit(
    x=[train_input_ids, train_att_mask],
    y=y_train,
    batch_size=4,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fa0f44938e0>

Since TFBertForSequenceClassification returns logits, we must first convert them to probabilities using softmax.

In [17]:
# Score the held-out split. The model returns raw logits, so a softmax over
# the class axis turns them into probabilities; the most probable class is
# the predicted label.
output = model_frozen.predict([test_input_ids, test_att_mask])
softmax = tf.keras.layers.Softmax(axis=-1)
preds = softmax(output.logits)
pred_labels = np.argmax(preds.numpy(), axis=1)



In [18]:
print('Classification Report')
# zero_division=0 reports 0.0 for classes that were never predicted, without
# emitting an UndefinedMetricWarning for each affected metric. The reported
# numbers are the same as with the default setting.
print(classification_report(y_test, pred_labels, zero_division=0))

Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        36
           1       0.00      0.00      0.00        33
           2       0.34      1.00      0.51        56
           3       0.00      0.00      0.00        25
           4       0.00      0.00      0.00        15

    accuracy                           0.34       165
   macro avg       0.07      0.20      0.10       165
weighted avg       0.12      0.34      0.17       165



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Macro-averaged one-vs-rest ROC AUC computed from the class probabilities.
roc_auc_score(
    y_test,
    preds,
    average='macro',
    multi_class='ovr',
)

0.5649207643355789

# Unfrozen transformer weights

In [20]:
# The classification head is not part of the checkpoint and is initialised
# randomly. Seed Python, NumPy and TensorFlow first so repeated runs start
# from the same weights (GPU kernels may still add some nondeterminism).
tf.keras.utils.set_random_seed(42)
model_unfrozen = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Keep the BERT encoder trainable so the whole network is fine-tuned.
model_unfrozen.bert.trainable = True # True is already the default; set explicitly for clarity
# The summary should report every parameter as trainable.
model_unfrozen.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 109,486,085
Trainable params: 109,486,085
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Integer class labels + raw logits from the model -> sparse categorical
# cross-entropy with from_logits=True; accuracy is tracked during training.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

# A small learning rate is used because all BERT weights are being updated.
model_unfrozen.compile(
    loss=loss,
    metrics=metrics,
    optimizer=Adam(learning_rate=1e-5),
)

In [23]:
# Fine-tune the whole network: 10 passes over the training split in
# mini-batches of 4 examples.
model_unfrozen.fit(
    x=[train_input_ids, train_att_mask],
    y=y_train,
    batch_size=4,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa0e121fa60>

Since TFBertForSequenceClassification returns logits, we must first convert them to probabilities using softmax.

In [24]:
# Score the held-out split. The model returns raw logits, so a softmax over
# the class axis turns them into probabilities; the most probable class is
# the predicted label.
output = model_unfrozen.predict([test_input_ids, test_att_mask])
softmax = tf.keras.layers.Softmax(axis=-1)
preds = softmax(output.logits)
pred_labels = np.argmax(preds.numpy(), axis=1)



In [25]:
print('Classification Report')
# zero_division=0 reports 0.0 for any class that is never predicted, without
# emitting an UndefinedMetricWarning. The reported numbers are the same as
# with the default setting.
print(classification_report(y_test, pred_labels, zero_division=0))

Classification Report
              precision    recall  f1-score   support

           0       0.62      0.50      0.55        36
           1       0.29      0.33      0.31        33
           2       0.43      0.62      0.51        56
           3       0.53      0.32      0.40        25
           4       1.00      0.13      0.24        15

    accuracy                           0.45       165
   macro avg       0.58      0.38      0.40       165
weighted avg       0.51      0.45      0.44       165



In [26]:
# Macro-averaged one-vs-rest ROC AUC computed from the class probabilities.
roc_auc_score(
    y_test,
    preds,
    average='macro',
    multi_class='ovr',
)

0.6940348031467958