# Installing/importing Libraries

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import ast
from transformers import AutoTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, classification_report

# Converting Class labels to Numerical Value

In [None]:
def extractList(df):
    """Turn the stringified post lists in ``df["Post"]`` into plain text, in place.

    Each ``Post`` value is expected to be the string form of a Python list of
    strings (e.g. ``"['first post', 'second post']"``). Values that do not end
    with ``]`` are treated as cut off mid-string and are closed with ``']``
    before parsing. Every parsed list is then joined with single spaces.

    Args:
        df: DataFrame with a string column named ``Post``. Modified in place.

    Returns:
        None. The result is written back to ``df["Post"]``.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when a
            value is still not a valid Python literal after the repair.
    """
    # Rows whose serialized list is missing its closing bracket. Missing values
    # are counted as "closed" so that they are left untouched by the repair.
    truncated = ~df["Post"].str.endswith("]", na=True)
    # Close the open string and the list. This assumes the last element was
    # single-quoted, which is how Python renders a list of plain strings.
    df.loc[truncated, "Post"] = df.loc[truncated, "Post"] + "']"
    # Parse the literal safely (no eval), then merge the posts into one text.
    df["Post"] = df["Post"].apply(ast.literal_eval).apply(" ".join)

In [None]:
# Ordinal encoding of the class labels: the position in the tuple (lowest to
# highest) becomes the integer used as the training target.
label_conversion = {
    label: rank
    for rank, label in enumerate(
        ("Supportive", "Indicator", "Ideation", "Behavior", "Attempt")
    )
}

# Downloading Data From GitHub and Loading

In [None]:
!git clone https://github.com/hrmoradi/Workshop_data # goes to your content folder

In [None]:
# Read the posts from the repository cloned in the previous cell
dat = pd.read_csv(
    "/content/Workshop_data/500Reddit.txt",
)
# Show the raw frame (pandas abbreviates long frames automatically)
dat

In [None]:
# Rename labels to integers in order of intensity.
# `map` plus an explicit cast guarantees an integer column. `DataFrame.replace`
# relied on pandas silently downcasting the object column, which is deprecated
# in recent pandas releases, and it let unknown labels through unchanged.
# NOTE: this cell is not re-runnable on its own; re-run the loading cell first.
unknown_labels = set(dat["Label"].unique()) - set(label_conversion)
if unknown_labels:
    raise ValueError(
        f"Labels missing from label_conversion: {sorted(unknown_labels, key=str)}"
    )
dat = dat.assign(Label=dat["Label"].map(label_conversion).astype("int64"))
# Extract list of posts from string of list and concatenate together
extractList(dat)
dat

In [None]:
# Size the classifier head from the label mapping rather than from the labels
# that happen to be present in the data: the integer targets index the output
# logits, so every index in `label_conversion` needs an output unit even if a
# class is absent from this particular dataset.
num_labels = len(label_conversion)
num_labels

In [None]:
# Hold out 33% of the posts for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dat["Post"],
    dat["Label"],
    test_size=0.33,
    random_state=42,
    shuffle=True,
    stratify=dat["Label"],
)

# Text to Vector Representation

In [None]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

In [None]:
# Tokenize both splits with identical settings: every post is truncated or
# padded to exactly 512 tokens so the results can be stacked into arrays.
X_train_input, X_test_input = (
    tokenizer(
        posts.values.tolist(),
        max_length=512,
        truncation="longest_first",
        padding="max_length",
    )
    for posts in (X_train, X_test)
)

In [None]:
# Pull the token ids and attention masks out of the tokenizer output as arrays
train_input_ids, train_att_mask = (
    np.asarray(X_train_input[field]) for field in ("input_ids", "attention_mask")
)
test_input_ids, test_att_mask = (
    np.asarray(X_test_input[field]) for field in ("input_ids", "attention_mask")
)

# Targets as plain arrays as well
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

# Freezing transformer weights

In [None]:
# Pretrained BERT with a classification head sized to our number of classes
model_frozen = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

In [None]:
# Freeze the BERT encoder so that only the layers outside `model_frozen.bert`
# are updated during training.
model_frozen.bert.trainable = False
# Print the layer overview of the model
model_frozen.summary()

In [None]:
# Targets are integer class ids and the model emits raw logits, hence the sparse
# categorical cross-entropy with from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")

model_frozen.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=loss,
    metrics=metrics,
)

In [None]:
# Train on the token ids and attention masks of the training split
model_frozen.fit(
    x=[train_input_ids, train_att_mask],
    y=y_train,
    batch_size=4,
    epochs=15,
)

Since `TFBertForSequenceClassification` returns logits, we must first convert them to probabilities using softmax.

In [None]:
# Forward pass on the held-out posts; the model returns raw logits
output = model_frozen.predict([test_input_ids, test_att_mask])
# Softmax over the class axis turns the logits into per-class probabilities
softmax = tf.keras.layers.Softmax(axis=-1)
preds = softmax(output.logits)
# Predicted class = index of the most probable class in each row
pred_labels = np.argmax(preds.numpy(), axis=1)

In [None]:
# Per-class precision / recall / F1 on the test split
print(
    'Classification Report',
    classification_report(y_test, pred_labels),
    sep='\n',
)

In [None]:
# Macro-averaged one-vs-rest ROC AUC computed from the class probabilities
roc_auc_score(
    y_test,
    preds,
    average='macro',
    multi_class='ovr',
)

# Unfreeze transformer weights

In [None]:
# Fresh copy of the pretrained model for the fully fine-tuned run
model_unfrozen = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

In [None]:
# Keep the BERT encoder trainable so that its weights are fine-tuned as well
model_unfrozen.bert.trainable = True # already the default; set explicitly to mirror the frozen run
# Print the layer overview of the model
model_unfrozen.summary()

In [None]:
# Same loss, metric and optimizer settings as the frozen run, so the two runs
# differ only in which weights are trainable.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")

model_unfrozen.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=loss,
    metrics=metrics,
)

In [None]:
# Fine-tune the whole network on the training split
model_unfrozen.fit(
    x=[train_input_ids, train_att_mask],
    y=y_train,
    batch_size=4,
    epochs=10,
)

Since `TFBertForSequenceClassification` returns logits, we must first convert them to probabilities using softmax.

In [None]:
# Forward pass on the held-out posts; the model returns raw logits
output = model_unfrozen.predict([test_input_ids, test_att_mask])
# Softmax over the class axis turns the logits into per-class probabilities
softmax = tf.keras.layers.Softmax(axis=-1)
preds = softmax(output.logits)
# Predicted class = index of the most probable class in each row
pred_labels = np.argmax(preds.numpy(), axis=1)

In [None]:
# Per-class precision / recall / F1 on the test split
print(
    'Classification Report',
    classification_report(y_test, pred_labels),
    sep='\n',
)

In [None]:
# Macro-averaged one-vs-rest ROC AUC computed from the class probabilities
roc_auc_score(
    y_test,
    preds,
    average='macro',
    multi_class='ovr',
)

# Create your own traditional model

In [None]:
# Both modules are already imported at the top of the notebook; the imports are
# repeated here so that this section can also be run on its own.
import tensorflow as tf
import pandas as pd

# Sizes shared by the data pipeline and the model. Keeping them in one place
# stops the vocabulary size and the Embedding input dimension drifting apart.
VOCAB_SIZE = 10000  # keep only the 10k most frequent tokens
MAX_LEN = 512       # every sequence is padded/truncated to this many tokens
EMBED_DIM = 100     # size of each learned token embedding

# --- Option A (active): IMDB dataset -----------------------------------------
# NOTE: these lines rebind X_train / y_train / X_test / y_test and therefore
# replace the Reddit split created earlier in the notebook. Re-run the split and
# tokenization cells before going back to the BERT sections.
# Comment out the 3 lines below to use Option B instead.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=VOCAB_SIZE)
x_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_LEN)
x_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_LEN)

# Input for variable-length sequences of integers
inputs = tf.keras.Input(shape=(None,), dtype="int32")

# --- Option B: the suicidal dataset (Reddit posts loaded above) ---------------
# Uncomment the vectorization below (and comment out Option A).
# NOTE: the classifier head further down is binary (1 sigmoid unit), while the
# Reddit labels have 5 classes, so the output layer and the loss have to be
# adapted as well when switching to this option.
# vectorize_layer = tf.keras.layers.TextVectorization(
#     standardize='lower_and_strip_punctuation',
#     split='whitespace',
#     ngrams=None,
#     max_tokens=VOCAB_SIZE,
#     output_mode='int',
#     output_sequence_length=MAX_LEN,
#     pad_to_max_tokens=True)
# vectorize_layer.adapt(X_train)
# x_train = vectorize_layer(X_train)
# x_test = vectorize_layer(X_test)

# Embed each integer token id in an EMBED_DIM-dimensional vector
x = tf.keras.layers.Embedding(VOCAB_SIZE, EMBED_DIM)(inputs)

# Add 2 bidirectional LSTMs; the first returns the full sequence so that the
# second one still receives one vector per time step.
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
# Add a binary classifier: a single sigmoid unit
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)

model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
model.fit(x_train, y_train, batch_size=64, epochs=1) # , validation_data=(x_test, y_test)

# Names of the values returned by evaluate(), followed by the test-set scores
print(model.metrics_names)
model.evaluate(x_test, y_test)