In [35]:
### IMPORTS
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import asyncio

!pip install mpyc
from mpyc.runtime import mpc

In [None]:
### TRAIN AND SAVE MODEL
# Fine-tunes a pretrained "bert-base-uncased" sequence classifier on the training CSV,
# evaluates it on the test CSV, prints the test loss/accuracy, and saves the model
# to the "best_model" directory.
# Both CSV files are read with ';' as delimiter and must provide 'text' and 'label' columns.
train_csv_path = "./data/training_set.csv"
test_csv_path = "./data/test_set.csv"

train_df = pd.read_csv(train_csv_path, delimiter=';')
test_df = pd.read_csv(test_csv_path, delimiter=';')

# NOTE(review): num_labels=5 is hard-coded, while the label encoder is only fitted further
# down. This assumes train_df['label'] holds exactly 5 distinct classes — TODO confirm.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

# Tokenize the raw text; inputs longer than max_length tokens are truncated.
# Train and test are tokenized in two separate calls.
max_length = 128
train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=max_length, return_tensors='tf')

# Convert the three tokenizer outputs to NumPy arrays before passing them to fit/evaluate.
train_input_ids = np.array(train_encodings['input_ids'])
train_token_type_ids = np.array(train_encodings['token_type_ids'])
train_attention_mask = np.array(train_encodings['attention_mask'])

test_input_ids = np.array(test_encodings['input_ids'])
test_token_type_ids = np.array(test_encodings['token_type_ids'])
test_attention_mask = np.array(test_encodings['attention_mask'])

# Map the 'label' column to integer class ids. The encoder is fitted on the training
# labels only and then reused (transform, not fit) for the test labels.
label_encoder = LabelEncoder()
label_encodings_train = label_encoder.fit_transform(train_df['label'])
label_encodings_test = label_encoder.transform(test_df['label'])

# The loss is configured with from_logits=True, i.e. it is applied to the model's raw logits;
# integer labels are paired with the sparse categorical loss and accuracy metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Fine-tune for 3 epochs with batches of 16 examples.
history = model.fit(
    x={'input_ids': train_input_ids, 'token_type_ids': train_token_type_ids, 'attention_mask': train_attention_mask},
    y=label_encodings_train,
    epochs=3,
    batch_size=16
)

# Evaluate on the test split; returns the loss followed by the compiled accuracy metric.
test_loss, test_accuracy = model.evaluate(
    x={'input_ids': test_input_ids, 'token_type_ids': test_token_type_ids, 'attention_mask': test_attention_mask},
    y=label_encodings_test,
    batch_size=16
)

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.2%}')

# Persist the fine-tuned model; new_predictions() reloads it from "best_model".
# Only the model is saved here — the tokenizer is not.
model.save_pretrained("best_model")

In [None]:
# MPC version of Character Counting Example
async def mainmpc(file_path):
    """Count the characters of a text file using the MPyC runtime and print the total.

    The file is read in full, every character's code point is wrapped in a secure
    integer, and the number of characters is passed through ``mpc.input`` and
    revealed with ``mpc.output`` before being printed.

    The runtime is always shut down once it has been started, even when reading
    the file or the computation raises, so a failed run does not leave the MPyC
    runtime in a started state.

    Args:
        file_path: Path of the text file whose characters are counted.

    Raises:
        OSError: If the file cannot be opened or read (e.g. FileNotFoundError).
    """
    # NOTE(review): SecInt(16) presumably yields a 16-bit secure integer type; confirm
    # that character counts and code points of the expected inputs stay within its range.
    secint = mpc.SecInt(16)

    await mpc.start()
    try:
        with open(file_path, 'r') as file:
            text = file.read()

        # One secure integer per character; only the list's length is used below.
        ascii_values = [secint(ord(char)) for char in text]

        count = mpc.input(secint(len(ascii_values)))

        print('Number of Characters:', await mpc.output(count))
    finally:
        # Pair every successful start() with a shutdown(), including on errors.
        await mpc.shutdown()

# Regular version of Character Counting Example
def mainreg(file_path):
    """Count the characters of a text file in the clear and print the total.

    Args:
        file_path: Path of the text file to read in full.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()

    # Translate each character to its code point, then count the entries.
    code_points = list(map(ord, contents))
    total = len(code_points)

    print('Number of Characters:', total)

# Make prediction based on tuned BERT model from above
def new_predictions(file_path, k):
    """Predict a class index for each of the first ``k`` data rows of a CSV file.

    Loads the fine-tuned classifier from "best_model" and the "bert-base-uncased"
    tokenizer, reads at most ``k`` rows of the ';'-delimited file, tokenizes the
    'text' column (truncated to 128 tokens) and takes the argmax over the logits.

    Label indices, per the original author's note: 0 = Health, 1 = Other,
    2 = Politics, 3 = Religion, 4 = Sexuality. The order is presumably alphabetical
    because the labels were encoded with sklearn's LabelEncoder at training time —
    confirm against ``label_encoder.classes_``.

    Args:
        file_path: Path of the ';'-delimited CSV file containing a 'text' column.
        k: Maximum number of data rows to read (passed to pandas as ``nrows``).

    Returns:
        The result of ``np.argmax`` over the model logits along axis 1, i.e. one
        predicted class index per row that was read.
    """
    max_length = 128

    classifier = TFBertForSequenceClassification.from_pretrained("best_model")
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    sample_df = pd.read_csv(file_path, delimiter=';', nrows=k)

    encoded = bert_tokenizer(
        sample_df['text'].tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='tf',
    )

    # Same three model inputs as during training, converted to NumPy arrays.
    model_inputs = {
        field: np.array(encoded[field])
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }

    outputs = classifier.predict(x=model_inputs)

    return np.argmax(outputs.logits, axis=1)

# Read the first k lines of a CSV file and return
def read_first_x_lines(file_path, k):
    """Return up to ``k`` leading lines of a file, each stripped of surrounding whitespace.

    Reading stops at end of file, so the result holds fewer than ``k`` entries when
    the file is shorter than ``k`` lines (and is empty for an empty file). Blank
    lines inside the file are kept as empty strings.

    Args:
        file_path: Path of the file to read.
        k: Maximum number of lines to return.

    Returns:
        A list of at most ``k`` stripped lines, in file order.
    """
    first_k_lines = []

    with open(file_path, 'r') as file:
        for _ in range(k):
            line = file.readline()
            if not line:
                # readline() returns '' only at end of file (a blank line is '\n'),
                # so stop instead of padding the result with empty strings.
                break
            first_k_lines.append(line.strip())

    return first_k_lines

# Get first k lines, make predictions on them to determine if MPC is needed or not
def main(file_path, threshold, k):
    """Sample the start of a file and count its characters with or without MPC.

    The first ``k`` rows are classified with ``new_predictions``. If strictly more
    than ``threshold`` percent of them are predicted as sensitive (any label other
    than index 1, "Other"), the count is done via ``mainmpc``; otherwise via
    ``mainreg``. If the file has fewer than ``k`` lines, a message is printed and
    nothing else happens.

    NOTE(review): the length check counts raw lines (including a CSV header line,
    if any), while ``new_predictions`` reads ``k`` data rows through pandas —
    confirm whether the check should require ``k + 1`` lines.

    Args:
        file_path: Path of the ';'-delimited CSV file to analyse.
        threshold: Percentage (0-100) of sensitive rows above which MPC is used.
        k: Number of leading lines/rows to sample.
    """
    first_k_list = read_first_x_lines(file_path, k)

    if len(first_k_list) < k:
        print(f"File is empty or contains fewer than {k} lines. Try different value for k.")
        return

    preds = new_predictions(file_path, k)

    # Label index 1 is the non-sensitive "Other" class; everything else is sensitive.
    other_label = 1
    count_sens = sum(1 for label in preds if label != other_label)

    # if there is more than threshold% of sensitive info in the first k lines sampled, use MPC
    # Compared as count * 100 > threshold * total: exact, no truncation to 0/1 and
    # no division by zero when there are no predictions.
    if count_sens * 100 > threshold * len(preds):
        print("=== MPC ===")
        mpc.run(mainmpc(file_path))
    else:
        print("=== REG ===")
        mainreg(file_path)

# Demo runs with a 50% sensitivity threshold over the first 10 lines of each file:
# newtest.csv should use MPC, newtest2.csv should use Regular.
for sample_path in ("./data/newtest.csv", "./data/newtest2.csv"):
    main(sample_path, 50, 10)