# Building and evaluating transformer models and their optimal hyperparameters
#### Evaluate each candidate model, then pick the one I want to use in my app

In [1]:
%%capture
pip install pandas transformers datasets torch transformers[torch]

In [None]:
import os
from pathlib import Path

import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer, set_seed


In [None]:
# Directory holding the structured train/test CSVs. Set the
# FAKE_PROFILE_DATA_DIR environment variable to point somewhere else; the
# default is the location this notebook originally read from.
DATA_DIR = Path(
    os.environ.get(
        "FAKE_PROFILE_DATA_DIR",
        "/Users/hannahdestiny/Desktop/FakeProfileDetection/dev/csv",
    )
)

training_df = pd.read_csv(DATA_DIR / "structured_train.csv")
testing_df = pd.read_csv(DATA_DIR / "structured_test.csv")

# Show full cell contents instead of truncating long text columns.
pd.set_option('display.max_colwidth', None)
training_df.head(50)

In [None]:
from CleanDatasets import CleanDatasets
from datasets import Dataset

# Cleaning settings applied identically to the train and test frames.
columns_to_rename_dict = {"screen_name": "username", "account.type": "is_human"}
columns_to_drop_list = ["class_type"]
# NOTE(review): this is the column's pre-rename name ("account.type" becomes
# "is_human" in the rename map above) - presumably CleanDatasets binarises
# before renaming; confirm against the CleanDatasets implementation.
column_to_binary = "account.type"

# Positional arguments shared by both cleaners, in CleanDatasets' call order.
shared_cleaning_args = (columns_to_rename_dict, columns_to_drop_list, column_to_binary)

# Clean each split with its own cleaner instance.
training_cleaner = CleanDatasets(training_df, *shared_cleaning_args)
clean_training_df = training_cleaner.clean_df()
testing_cleaner = CleanDatasets(testing_df, *shared_cleaning_args)
clean_testing_df = testing_cleaner.clean_df()

# Wrap the cleaned frames as Hugging Face datasets for tokenization/training.
train_dataset = Dataset.from_pandas(clean_training_df)
test_dataset = Dataset.from_pandas(clean_testing_df)

# Display the training dataset summary as the cell output.
train_dataset


In [16]:
# Model configuration: edit these four values to try a different architecture.

# Sequence-classification model: the class and the checkpoint id it is loaded from.
MODEL, MODEL_NAME = RobertaForSequenceClassification, "FacebookAI/roberta-base"

# Matching tokenizer: the class and the checkpoint id it is loaded from.
TOKENIZER, TOKENIZER_MODEL_NAME = RobertaTokenizer, "roberta-base"



In [None]:
# Load the tokenizer that turns each 'text' entry into token ids plus an
# attention mask so the model can consume it (the label column comes from the
# dataset itself, not from the tokenizer).
# Use the TOKENIZER class from the config cell - mirroring how the model is
# built from MODEL - so switching architectures only means editing that cell.
tokenizer = TOKENIZER.from_pretrained(TOKENIZER_MODEL_NAME)

# Tokenize the texts
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry; that entry is handed
            unchanged to the module-level ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Run the tokenizer over both splits, one batch of examples per call.
train_tokenized = train_dataset.map(function=tokenize_function, batched=True)
test_tokenized = test_dataset.map(function=tokenize_function, batched=True)




In [20]:
def _prepare_for_trainer(dataset):
    """Rename the label column to ``labels`` and drop the raw text columns.

    Safe to run more than once: a dataset that has already been prepared is
    returned unchanged instead of raising because ``is_human`` no longer
    exists, so re-executing this cell does not break the notebook.

    Args:
        dataset: Tokenized dataset exposing ``column_names``,
            ``rename_column`` and ``remove_columns``.

    Returns:
        The dataset with ``is_human`` renamed to ``labels`` and the
        ``username`` / ``text`` columns removed.

    Raises:
        KeyError: If the dataset has neither an ``is_human`` nor a ``labels``
            column, i.e. there is nothing to train against.
    """
    if "is_human" in dataset.column_names:
        dataset = dataset.rename_column("is_human", "labels")
    elif "labels" not in dataset.column_names:
        raise KeyError("expected an 'is_human' or 'labels' column in the dataset")

    # Only drop the columns that are still present (none on a re-run).
    leftover = [name for name in ("username", "text") if name in dataset.column_names]
    return dataset.remove_columns(leftover) if leftover else dataset


train_tokenized = _prepare_for_trainer(train_tokenized)
test_tokenized = _prepare_for_trainer(test_tokenized)


In [None]:

# Sanity check: show which fields the first tokenized training example carries.
train_tokenized[0].keys()

In [None]:
pip show torch

In [None]:
# Configure the trainer.
#
# Loading the checkpoint prints the warning below. It is expected here, because
# this notebook fine-tunes the model right after loading it:
#   "Some weights of RobertaForSequenceClassification were not initialized from
#   the model checkpoint at FacebookAI/roberta-base and are newly initialized:
#   ['classifier.dense.bias', 'classifier.dense.weight',
#   'classifier.out_proj.bias', 'classifier.out_proj.weight']
#   You should probably TRAIN this model on a down-stream task to be able to
#   use it for predictions and inference."
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluate on the eval dataset once per epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Seed the RNGs *before* building the model: the classifier head is newly
# (randomly) initialised, and any seeding the Trainer does itself can only
# happen after its `model` argument has already been constructed. Reusing
# `training_args.seed` keeps a single source of truth for the seed.
set_seed(training_args.seed)
model = MODEL.from_pretrained(MODEL_NAME)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
)

In [None]:
# Start fine-tuning with the configured trainer; the returned training summary
# is displayed as the cell output.
trainer.train()