# Pipeline to Predict the Answer Category for DBpedia Questions
1. Upload saved model
2. Read test data
3. Load the fine-tuned RoBERTa category classifier for DBpedia
4. Predict category
5. Write the predictions to a tab-separated (TSV) file

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import transformers
from transformers import TFRobertaForSequenceClassification,RobertaTokenizer

In [None]:
# Fetch the SMART 2022 DBpedia answer-type test split (JSON hosted on GitHub)
# and parse it into a DataFrame.
url = 'https://raw.githubusercontent.com/smart-task/smart-2022-datasets/main/AT_answer_type_prediction/dbpedia/SMART2022-AT-dbpedia-test.json'
test = pd.read_json(path_or_buf=url)

In [None]:
# Class index -> label, in the order the classifier head was built with.
# A "literal-<x>" label carries both the category ("literal") and the type (<x>).
_CATEGORY_LABELS = ("resource", "literal-number", "literal-string", "literal-date", "boolean")


def _labels_to_category_and_type(class_ids):
    """Turn predicted class indices into parallel ``category`` / ``type`` lists.

    Labels without a hyphen ("resource", "boolean") get ``None`` as their type.
    """
    categories = []
    types = []
    for class_id in class_ids:
        head, _, tail = _CATEGORY_LABELS[class_id].partition("-")
        categories.append(head)
        types.append(tail or None)
    return categories, types


def classifyQuestion(test, weights_path='../input/trainmodel/dbpedia_category_bert.h5', max_length=64):
    """Predict the answer category (and literal type) for every question in ``test``.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain a ``question`` column; values are cast to ``str``.
    weights_path : str, optional
        Path of the saved fine-tuned weights loaded into the RoBERTa classifier.
    max_length : int, optional
        Maximum number of tokens kept per question; longer questions are
        truncated.

    Returns
    -------
    pandas.DataFrame
        The *same* ``test`` object (it is modified in place) with two columns
        added or overwritten: ``category`` ("resource", "literal" or "boolean")
        and ``type`` ("number", "string", "date", or ``None`` for non-literals).
    """
    questions = test['question'].astype(str).tolist()

    if questions:
        # Build the architecture, then load the fine-tuned weights into it.
        classifier = TFRobertaForSequenceClassification.from_pretrained(
            'roberta-base', num_labels=len(_CATEGORY_LABELS)
        )
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        classifier.load_weights(weights_path)

        # Tokenize, add special tokens and pad to the longest question in the
        # batch. truncation=True is needed for max_length to actually cap the
        # sequence length; with padding=True alone it is not applied.
        encoded = tokenizer(
            questions,
            add_special_tokens=True,
            max_length=max_length,
            padding=True,
            truncation=True,
            return_attention_mask=True,
        )

        # Keep an explicit (n_questions, seq_len) shape. Squeezing a wrapped
        # list without an axis would also drop the batch axis for one question.
        input_ids = np.asarray(encoded['input_ids'])
        attention_mask = np.asarray(encoded['attention_mask'])

        prediction = classifier.predict(
            {'input_ids': input_ids, 'attention_mask': attention_mask}
        )
        # argmax over the logits yields the predicted class index per question.
        class_ids = np.argmax(prediction.logits, axis=-1)
    else:
        # Nothing to classify: skip loading the model and emit empty columns.
        class_ids = []

    # Derive category/type per label directly; splitting the column with
    # str.split(expand=True) yields no second column when no "literal-*"
    # label was predicted.
    categories, types = _labels_to_category_and_type(class_ids)

    test['category'] = categories
    test['type'] = types
    return test

In [None]:
# Classify every test question, then persist the frame as a tab-separated file
# without the index column.
# NOTE(review): the file name says "train" although these are test-set
# predictions — confirm with downstream consumers before renaming.
result = classifyQuestion(test)
result.to_csv(path_or_buf='smart22-dbpedia_train_atype.tsv', sep='\t', index=False)