In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neural_network import MLPClassifier
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
import random

# Seed every random number generator this notebook relies on, so that a
# restart-and-run-all repeats the same run.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)
# torch was previously left unseeded although it is the library doing the
# feature extraction; seed it alongside numpy and random.
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
# !pip install datasets

In [3]:
def tokenize_data(data, MAX_LEN = 80):
    """Encode the ``UserStory`` entry of ``data`` to a fixed length.

    Parameters
    ----------
    data : mapping
        Must provide a ``"UserStory"`` entry holding the text(s) to encode.
    MAX_LEN : int, default 80
        Length every sequence is padded or truncated to.

    Returns
    -------
    The result of calling the notebook-level ``tokenizer`` with
    max-length padding, truncation and PyTorch tensors requested.
    """
    # The tokenizer is called directly rather than through encode_plus: both
    # give the same result, but encode_plus does not work with batched=True
    # in the dataset map function.
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
        "return_tensors": 'pt',
    }
    stories = data["UserStory"]
    return tokenizer(stories, **encode_options)


In [4]:
def extract_hidden_states(batch):
    """Run the pretrained encoder on ``batch`` and keep one vector per sequence.

    Only the entries of ``batch`` whose key is listed in
    ``tokenizer.model_input_names`` are passed to ``pre_model``; each of them
    is moved to ``device`` first.

    Returns
    -------
    dict
        A single key, ``'hidden_state'``, holding position 0 of the model's
        ``last_hidden_state`` for every sequence, as a NumPy array on the CPU.
    """
    accepted_keys = tokenizer.model_input_names
    model_inputs = {}
    for key, tensor in batch.items():
        if key in accepted_keys:
            model_inputs[key] = tensor.to(device)

    # Inference only, so no gradients are tracked during the forward pass.
    with torch.no_grad():
        encoder_output = pre_model(**model_inputs)

    # Position 0 of every sequence is the vector for the [CLS] token.
    cls_vectors = encoder_output.last_hidden_state[:, 0]
    return {'hidden_state': cls_vectors.cpu().numpy()}

In [5]:
def evaluation(labels, preds, target_names = ['Capability', 'Hard-goal', 'Soft-goal', 'Task']):
    """Flatten a classification report into a single-level metrics dict.

    Parameters
    ----------
    labels : true class labels.
    preds : predicted class labels.
    target_names : list of str
        Display names of the classes; the first four are reported, in order,
        under the key prefixes ``Cap``, ``HG``, ``SG`` and ``T``.

    Returns
    -------
    dict
        ``'Accuracy'`` followed by ``<prefix>P`` (precision), ``<prefix>R``
        (recall) and ``<prefix>F1`` (f1-score) for each of the four classes.
    """
    report = classification_report(labels, preds, target_names=target_names, zero_division=0, output_dict=True)

    summary = {'Accuracy': report['accuracy']}

    # Output key prefix for the class at the same position in target_names.
    prefixes = ('Cap', 'HG', 'SG', 'T')
    # Output key suffix paired with the metric name used in the report.
    suffixes = (('P', 'precision'), ('R', 'recall'), ('F1', 'f1-score'))

    for position, prefix in enumerate(prefixes):
        class_metrics = report[target_names[position]]
        for suffix, metric in suffixes:
            summary[prefix + suffix] = class_metrics[metric]
    return summary

In [6]:
# Checkpoint name shared by the tokenizer and the encoder loaded below.
model_check_point = 'bert-base-uncased'
# Alternative checkpoint:
# model_check_point = 'roberta-base'

# Tokenizer and encoder are loaded from the same checkpoint; the encoder is
# then moved to the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_check_point)
pre_model = AutoModel.from_pretrained(model_check_point)
pre_model = pre_model.to(device)

In [7]:
import os  # cell-local import: only needed to create the output folder before saving

data_dir = "../data/us/newDataset/separate_5_folds_2/"
results_path = 'results/pipeline2_classical_ml_feature_bert.xlsx'

# Classifiers compared on the encoder features. Every estimator that accepts
# a random_state is given one.
svm_linear = svm.SVC(C=10, class_weight='balanced', kernel='linear', random_state=0)
svm_poly = svm.SVC(C=100, class_weight='balanced', degree=5, kernel='poly', random_state=0)
svm_rbf = svm.SVC(C=100, class_weight='balanced', kernel='rbf', random_state=0)
svm_sigmoid = svm.SVC(C=100, class_weight='balanced', kernel='sigmoid', random_state=0)
rf = RandomForestClassifier(class_weight='balanced', max_depth=5, criterion = 'gini', random_state=0)
nb_Ber = naive_bayes.BernoulliNB(alpha=1, binarize=0)
nb_Gau = naive_bayes.GaussianNB(var_smoothing=0.8)
lr = LogisticRegression(C=1, class_weight='balanced', solver='liblinear', random_state=0)
mlp = MLPClassifier(activation='relu', alpha=0.0001, early_stopping=True, learning_rate='constant', hidden_layer_sizes=100, random_state=0, solver='lbfgs')


models = [nb_Gau, nb_Ber, lr, svm_linear, svm_rbf,svm_poly, svm_sigmoid, rf, mlp]
# models = [mlp]


# Step 1: build the feature matrices once per fold.
# Loading, tokenising and encoding a fold does not depend on the classifier,
# so it is done here a single time per fold instead of once per
# (classifier, fold) pair.
fold_features = {}
for iteratorDataset in range(1,6):
    print('Fold: ', str(iteratorDataset))

    dataset = load_dataset('csv', data_files={'train': data_dir + 'train_' + str(iteratorDataset) + '.csv', 'test': data_dir + 'test_' + str(iteratorDataset) + '.csv'}, encoding = "utf-8")
    dataset = dataset.class_encode_column("Label")
    dataset = dataset.map(tokenize_data, batched=True)

    dataset = dataset.remove_columns(['Unnamed: 0', 'UserStory'])
    dataset = dataset.rename_column("Label", "labels")
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

    dataset = dataset.map(extract_hidden_states, batched=True)

    # Stored as (X_train, X_test, y_train, y_test).
    fold_features[iteratorDataset] = (
        np.array(dataset['train']['hidden_state']),
        np.array(dataset['test']['hidden_state']),
        np.array(dataset['train']['labels']),
        np.array(dataset['test']['labels']),
    )


# Step 2: fit and score every classifier on every fold.
model_frames = []
for model in models:
    results = []
    for iteratorDataset in range(1,6):
        X_train, X_test, y_train, y_test = fold_features[iteratorDataset]

        # Only the MLP gets standardised inputs. The scaler is fitted on the
        # training split alone and then applied to both splits; transform()
        # returns new arrays, so the cached fold features stay untouched.
        if isinstance(model, MLPClassifier):
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results.append(evaluation(y_test, y_pred))

    # The four SVC variants share a class name, so the kernel is appended to
    # tell them apart in the results table.
    model_label = type(model).__name__
    if model_label == "SVC":
        model_label = model_label + model.kernel
    name = pd.DataFrame({'modelName': [model_label]})

    # One column per classifier: the 'modelName' row on top of the metrics
    # averaged over the folds.
    model_results_df = pd.concat([name.T, pd.DataFrame(results).mean()])
    model_frames.append(model_results_df)

# Assemble the table in one concat instead of growing it inside the loop.
# Every per-model frame has the single column label 0, so the classifier is
# identified by the 'modelName' row rather than by the column header.
results_df = pd.concat(model_frames, axis=1) if model_frames else pd.DataFrame()
print(results_df)

print('Save to File!')
# Create the output folder if needed so the save cannot fail on a missing
# directory after all the computation above has finished.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_excel(results_path)
print('Finished')

Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5


Map: 100%|███████████████████████████████████████████████████████████████████| 792/792 [00:02<00:00, 369.48 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████| 199/199 [00:00<00:00, 272.60 examples/s]


Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
                    0            0                   0          0         0  \
modelName  GaussianNB  BernoulliNB  LogisticRegression  SVClinear    SVCrbf   
Accuracy     0.592965     0.607035            0.732663   0.725628  0.748744   
CapP         0.774874     0.826701            0.833211   0.804564  0.815726   
CapR         0.740146     0.680292            0.826277   0.845255  0.862774   
CapF1        0.756766     0.745396            0.829328   0.824219  0.838347   
HGP           0.05146     0.083975            0.335714    0.36381  0.416667   
HGR             0.175        0.225                 0.3      0.275     0.225   
HGF1    