In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neural_network import MLPClassifier
from lib.preprocessingtext import *
# from lib.evaluation import * 
from lib.findSimilaritiesSent import *
from lib.io import *
from lib.bert import *
# from lib.training import *
from sklearn.preprocessing import StandardScaler
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so any numerical or convergence
# warning emitted by later cells is hidden as well - consider narrowing it.
warnings.filterwarnings('ignore')

import numpy as np
import random

# Seed the global NumPy RNG and the stdlib `random` module so that code relying
# on either global generator is repeatable across Restart & Run All.
np.random.seed(0)
random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def convertClassToNum(y, class_names=None):
    """Map each class label in ``y`` to its integer position in the class list.

    Parameters
    ----------
    y : iterable
        Class labels to encode (for example a pandas Series of strings).
    class_names : sequence, optional
        Ordered list of known classes. When omitted, the notebook-level
        ``target_names`` list is used, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    list of int
        For every element of ``y``, the index of its first occurrence in the
        class list.

    Raises
    ------
    ValueError
        If a label in ``y`` is not one of the known classes.
    """
    if class_names is None:
        # Resolved at call time, so the global only has to exist before the
        # function is *called*, not before it is defined.
        class_names = target_names

    # Build the lookup once instead of scanning the list for every label.
    # setdefault keeps the first position of a duplicated name, like list.index.
    position_of = {}
    for position, class_name in enumerate(class_names):
        position_of.setdefault(class_name, position)

    encoded = []
    for label in y:
        if label not in position_of:
            raise ValueError(
                f"{label!r} is not in list of known classes {list(class_names)!r}"
            )
        encoded.append(position_of[label])
    return encoded
    

In [3]:
def convertSentToTFVec(fullX):
    """Fit a TF-IDF vectorizer on ``fullX`` and return the transformed corpus.

    The vectorizer is created fresh on every call and is not returned, so the
    fitted vocabulary is only reflected in the returned matrix.

    Parameters
    ----------
    fullX : iterable of str
        The documents to vectorize.

    Returns
    -------
    The document-term matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    vectorizer_options = {
        # Use unigrams and bigrams as terms.
        'ngram_range': (1, 2),
        # Drop terms that occur in fewer than 5 documents.
        'min_df': 5,
        # Apply sublinear term-frequency scaling.
        'sublinear_tf': True,
    }
    # No stop_words filter is passed, so stop words stay in the vocabulary.
    vectorizer = TfidfVectorizer(**vectorizer_options)
    return vectorizer.fit_transform(fullX)

In [4]:
# Ordered class list; a label's position here is its integer code.
target_names = ['Capability', 'Hard-goal', 'Soft-goal', 'Task']

# Forward slashes are accepted on Windows and POSIX alike. The previous
# backslash-separated raw string only resolved on Windows.
dataset_path = "../data/us/newDataset/Clean US Combine.xlsx"
labels = pd.read_excel(dataset_path, sheet_name='Sheet1')

# Show the first raw user story and its label as a sanity check.
print(labels['UserStory'][0])
print(labels['Label'][0])

# preProcessing6 comes from lib.preprocessingtext; presumably it returns one
# cleaned sentence per input row - TODO confirm against that module.
labels_lemma = preProcessing6(labels['UserStory'])

# NOTE(review): the TF-IDF vocabulary and IDF weights are fitted on the whole
# corpus here, before any train/test split, so held-out rows influence the
# features used for training. Fit inside each fold if that is not intended.
features = convertSentToTFVec(labels_lemma)
print(len(labels_lemma))
print(labels_lemma[0])

# Class distribution, printed while the labels are still strings.
print(labels.groupby('Label').size())

# convert Class into Number
labels['Label'] = convertClassToNum(labels['Label'])

i can accept / refuse as soon as possible
Capability
991
i can accept / refuse as soon as possible
Label
Capability    684
Hard-goal      40
Soft-goal     177
Task           90
dtype: int64


In [5]:
def evaluation(labels, preds, target_names):
    """Flatten per-class precision, recall and F1 into a single flat dict.

    Parameters
    ----------
    labels : array-like
        Ground-truth class labels.
    preds : array-like
        Predicted class labels.
    target_names : sequence of str
        Display names of the classes; the first four entries are reported
        under the prefixes ``Cap``, ``HG``, ``SG`` and ``T`` respectively.

    Returns
    -------
    dict
        Twelve entries keyed ``<prefix>P``, ``<prefix>R`` and ``<prefix>F1``
        for each of the four prefixes, in that order. Overall accuracy is not
        included.
    """
    report = classification_report(
        labels,
        preds,
        target_names=target_names,
        zero_division=0,
        output_dict=True,
    )

    # Output-key prefix for the class at each position of ``target_names``.
    class_prefixes = ('Cap', 'HG', 'SG', 'T')
    # (output-key suffix, key inside the report's per-class dict).
    metric_fields = (('P', 'precision'), ('R', 'recall'), ('F1', 'f1-score'))

    flattened = {}
    for position, prefix in enumerate(class_prefixes):
        class_scores = report[target_names[position]]
        for suffix, report_key in metric_fields:
            flattened[prefix + suffix] = class_scores[report_key]
    return flattened
    
def average_dicts(dicts):
    """Average the values of a sequence of dicts key by key.

    A key's mean is taken over the dicts that actually contain it, so keys
    missing from some dicts are not treated as zero.

    Parameters
    ----------
    dicts : iterable of dict
        Mappings from key to a numeric value.

    Returns
    -------
    dict
        One entry per key seen, holding the mean of that key's values, in
        first-seen key order. An empty input yields an empty dict.
    """
    totals = {}
    occurrences = {}

    for record in dicts:
        for name in record:
            # Running sum and number of contributing dicts for this key.
            totals[name] = totals.get(name, 0) + record[name]
            occurrences[name] = occurrences.get(name, 0) + 1

    # Turn each running sum into a mean.
    for name, seen in occurrences.items():
        totals[name] /= seen

    return totals

In [6]:
# Create 5 stratified shuffle splits (80 % train / 20 % test each).
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

svm_linear = svm.SVC(C=10, class_weight='balanced', kernel='linear', random_state=0)
svm_poly = svm.SVC(C=1, class_weight='balanced', degree=2, kernel='poly', random_state=0)
svm_rbf = svm.SVC(C=1, class_weight='balanced', kernel='rbf', random_state=0)
svm_sigmoid = svm.SVC(C=1, class_weight='balanced', kernel='sigmoid', random_state=0)
rf = RandomForestClassifier(class_weight='balanced', max_depth=25, criterion = 'entropy', random_state=0)
nb_Ber = naive_bayes.BernoulliNB(alpha=0.1, binarize=0)
# NOTE(review): var_smoothing=0 removes GaussianNB's variance floor. The saved
# output of this cell shows GaussianNB with recall 1.0 for Capability and 0.0
# for every other class, i.e. it predicts a single class - confirm this value
# is intended (warnings are globally silenced, so nothing would be reported).
nb_Gau = naive_bayes.GaussianNB(var_smoothing=0)
lr = LogisticRegression(C=1, class_weight='balanced', solver='newton-cholesky', random_state=0)
mlp = MLPClassifier(activation='tanh', hidden_layer_sizes=25,solver='lbfgs', learning_rate='constant', alpha=0.0001, random_state=0)


models = [nb_Gau, nb_Ber, lr, svm_linear, svm_rbf,svm_poly, svm_sigmoid, rf, mlp]
# models = [mlp]

# Densify the TF-IDF matrix once, not once per model and fold.
# Every model is still fed dense arrays, exactly as before.
features_dense = features.toarray()
y_all = labels['Label']

# With an integer random_state the splitter yields the same folds on every
# call, so they are computed once and shared by all models.
folds = list(sss.split(features, y_all))

# One single-column frame per model; concatenated once after the loop instead
# of growing a DataFrame inside it.
model_columns = []
for model in models:
    results = []

    for train_index, test_index in folds:
        x_vec_train, x_vec_test = features_dense[train_index], features_dense[test_index]
        # The splitter returns *positional* indices, so select with .iloc.
        # Plain [] is label-based and only matches on a default RangeIndex.
        y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

        model.fit(x_vec_train, y_train)
        y_pred = model.predict(x_vec_test)
        results.append(evaluation(y_test, y_pred, target_names=target_names))

    # SVC appears four times with different kernels; append the kernel so the
    # rows of the report can be told apart.
    model_name = type(model).__name__
    if model_name == "SVC":
        model_name += model.kernel
    name = pd.DataFrame({'modelName': [model_name]})

    # First row: model name; remaining rows: metrics averaged over the folds.
    model_results_df = pd.concat([name.T, pd.DataFrame(results).mean()])
    model_columns.append(model_results_df)

# Same layout as before: one column per model, rows = modelName + metrics.
results_df = pd.concat(model_columns, axis=1) if model_columns else pd.DataFrame()
print(results_df)

print('Save to File!')
# The 'results' directory must already exist; to_excel does not create it.
results_df.to_excel('results/pipeline1_classical_ml_tf_idf.xlsx')
print('Finished!')

                    0            0                   0          0         0  \
modelName  GaussianNB  BernoulliNB  LogisticRegression  SVClinear    SVCrbf   
CapP         0.688442     0.882649            0.895849   0.821625  0.839716   
CapR              1.0     0.789781            0.751825   0.832117  0.870073   
CapF1        0.815476     0.833595            0.817488   0.826565  0.854544   
HGP               0.0     0.094848            0.135714   0.080404  0.166667   
HGR               0.0        0.125               0.225        0.1      0.05   
HGF1              0.0     0.105835            0.169091   0.089026  0.076364   
SGP               0.0     0.516725            0.505804   0.518816  0.568514   
SGR               0.0     0.672222            0.672222   0.516667  0.694444   
SGF1              0.0     0.583109            0.576993   0.515718   0.62415   
TP                0.0     0.475529            0.441225   0.505244  0.580606   
TR                0.0     0.555556            0.6111

In [7]:
# pd.to_excel()
