## Import dependencies

In [64]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestNeighbors
from scipy import stats

## Tokenizing results

In [2]:
classifier_results = pd.read_csv("data/results/classifiers-results.csv")
classifier_results.head()

Unnamed: 0,target,RFC,MLP,SVC,LinearSVC,NuSVC,SGD,LR,KNC,LSTM,BERT
0,advocate,advocate,advocate,advocate,advocate,advocate,advocate,advocate,advocate,advocate,fitness
1,pilot,pilot,pilot,pilot,pilot,pilot,pilot,pilot,pilot,pilot,pilot
2,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer
3,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist
4,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager


In [46]:
trust_models = pd.read_csv("data/results/trust_model.csv")
trust_models.head()

Unnamed: 0,RFC,MLP,SVC,LinearSVC,NuSVC,SGD,LR,KNC,LSTM,BERT
0,0.966374,0.953947,0.963816,0.964181,0.954313,0.898757,0.96674,0.950658,0.964547,0.959795


In [3]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(data):
    label_encoder = LabelEncoder()
    label_encoder.fit(data)
    return label_encoder

def decoded_label(label_encoder, y_encoded, categorical=True):
    label = [int(np.argmax(y)) for y in y_encoded] if categorical else y_encoded
    return label_encoder.inverse_transform(label)

In [4]:
all_values = []
for col in classifier_results.columns:
    for v in classifier_results[col].values:
        all_values.append(v)

all_values = pd.DataFrame(all_values, columns=['values'])
label_encoder = encode_labels(all_values['values'])

for col in classifier_results.columns:
    classifier_results[col] = label_encoder.transform(classifier_results[col])

classifier_results.to_csv(f"data/results/classifiers-results-coded.csv", encoding='utf-8', index=False)
classifier_results.head()

Unnamed: 0,target,RFC,MLP,SVC,LinearSVC,NuSVC,SGD,LR,KNC,LSTM,BERT
0,1,1,1,1,1,1,1,1,1,1,47
1,71,71,71,71,71,71,71,71,71,71,71
2,61,61,61,61,61,61,61,61,61,61,61
3,31,31,31,31,31,31,31,31,31,31,31
4,23,23,23,23,23,23,23,23,23,23,23


In [47]:
from sklearn.model_selection import train_test_split

trust_models = np.ravel(np.array(trust_models.drop(columns=['BERT'], axis=1)))
X = classifier_results.drop(['target', 'BERT'], axis=1)
y = classifier_results['target']
X_train, X_test, y_train, y_test = train_test_split(np.array(X), np.array(y), test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2188, 9), (548, 9), (2188,), (548,))

## Decision methods

### Statistic decision

In [112]:
def statistic_decision():
    statistic_predictions = []
    for i, x in enumerate(X_test):
        if np.std(x) == 0:
            statistic_predictions.append(x[0])
        else:
            mode = stats.mode(x)
            if mode.count > 7:
                statistic_predictions.append(mode.mode)
            else:
                # mean = np.average(x, weights=trust_models) 0.9708029197080292
                mean = np.mean(x) # 0.9726277372262774
                nearest = len(label_encoder.classes_)
                for v in x:
                    diff = abs(v - mean)
                    nearest = nearest if nearest < diff else v
                statistic_predictions.append(nearest)

    report = classification_report(y_test, statistic_predictions, output_dict=True)
    return {
        'accuracy': report['accuracy'],
        'macro avg': report['macro avg'],
        'weighted avg': report['weighted avg'],
    }

In [113]:
statistic_decision()

{'accuracy': 0.9726277372262774,
 'macro avg': {'precision': 0.9765900327742432,
  'recall': 0.9772886762360445,
  'f1-score': 0.9752009095965537,
  'support': 548.0},
 'weighted avg': {'precision': 0.9755889715515629,
  'recall': 0.9726277372262774,
  'f1-score': 0.9725290581834755,
  'support': 548.0}}

### Dynamic choise

#### KNN with weighted combination

In [114]:
def weighted_combination():
    knn = NearestNeighbors(n_neighbors=1, metric='euclidean')
    knn.fit(X_train)
    dynamic_predictions = []
    for t in X_test:
        distances, indexes = knn.kneighbors(t.reshape(1, -1))
        indexes = np.ravel(indexes)

        x_nn = X_train[indexes]
        y_nn = y_train[indexes]

        weight = np.linalg.pinv(x_nn).dot(y_nn)
        dynamic_predictions.append(int(t.dot(weight)))

    report = classification_report(y_test, dynamic_predictions, output_dict=True)
    return {
        'accuracy': report['accuracy'],
        'macro avg': report['macro avg'],
        'weighted avg': report['weighted avg'],
    }

#### Mix of statistic with weighted combination

In [115]:
def statistic_decision_weighted_combination():
    predictions = []
    knn = NearestNeighbors(n_neighbors=1, metric='euclidean')
    knn.fit(X_train)
    for x in X_test:
        if np.std(x) == 0:
            predictions.append(x[0])
        else:
            mode = stats.mode(x)
            if mode.count > 5:
                predictions.append(mode.mode)
            else:
                distances, indexes = knn.kneighbors(x.reshape(1, -1))
                indexes = np.ravel(indexes)

                x_nn = X_train[indexes]
                y_nn = y_train[indexes]

                weight = np.linalg.pinv(x_nn).dot(y_nn)
                predictions.append(int(x.dot(weight)))

    report = classification_report(y_test, predictions, output_dict=True)
    return {
        'accuracy': report['accuracy'],
        'macro avg': report['macro avg'],
        'weighted avg': report['weighted avg'],
    }

#### KNN with distance weighted combination

In [129]:
def weighted_distance_decision():
    knn = NearestNeighbors(n_neighbors=1, metric='euclidean')
    knn.fit(X_train)
    dynamic_predictions = []
    for t in X_test:
        distances, indexes = knn.kneighbors(t.reshape(1, -1))
        distances = np.ravel(distances)
        indexes = np.ravel(indexes)
        x_nn = X_train[indexes]

        options = list(set(np.concatenate((np.ravel(x_nn), t)).tolist()))
        best_option = options[0]
        options_weight = {o: 0 for o in options}
        for o in options_weight.keys():
            options_weight[o] += float(np.count_nonzero(t == o))
            for i, x in enumerate(x_nn):
                d = distances[i] if distances[i] != 0 else 1
                options_weight[o] += float(np.count_nonzero(x == o) * trust_models[i]/d)

            best_option = options_weight[o] if options_weight[o] > best_option else best_option

        dynamic_predictions.append(int(best_option))

    report = classification_report(y_test, dynamic_predictions, output_dict=True)
    return {
        'accuracy': report['accuracy'],
        'macro avg': report['macro avg'],
        'weighted avg': report['weighted avg'],
    }

#### Mix of statistical and

In [128]:
def statistic_decision_weighted_distance():
    knn = NearestNeighbors(n_neighbors=1, metric='euclidean')
    knn.fit(X_train)
    statistic_predictions = []
    for t in X_test:
        if np.std(t) == 0:
            statistic_predictions.append(int(t[0]))
        else:
            mode = stats.mode(t)
            if mode.count > 7:
                statistic_predictions.append(int(mode.mode))
            else:
                distances, indexes = knn.kneighbors(t.reshape(1, -1))
                distances = np.ravel(distances)
                indexes = np.ravel(indexes)
                x_nn = X_train[indexes]

                options = list(set(np.concatenate((np.ravel(x_nn), t)).tolist()))
                best_option = options[0]
                options_weight = {o: 0 for o in options}
                for o in options_weight.keys():
                    options_weight[o] += float(np.count_nonzero(t == o))
                    for i, x in enumerate(x_nn):
                        d = distances[i] if distances[i] != 0 else 1
                        options_weight[o] += float(np.count_nonzero(x == o) * trust_models[i]/d)

                    best_option = options_weight[o] if options_weight[o] > best_option else best_option

                statistic_predictions.append(int(best_option))

    report = classification_report(y_test, statistic_predictions, output_dict=True)
    return {
        'accuracy': report['accuracy'],
        'macro avg': report['macro avg'],
        'weighted avg': report['weighted avg'],
    }

#### Choosing decision

In [130]:
decisions = {
    'statistic_decision': statistic_decision(),
    'weighted_combination': weighted_combination(),
    'statistic_decision_weighted_combination': statistic_decision_weighted_combination(),
    'weighted_distance_decision': weighted_distance_decision(),
    'statistic_decision_weighted_distance': statistic_decision_weighted_distance()
}
decisions

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'statistic_decision': {'accuracy': 0.9726277372262774,
  'macro avg': {'precision': 0.9765900327742432,
   'recall': 0.9772886762360445,
   'f1-score': 0.9752009095965537,
   'support': 548.0},
  'weighted avg': {'precision': 0.9755889715515629,
   'recall': 0.9726277372262774,
   'f1-score': 0.9725290581834755,
   'support': 548.0}},
 'weighted_combination': {'accuracy': 0.7737226277372263,
  'macro avg': {'precision': 0.7069394640447272,
   'recall': 0.7856486203854625,
   'f1-score': 0.7269806736875338,
   'support': 548.0},
  'weighted avg': {'precision': 0.715711402344614,
   'recall': 0.7737226277372263,
   'f1-score': 0.7275791768642212,
   'support': 548.0}},
 'statistic_decision_weighted_combination': {'accuracy': 0.9653284671532847,
  'macro avg': {'precision': 0.9696658312447788,
   'recall': 0.9740568787937209,
   'f1-score': 0.9685340633646066,
   'support': 548.0},
  'weighted avg': {'precision': 0.9701338199513382,
   'recall': 0.9653284671532847,
   'f1-score': 0.96474

### Meta-learning

In [105]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', tol=0.0001, C=100, solver='newton-cg', max_iter=5000)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
accuracy = classification_report(y_test, pred)
accuracy

KeyboardInterrupt: 

In [11]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2, metric='euclidean')
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
accuracy = classification_report(y_test, pred)
accuracy

0.9361313868613139

In [12]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(32,64,128,64,32), activation='relu', alpha=0.001, solver='adam', max_iter=10000)
mlp.fit(X_train, y_train)
pred = mlp.predict(X_test)
accuracy = classification_report(y_test, pred)
accuracy

0.4835766423357664