## Import dependencies

In [185]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors

## Tokenizing results

In [158]:
classifier_results = pd.read_csv("data/results/classifiers-results.csv")
classifier_results.head()

Unnamed: 0,target,RFC,MLP,SVC,LinearSVC,NuSVC,SGD,LR,KNC,LSTM,BERT
0,advocate,advocate,advocate,advocate,advocate,advocate,advocate,advocate,advocate,advocate,advocate
1,pilot,pilot,pilot,pilot,pilot,pilot,pilot,pilot,pilot,pilot,pilot
2,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer,machine learn engineer
3,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist,data scientist
4,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager,construction manager


In [159]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(data):
    label_encoder = LabelEncoder()
    label_encoder.fit(data)
    return label_encoder

def decoded_label(label_encoder, y_encoded, categorical=True):
    label = [int(np.argmax(y)) for y in y_encoded] if categorical else y_encoded
    return label_encoder.inverse_transform(label)

In [160]:
all_values = []
for col in classifier_results.columns:
    for v in classifier_results[col].values:
        all_values.append(v)

all_values = pd.DataFrame(all_values, columns=['values'])
label_encoder = encode_labels(all_values['values'])

for col in classifier_results.columns:
    classifier_results[col] = label_encoder.transform(classifier_results[col])

classifier_results.to_csv(f"data/results/classifiers-results-coded.csv", encoding='utf-8', index=False)
classifier_results.head()

Unnamed: 0,target,RFC,MLP,SVC,LinearSVC,NuSVC,SGD,LR,KNC,LSTM,BERT
0,1,1,1,1,1,1,1,1,1,1,1
1,71,71,71,71,71,71,71,71,71,71,71
2,61,61,61,61,61,61,61,61,61,61,61
3,31,31,31,31,31,31,31,31,31,31,31
4,23,23,23,23,23,23,23,23,23,23,23


In [191]:
from sklearn.model_selection import train_test_split

X = classifier_results.drop('target', axis=1)
y = classifier_results['target']
X_train, X_test, y_train, y_test = train_test_split(np.array(X), np.array(y), test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1915, 10), (821, 10), (1915,), (821,))

## Decision methods

### Statistic decision

In [192]:
from scipy import stats

statistic_predictions = []
for x in X_test:
    if np.std(x) == 0:
        statistic_predictions.append(x[0])
    else:
        mode = stats.mode(x)
        if mode.count > 5:
            statistic_predictions.append(mode.mode)
        else:
            mean = np.mean(x)
            nearest = len(label_encoder.classes_)
            for v in x:
                diff = abs(v - mean)
                nearest = nearest if nearest < diff else v
            statistic_predictions.append(nearest)

accuracy = accuracy_score(y_test, statistic_predictions)
accuracy

0.9732034104750305

### Dynamic choise

KNN with weighted combination

In [199]:
knn = NearestNeighbors(n_neighbors=3, metric='euclidean')
knn.fit(X_train)
dynamic_predictions = []
for t in X_test:
    distances, indexes = knn.kneighbors(t.reshape(1, -1))
    indexes = np.ravel(indexes)

    x_nn = X_train[indexes]
    y_nn = y_train[indexes]

    weight = np.linalg.pinv(x_nn).dot(y_kn)
    dynamic_predictions.append(int(t.dot(weight)))

accuracy = accuracy_score(y_test, dynamic_predictions)
accuracy

0.03654080389768575

KNN with distance weighted combination

In [232]:
from sklearn.neighbors import NearestNeighbors
import joblib

knn = NearestNeighbors(n_neighbors=1, metric='euclidean')
knn.fit(X_train)
dynamic_predictions = []
for t in X_test:
    distances, indexes = knn.kneighbors(t.reshape(1, -1))
    distances = np.ravel(distances)
    indexes = np.ravel(indexes)
    x_nn = X_train[indexes]

    options = list(set(np.concatenate((np.ravel(x_nn), t)).tolist()))
    best_option = options[0]
    options_weight = {o: 0 for o in options}
    for o in options_weight.keys():
        options_weight[o] += float(np.count_nonzero(t == o))
        for i, x in enumerate(x_nn):
            d = distances[i] if distances[i] != 0 else 1
            options_weight[o] += float(np.count_nonzero(x == o) * 1/d)

        max_option = options_weight[o] if options_weight[o] > best_option else best_option

    dynamic_predictions.append(best_option)

joblib.dump(knn, 'models/KNN_hybrid.pkl')
accuracy = accuracy_score(y_test, dynamic_predictions)
accuracy

0.9317904993909866

### Meta-learning

In [216]:
from sklearn.neighbors import RadiusNeighborsClassifier

rnc = RadiusNeighborsClassifier(weights='distance', radius=100,)
rnc.fit(X_train, y_train)
pred = rnc.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy

0.928136419001218

In [219]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2, metric='euclidean')
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy

0.9305724725943971

In [229]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(32,64,128,64,32), activation='relu', alpha=0.001, solver='adam', max_iter=10000)
mlp.fit(X_train, y_train)
pred = mlp.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy

0.42630937880633374