In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Root directory that every dataset path below is built from.
path = 'datasets'
# One entry per dataset: [train CSV path, test CSV path].
# evaluate_datasets reads element 0 as the training file and element 1 as the test file.
datasets = [
    [f'{path}/dataset1/fully_balanced/train_balanced_668_enriched.csv', f'{path}/dataset1/fully_balanced/test_balanced_840_enriched.csv'],
    [f'{path}/dataset2/fully_balanced/train_balanced_858_enriched.csv', f'{path}/dataset2/fully_balanced/test_balanced_1566_enriched.csv'],
    [f'{path}/dataset3/fully_balanced/train_balanced_1726_enriched.csv', f'{path}/dataset3/fully_balanced/test_balanced_2636_enriched.csv'],
    [f'{path}/dataset4/fully_balanced/train_balanced_3346_enriched.csv', f'{path}/dataset4/fully_balanced/test_balanced_7798_enriched.csv'],
    [f'{path}/dataset5/fully_balanced/train_balanced_5042_enriched.csv', f'{path}/dataset5/fully_balanced/test_balanced_12976_enriched.csv'],
    [f'{path}/dataset6/fully_balanced/train_balanced_5296_enriched.csv', f'{path}/dataset6/fully_balanced/test_balanced_16276_enriched.csv'],
    [f'{path}/dataset7/fully_balanced/train_balanced_6210_enriched.csv', f'{path}/dataset7/fully_balanced/test_balanced_25900_enriched.csv'],
    [f'{path}/dataset8/fully_balanced/train_balanced_8578_enriched.csv', f'{path}/dataset8/fully_balanced/test_balanced_34586_enriched.csv'],
    [f'{path}/dataset9/fully_balanced/train_balanced_13034_enriched.csv', f'{path}/dataset9/fully_balanced/test_balanced_49236_enriched.csv']
]

# Column subsets evaluated for every dataset/classifier pair. The first entry is
# the four-column baseline; each later entry adds exactly one similarity_top_* column.
features_combinations = [
    ['adamic_adar', 'common_neighbors', 'preferential_attachment', 'total_neighbors'],
    ['adamic_adar', 'common_neighbors', 'preferential_attachment', 'total_neighbors', 'similarity_top_5'],
    ['adamic_adar', 'common_neighbors', 'preferential_attachment', 'total_neighbors', 'similarity_top_100'],
    ['adamic_adar', 'common_neighbors', 'preferential_attachment', 'total_neighbors', 'similarity_top_250'],
]

In [3]:
def neural_network():
    """Create a new, unfitted MLP classifier.

    Configuration: Adam solver, hidden layer sizes (100, 50), random_state=0.
    """
    mlp_settings = dict(solver='adam', hidden_layer_sizes=(100, 50), random_state=0)
    return MLPClassifier(**mlp_settings)

def logistic_regression():
    """Create a new, unfitted logistic-regression classifier.

    Configuration: lbfgs solver, one-vs-rest multi-class handling, random_state=0.
    """
    # NOTE(review): recent scikit-learn releases deprecate the `multi_class`
    # argument -- confirm against the installed version before upgrading.
    model = LogisticRegression(
        solver='lbfgs',
        multi_class='ovr',
        random_state=0,
    )
    return model

def knn(n_neighbors):
    """Create a new, unfitted k-nearest-neighbours classifier.

    Args:
        n_neighbors: neighbour count forwarded to KNeighborsClassifier.

    Returns:
        A KNeighborsClassifier using uniform neighbour weights.
    """
    classifier = KNeighborsClassifier(weights='uniform', n_neighbors=n_neighbors)
    return classifier

def knn_5():
    """Zero-argument factory: k-nearest-neighbours classifier with 5 neighbours."""
    return knn(n_neighbors=5)

def knn_10():
    """Zero-argument factory: k-nearest-neighbours classifier with 10 neighbours."""
    return knn(n_neighbors=10)

def knn_20():
    """Zero-argument factory: k-nearest-neighbours classifier with 20 neighbours."""
    return knn(n_neighbors=20)

def knn_30():
    """Zero-argument factory: k-nearest-neighbours classifier with 30 neighbours."""
    return knn(n_neighbors=30)

def knn_40():
    """Zero-argument factory: k-nearest-neighbours classifier with 40 neighbours."""
    return knn(n_neighbors=40)

def knn_50():
    """Zero-argument factory: k-nearest-neighbours classifier with 50 neighbours."""
    return knn(n_neighbors=50)

def knn_60():
    """Zero-argument factory: k-nearest-neighbours classifier with 60 neighbours."""
    return knn(n_neighbors=60)

def knn_70():
    """Zero-argument factory: k-nearest-neighbours classifier with 70 neighbours."""
    return knn(n_neighbors=70)

def knn_100():
    """Zero-argument factory: k-nearest-neighbours classifier with 100 neighbours."""
    return knn(n_neighbors=100)

def knn_1():
    """Zero-argument factory: k-nearest-neighbours classifier with a single neighbour."""
    return knn(n_neighbors=1)

def linear_svm():
    """Create a new, unfitted linear support-vector classifier.

    The seed is pinned to 0, matching the other seeded factories in this
    notebook (logistic_regression, decision_tree, neural_network), so that
    repeated runs of the evaluation give the same numbers.

    Returns:
        A LinearSVC with random_state=0 and all other settings at their defaults.
    """
    # LinearSVC's solver draws random numbers while fitting; without a fixed
    # random_state the reported metrics can drift slightly between runs.
    return LinearSVC(random_state=0)

def svm():
    """Create a new, unfitted SVC left entirely at its default settings."""
    classifier = SVC()
    return classifier

def decision_tree():
    """Create a new, unfitted decision-tree classifier (max_depth=5, random_state=0)."""
    tree_settings = dict(max_depth=5, random_state=0)
    return DecisionTreeClassifier(**tree_settings)

def get_normalizer():
    """Create the feature scaler used by calculate_scores: a new, unfitted MinMaxScaler."""
    scaler = MinMaxScaler()
    return scaler

def calculate_scores(classifier, selected_features, train_df, test_df):
    """Fit `classifier` on the training frame and score it on the test frame.

    Args:
        classifier: estimator exposing fit(X, y) and predict(X).
        selected_features: column names used as model inputs.
        train_df: training frame holding the selected columns and a 'label' column.
        test_df: test frame holding the selected columns and a 'label' column.

    Returns:
        A list [accuracy, precision, recall] computed on the test frame.
    """
    # The scaler is fitted on the training features only, then reused for the
    # test features so no test-set information reaches the scaling step.
    scaler = get_normalizer()
    scaler.fit(train_df[selected_features])
    scaled_train = scaler.transform(train_df[selected_features])
    scaled_test = scaler.transform(test_df[selected_features])

    classifier.fit(scaled_train, train_df['label'])
    predicted_labels = classifier.predict(scaled_test)

    metric_functions = (accuracy_score, precision_score, recall_score)
    return [metric(test_df['label'], predicted_labels) for metric in metric_functions]

def evaluate_dataset(train_path, test_path, classifier):
    """Score one train/test CSV pair for every entry in `features_combinations`.

    Args:
        train_path: path of the training CSV.
        test_path: path of the test CSV.
        classifier: zero-argument callable returning a new, unfitted estimator;
            it is called once per feature combination so fits never share state.

    Returns:
        One row per feature combination, in `features_combinations` order, of
        the form [feature_columns, *scores] where `scores` is whatever
        calculate_scores returns (accuracy first).

    Side effects:
        Appends this dataset's list of accuracies to the module-level list
        `clf_accuracy_scores`, creating that list first if it does not exist.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    scores = []
    accuracy_scores = []
    for feature_columns in features_combinations:
        metric_values = calculate_scores(classifier(), feature_columns, train_df, test_df)
        accuracy_scores.append(metric_values[0])  # accuracy is the first metric
        scores.append([feature_columns, *metric_values])

    # Create the global accumulator on demand so the function also works when a
    # cell forgot to initialise it (the bare `global` lookup raised NameError).
    globals().setdefault('clf_accuracy_scores', []).append(accuracy_scores)

    return scores

def evaluate_datasets(classifier_fun):
    """Evaluate one classifier factory on every configured dataset.

    Prints `classifier_fun`, runs evaluate_dataset on each [train, test] pair
    in `datasets`, and returns generate_statistics applied to the collected
    per-dataset scores.
    """
    print(classifier_fun)
    per_dataset_scores = [
        evaluate_dataset(pair[0], pair[1], classifier_fun)
        for pair in datasets
    ]
    return generate_statistics(per_dataset_scores)

def print_scores(scores):
    """Print score rows ordered from highest to lowest accuracy.

    Args:
        scores: iterable of 4-item rows (features, accuracy, precision, recall).

    For each row the features are printed on one line, followed by a line with
    the three metrics and a blank line.
    """
    ranked = list(scores)
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    for entry in ranked:
        features, accuracy, precision, recall = entry
        print(features)
        print('A:', accuracy, 'P:', precision, 'R:', recall, '\n')
        
def print_statistics(statistics, score):
    """Print the summary table for one metric, best average first.

    Args:
        statistics: mapping from metric name to rows laid out as
            [identifier, average, max, min, std].
        score: metric name to print; also used as the header text.
    """
    print(score, '-'*10)
    ranked_rows = list(statistics[score])
    ranked_rows.sort(key=lambda entry: entry[1], reverse=True)
    # Rows store max before min (indices 2 and 3); the output shows MIN first.
    for row in ranked_rows:
        print(f'{row[0]}|AVG:{row[1]:.4f}|MIN:{row[3]:.4f}|MAX:{row[2]:.4f}|STD:{row[4]:.4f}')

def _summarize_metric(all_scores, metric_index):
    """Aggregate one metric over all datasets, per feature combination.

    Returns one row [identifier, mean, max, min, std] for each feature
    combination, or an empty list when `all_scores` is empty.
    """
    if not all_scores:
        return []

    summary_rows = []
    for combination_index in range(len(all_scores[0])):
        values = np.array([
            dataset_scores[combination_index][metric_index]
            for dataset_scores in all_scores
        ])
        # Slot 0 of every score row holds the feature-name list, so the label
        # comes from the scores themselves rather than from a global.
        identifier = '-'.join(all_scores[0][combination_index][0])
        summary_rows.append([
            identifier,
            np.mean(values),
            np.max(values),
            np.min(values),
            np.std(values),
        ])
    return summary_rows


def generate_statistics(all_scores):
    """Summarise accuracy, precision and recall across datasets.

    Args:
        all_scores: nested list indexed as [dataset][feature combination],
            where each innermost row is [features, accuracy, precision, recall]
            (index 1: accuracy, 2: precision, 3: recall).

    Returns:
        A dict with keys 'accuracy', 'precision' and 'recall'. Each value is a
        list with one row per feature combination, laid out as
        [identifier, average, max, min, std], where `identifier` is the
        feature names joined with '-'. All lists are empty for empty input.

    The number of datasets and feature combinations is read from `all_scores`
    itself, so the result does not depend on the current contents of the
    module-level `datasets` / `features_combinations` lists.
    """
    return {
        'accuracy': _summarize_metric(all_scores, 1),
        'precision': _summarize_metric(all_scores, 2),
        'recall': _summarize_metric(all_scores, 3),
    }

In [4]:
# Start from an empty accuracy log; evaluate_dataset appends one entry per dataset.
clf_accuracy_scores = []
statistics = evaluate_datasets(logistic_regression)

# One table per metric, each followed by a separator line.
for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function logistic_regression at 0x7f078460f840>
accuracy ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9555|MIN:0.9381|MAX:0.9651|STD:0.0076
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9555|MIN:0.9381|MAX:0.9651|STD:0.0076
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9550|MIN:0.9374|MAX:0.9646|STD:0.0077
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.9549|MIN:0.9381|MAX:0.9646|STD:0.0074
##########
precision ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9573|MIN:0.9074|MAX:0.9814|STD:0.0224
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9573|MIN:0.9074|MAX:0.9814|STD:0.0224
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9563|MIN:0.9063|MAX:0.9798|STD:0.022

In [5]:
# Start from an empty accuracy log; evaluate_dataset appends one entry per dataset.
clf_accuracy_scores = []
statistics = evaluate_datasets(knn_50)

# One table per metric, each followed by a separator line.
for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function knn_50 at 0x7f0740b59620>
accuracy ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9591|MIN:0.9361|MAX:0.9762|STD:0.0113
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9591|MIN:0.9361|MAX:0.9761|STD:0.0113
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9587|MIN:0.9387|MAX:0.9754|STD:0.0110
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.9555|MIN:0.9253|MAX:0.9753|STD:0.0142
##########
precision ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9347|MIN:0.8894|MAX:0.9676|STD:0.0211
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9346|MIN:0.8894|MAX:0.9676|STD:0.0211
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9343|MIN:0.8935|MAX:0.9673|STD:0.0207
adamic_adar

In [6]:
# Start from an empty accuracy log; evaluate_dataset appends one entry per dataset.
clf_accuracy_scores = []
statistics = evaluate_datasets(linear_svm)

# One table per metric, each followed by a separator line.
for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function linear_svm at 0x7f0740b598c8>
accuracy ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9593|MIN:0.9291|MAX:0.9718|STD:0.0128
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9593|MIN:0.9291|MAX:0.9718|STD:0.0128
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9592|MIN:0.9323|MAX:0.9716|STD:0.0117
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.9586|MIN:0.9310|MAX:0.9705|STD:0.0120
##########
precision ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9419|MIN:0.8810|MAX:0.9735|STD:0.0274
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9418|MIN:0.8810|MAX:0.9735|STD:0.0274
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9414|MIN:0.8842|MAX:0.9733|STD:0.0262
adamic_

In [7]:
# Start from an empty accuracy log; evaluate_dataset appends one entry per dataset.
clf_accuracy_scores = []
statistics = evaluate_datasets(svm)

# One table per metric, each followed by a separator line.
for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function svm at 0x7f0740b59950>
accuracy ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9516|MIN:0.8691|MAX:0.9748|STD:0.0302
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9516|MIN:0.8691|MAX:0.9746|STD:0.0302
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9488|MIN:0.8723|MAX:0.9742|STD:0.0291
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.9460|MIN:0.8263|MAX:0.9744|STD:0.0434
##########
precision ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9211|MIN:0.7937|MAX:0.9642|STD:0.0475
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9210|MIN:0.7937|MAX:0.9639|STD:0.0475
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9162|MIN:0.7984|MAX:0.9633|STD:0.0461
adamic_adar-co

In [8]:
# Start from an empty accuracy log; evaluate_dataset appends one entry per dataset.
clf_accuracy_scores = []
statistics = evaluate_datasets(decision_tree)

# One table per metric, each followed by a separator line.
for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function decision_tree at 0x7f0740b59d08>
accuracy ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.9219|MIN:0.8262|MAX:0.9788|STD:0.0574
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9193|MIN:0.8036|MAX:0.9770|STD:0.0624
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9193|MIN:0.8036|MAX:0.9770|STD:0.0624
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9190|MIN:0.8036|MAX:0.9770|STD:0.0628
##########
precision ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.8775|MIN:0.7420|MAX:0.9671|STD:0.0841
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.8753|MIN:0.7179|MAX:0.9654|STD:0.0896
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.8752|MIN:0.7179|MAX:0.9654|STD:0.0896
adamic_adar-common_ne

In [9]:
# Start from an empty accuracy log; evaluate_dataset appends one entry per dataset.
clf_accuracy_scores = []
statistics = evaluate_datasets(neural_network)

# One table per metric, each followed by a separator line.
for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function neural_network at 0x7f0740bb70d0>




accuracy ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.9314|MIN:0.8167|MAX:0.9803|STD:0.0558
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.9313|MIN:0.8155|MAX:0.9803|STD:0.0562
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.9290|MIN:0.8116|MAX:0.9796|STD:0.0555
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.9278|MIN:0.8008|MAX:0.9816|STD:0.0613
##########
precision ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_100|AVG:0.8924|MIN:0.7326|MAX:0.9691|STD:0.0813
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_250|AVG:0.8923|MIN:0.7313|MAX:0.9691|STD:0.0817
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity_top_5|AVG:0.8885|MIN:0.7272|MAX:0.9681|STD:0.0806
adamic_adar-common_neighbors-preferential_attac