In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# load dataset
df = pd.read_csv('graph_features.csv') 
# drop unused columns
df = df.drop(columns='True')
df = df.dropna()
#print(df['label'].value_counts())

# drop domain rows with unknown labels
index_names = df[df['label'] == -1].index
df.drop(index_names, inplace = True)

# make X, y sets
y = df['label']
X = df.drop(labels=['0', 'label'], axis=1)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)

# scale features to be in [0, 1]
scaler = MinMaxScaler()

# train and fit only from training data
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



Hyperparameter tuning:

In [4]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['none', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 150, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

clf = RandomForestClassifier()
clf_random = RandomizedSearchCV(estimator=clf, param_distributions=random_grid, n_iter=1000, cv=None, verbose=2, random_state=99)
clf.fit(X_train, y_train)

RandomForestClassifier()

Optimal RandomSearch parameters: 
{'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 150, 'bootstrap': True}

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [140, 150, 160, 170],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

clf_tune = RandomForestClassifier()
grid_search = GridSearchCV(estimator = clf_tune, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
best_grid = grid_search.best_estimator_
print(best_grid)

10-fold cross validation for 20 random seeds. Using optimal hyperparameters.

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn import metrics

auc = []
pr = []
rc = []
f1 = []
feats = []
test_score = []
clf= RandomForestClassifier()
for random_state in range(20):
  clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=2, min_samples_split=8, max_features='sqrt', max_depth=150, bootstrap=True, random_state=random_state)

  # AUC
  curr_auc = cross_val_score(clf, X, y, cv=10, scoring='roc_auc')
  # PR
  curr_pr = cross_val_score(clf, X, y, cv=10, scoring='precision')
  # RC
  curr_rc = cross_val_score(clf, X, y, cv=10, scoring='recall')
  # F1 
  curr_f1 = cross_val_score(clf, X, y, cv=10, scoring='f1')

  curr_test_score = cross_validate(clf, X, y, cv=10)
  test_score.append(curr_test_score['test_score'])
  curr_feats = clf.feature_importances_

  auc.append(curr_auc)
  pr.append(curr_pr)
  rc.append(curr_rc)
  f1.append(curr_f1)
  feats.append(curr_feats)
  print(random_state, "done")
print("Mean AUC:", np.mean(auc))
print("Mean PR", np.mean(pr))
print("Mean RC", np.mean(rc))
print("Mean F1 score", f1)
print("Mean CV Test score", np.mean(test_score))

a = 0
for x in feats:
  a += x[0]
b = 0
for x in feats:
  b += x[1]
c = 0
for x in feats:
  c += x[2]
d = 0
for x in feats:
  d += x[3]
print("Feature informativity: [degree centrality, in-degree centr, out-degree centr, PageRank")
print(a / len(feats), b / len(feats), c / len(feats), d / len(feats))  