In [None]:
from copy import deepcopy
import pandas as pd
import numpy as np
import json
from scipy.stats import pearsonr
from sklearn.metrics import r2_score, make_scorer
from sklearn.metrics import mean_absolute_error as mae

from sklearn.linear_model import LogisticRegression
import importlib
from itertools import chain, combinations, product
from sklearn.feature_selection import RFECV

import warnings

warnings.filterwarnings('ignore')

from sklearn.metrics import f1_score

In [None]:
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score


# Hyper-parameter search space, keyed by classifier name.
# Each entry is a list of grids; a grid maps a parameter name to the list of
# candidate values, and every combination inside one grid is tried.
parameters = {
    'RandomForestClassifier': [
        {
            "n_estimators": [10, 50, 100],
            "max_depth": [None, 10, 50, 100],
        },
    ],
    'AdaBoostClassifier': [
        {
            "algorithm": ["SAMME.R"],
        },
    ],
    'SVC': [
        {
            # Only the RBF kernel is searched; "poly" and "sigmoid" are disabled.
            "kernel": ["rbf"],
        },
    ],
    'DecisionTreeClassifier': [
        {
            "criterion": ["gini", "entropy"],
            "max_depth": [None, 10, 50, 100],
        },
    ],
}

# Registry of default-constructed estimators, keyed by classifier name.
# Callers deep-copy an entry before changing its hyper-parameters.
models = {
    name: estimator_cls()
    for name, estimator_cls in (
        ("DecisionTreeClassifier", DecisionTreeClassifier),
        ("RandomForestClassifier", RandomForestClassifier),
        ("AdaBoostClassifier", AdaBoostClassifier),
        ("SVC", SVC),
    )
}


def set_params( func, parameters ):
    """Overwrite existing attributes of ``func`` with values from ``parameters``.

    Keys that are not already attributes of ``func`` are skipped silently, so
    the mapping may carry extra entries. ``func`` is modified in place and
    returned for convenience.
    """
    for name in parameters:
        if not hasattr(func, name):
            # Unknown key: leave the object untouched for this entry.
            continue
        setattr(func, name, parameters[name])
    return func


def powerset( iterable ):
    """Return an iterator over every non-empty combination of ``iterable``.

    Combinations are produced in order of increasing size, e.g.
    powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    The empty combination is not included.
    """
    items = tuple(iterable)
    sizes = range(1, len(items) + 1)
    return chain.from_iterable(map(lambda size: combinations(items, size), sizes))


def _nested_cv_folds( order ):
    """Build the ten [train, dev, test] index triples used by ``eval_nestedCV``.

    ``order`` holds sample positions sorted by target value. Slicing it with a
    stride of 10 spreads every dev/test slice over the whole target range.
    Offsets are taken in adjacent pairs (2i, 2i+1); each pair yields two folds
    with the dev and test roles swapped. Everything else is the training set.
    """
    folds = []
    for pair in range(5):
        for swap in range(2):
            if swap:
                dev_offset, test_offset = 2 * pair, 2 * pair + 1
            else:
                dev_offset, test_offset = 2 * pair + 1, 2 * pair
            dev = order[dev_offset::10]
            test = order[test_offset::10]
            train = np.setdiff1d(order, np.concatenate((dev, test)))
            folds.append([train, dev, test])
    return folds


def _rfecv_subset_scores( selector ):
    """Return the per-feature-subset validation scores of a fitted RFECV.

    Newer scikit-learn releases expose them as ``cv_results_['mean_test_score']``;
    the older ``grid_scores_`` attribute is used only when that is unavailable.
    """
    results = getattr(selector, 'cv_results_', None)
    if results is not None:
        return np.asarray(results['mean_test_score'])
    return np.asarray(selector.grid_scores_)


def eval_nestedCV( X, y, wb, step=1, verbose=0, method='', fsname='', funcname='Ridge', fout=None, fout_mean=None, threshold=15 ):
    """Nested cross-validation with RFECV feature selection for a binary target.

    The continuous target ``y`` is binarised as ``y > threshold``. Ten
    train/dev/test folds are built from the target-sorted sample order. For
    each fold, every hyper-parameter combination of ``parameters[funcname]`` is
    evaluated by running RFECV with a single train->dev split (macro F1); the
    best combination and its feature mask are then used to fit on the training
    rows and predict the test rows.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; converted with ``np.asarray``.
    y : object with an ``argsort()`` method (e.g. pandas Series, ndarray)
        Continuous target values, one per row of ``X``.
    wb, method, fsname, fout, fout_mean
        Accepted for backward compatibility; not used by this function.
    step : int or float
        Forwarded to ``RFECV(step=...)``.
    verbose : int
        When truthy, prints each candidate model and the number of selected features.
    funcname : str
        Key into the module-level ``parameters`` and ``models`` registries.
    threshold : number, default 15
        Cut-off used to binarise ``y`` (1 if ``y > threshold`` else 0).

    Returns
    -------
    (test_target_samples, test_predictions) : tuple of lists
        One array of true labels and one array of predicted labels per fold.

    Raises
    ------
    KeyError
        If ``funcname`` is not registered in ``parameters`` and ``models``.
    ValueError
        If no hyper-parameter combination produced a usable dev score for a fold.
    """
    if funcname not in parameters or funcname not in models:
        raise KeyError(
            "Unknown funcname %r; expected one of %s" % (funcname, sorted(set(parameters) & set(models)))
        )
    grids = parameters[funcname]

    # Sort order is taken from the continuous target before it is binarised.
    sortedarg = np.asarray(y.argsort())
    X = np.asarray(X)
    y = np.where(np.asarray(y) > threshold, 1, 0)
    cv = _nested_cv_folds(sortedarg)

    # The scorer does not depend on the fold or the candidate, so build it once.
    scorer = make_scorer(f1_score, greater_is_better=True, average="macro")

    f1_scores, roc_auc_scores = [], []
    f1_scores_dev = []
    test_target_samples, test_predictions = [], []

    for i, (train, dev, test) in enumerate(cv):
        best_params = {}
        # Start below any attainable score so a fold whose best F1 is 0 still
        # records a candidate instead of leaving best_params empty.
        best_f1 = -np.inf
        print(i, 'CV fold..', flush=True)

        # RFECV refits its final feature ranking on every row it is given.
        # Hand it only the train+dev rows (with the split re-expressed in local
        # positions) so the held-out test rows cannot influence feature selection.
        fit_rows = np.concatenate((train, dev))
        inner_cv = [(np.arange(len(train)), np.arange(len(train), len(fit_rows)))]
        X_fit, y_fit = X[fit_rows], y[fit_rows]

        for grid in grids:
            keys = sorted(grid)
            for comb in product(*(grid[k] for k in keys)):
                candidate = dict(zip(keys, comb))
                model = set_params(deepcopy(models[funcname]), candidate)
                if verbose:
                    print('Model:', funcname, 'params:', candidate, 'est:', model)

                fs = RFECV(model, cv=inner_cv, scoring=scorer, step=step)
                fs.fit(X_fit, y_fit)
                if verbose:
                    print(fs.n_features_, flush=True)
                f1_score_max = _rfecv_subset_scores(fs).max()
                print("f1_score_max", f1_score_max, flush=True)

                if best_f1 < f1_score_max:
                    best_f1 = f1_score_max
                    best_params = dict(candidate)
                    best_params['fs'] = fs
                    best_params['fs_sup'] = int(fs.get_support().sum())

        if 'fs' not in best_params:
            raise ValueError(
                "No hyper-parameter combination produced a usable dev score in fold %d" % i
            )

        f1_scores_dev.append(best_f1)
        print('Dev: ', best_f1, best_params, flush=True)

        selector = best_params['fs']
        # Only real hyper-parameters are applied; the bookkeeping entries are excluded.
        hyper_params = {k: v for k, v in best_params.items() if k not in ('fs', 'fs_sup')}
        est = set_params(deepcopy(models[funcname]), hyper_params)

        est.fit(selector.transform(X[train]), y[train])
        testpred = est.predict(selector.transform(X[test]))

        cur_f1_score = f1_score(y[test], testpred, average="macro")
        # Computed from hard 0/1 predictions, not from scores or probabilities.
        roc_auc_scores.append(roc_auc_score(y[test], testpred))
        f1_scores.append(cur_f1_score)
        test_target_samples.append(y[test])
        test_predictions.append(testpred)

    print('Dev final: ', round(np.mean(f1_scores_dev), 4), flush=True)
    print('Test: ', [round(np.mean(x), 4) for x in [f1_scores, roc_auc_scores]], flush=True)
    return test_target_samples, test_predictions


# --- Run configuration ---
wb = "WHO"
funcname = "AdaBoostClassifier"  # Function
method = "RFECV"  # F_ANOVA / RFECV

assert funcname in set(models.keys())
assert wb in {'WHO', 'Diener'}
assert method in {'F_ANOVA', 'RFECV'}

print('Function:', funcname)
print('WB:', wb)
print('method:', method)

# --- Base table: the two score files stacked into one frame ---
g1 = pd.read_csv('traincorr1.csv', index_col=[0, 1, 2])
g2 = pd.read_csv('testcorr1.csv', index_col=[0, 1, 2])

g = pd.concat([g1, g2])

wbs = ['Diener', 'WHO']
feats = ['clusters', 'words', 'Meta', 'AppCats', 'RuLIWC']
indeces = [[0, 1], [0, 1], [0, 1, 2], [0, 1, 2], [0, 1, 2]]

appdata1 = pd.read_csv('AppCatsBy3HoursNorm-1.csv', index_col=0)
print(appdata1.shape)

# --- Feature groups, each aligned to the rows of `g` ---
data = {}

# The cluster file is chosen by `wb`.
cldata = pd.read_csv('besth-clusters_' + wb + '.csv', index_col=[0, 1])
data['clusters'] = g.join(cldata)[cldata.columns]
data['AppCats'] = g.join(appdata1, lsuffix='mail')[appdata1.columns]

wdf1 = pd.read_csv('RuLIWC-matrix-1year-traintest.csv', index_col=[0, 1, 2])
data['RuLIWC'] = g.join(wdf1, rsuffix='APP_')[wdf1.columns]

# Columns of `g` without missing values, minus the first two and the two
# message-timestamp columns.
cols = g.dropna(axis=1).columns
cols = cols[2:]
cols = cols.drop(['MessNewest', 'MessOldest'])
data['Behavior'] = g[cols]

data['Words'] = pd.read_csv('words_selected.csv', index_col=[0, 1, 2])

rat = {'who': 25., 'diener': 30.}
metrs = ['MAE', 'Pearson', 'R-2']

# Feature groups concatenated column-wise for this run.
fs = ('clusters', 'Words')

# Target column follows `wb`, so switching the questionnaire also switches the
# labels (previously the WHO score was used regardless of `wb`).
# NOTE(review): eval_nestedCV binarises the target at a fixed cut-off; confirm
# that cut-off is appropriate when wb is 'Diener'.
label_col = 'DF_' + wb.lower() + '_score'

print('Nested CV, Feature set:')
print(fs, flush=True)
traindata = pd.concat([data[x] for x in fs], axis=1)
print(traindata.shape, g[label_col].shape, flush=True)
print(funcname)
labels = g[label_col]
test_target_samples, test_predictions = eval_nestedCV(traindata, labels, wb, method=method, funcname=funcname, fsname=' + '.join(fs))


In [None]:
from sklearn.metrics import *

# Per-fold classification report followed by the mean of each metric.
# The table order below is the order in which metrics are printed for a fold.
metric_table = [
    ("precision_score", precision_score),
    ("recall_score", recall_score),
    ("roc_auc_score", roc_auc_score),
    ("f1_micro", lambda y_true, y_pred: f1_score(y_true, y_pred, average="micro")),
    ("f1_macro", lambda y_true, y_pred: f1_score(y_true, y_pred, average="macro")),
    ("f1_weighted", lambda y_true, y_pred: f1_score(y_true, y_pred, average="weighted")),
]

# Key order here fixes the order of the final summary lines.
scores = {
    metric_name: []
    for metric_name in ["roc_auc_score", "f1_micro", "f1_macro", "f1_weighted", "precision_score", "recall_score"]
}

banner = "#"*15
for fold_truth, fold_pred in zip(test_target_samples, test_predictions):
    print(banner)
    print(confusion_matrix(fold_truth, fold_pred))
    for metric_name, metric_fn in metric_table:
        value = metric_fn(fold_truth, fold_pred)
        print(metric_name, value)
        scores[metric_name].append(value)
    print(banner)

for metric_name, fold_values in scores.items():
    print(metric_name, round(np.mean(fold_values), 4))