In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils.validation import column_or_1d
from time import perf_counter
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
# Keep cell output readable: install process-wide "ignore" filters for
# scikit-learn's ConvergenceWarning and DataConversionWarning.
# NOTE(review): the MLPs below are capped at max_iter=50, which presumably is
# what triggers ConvergenceWarning — confirm the models are trained enough.
simplefilter("ignore", category=ConvergenceWarning)
simplefilter("ignore", category=DataConversionWarning)

In [3]:
# Names of the 18 datasets processed by this notebook: every expression
# category once per prefix, all "a_*" entries first, then all "b_*" entries.
# NOTE(review): the "a"/"b" prefix presumably identifies the recording
# subject — confirm against the data source.
datasets = [
    f"{prefix}_{category}"
    for prefix in ("a", "b")
    for category in (
        "affirmative",
        "conditional",
        "doubt_question",
        "emphasis",
        "negative",
        "relative",
        "topics",
        "wh_question",
        "yn_question",
    )
]

# Per-dataset MLP hyperparameters, keyed by dataset name.
#   'alpha'              -> passed to MLPClassifier(alpha=...)
#   'learning_rate_init' -> passed to MLPClassifier(learning_rate_init=...)
#   'RocAucScore'        -> stored alongside the settings; not read by the
#                           training code in this notebook.
# NOTE(review): values presumably come from an earlier hyperparameter search
# (ROC AUC as the selection metric) — confirm and link the source notebook.
paramsMLP = {'a_affirmative': {'alpha': 0.0001, 'learning_rate_init': 0.01, 'RocAucScore': 0.7322321234514235},
          'a_conditional': {'alpha': 0.001, 'learning_rate_init': 0.01, 'RocAucScore': 0.6878647342995169},
          'a_doubt_question': {'alpha': 0.0001, 'learning_rate_init': 0.003, 'RocAucScore': 0.5718023939242163},
          'a_emphasis': {'alpha': 0.01, 'learning_rate_init': 0.01, 'RocAucScore': 0.6144836486716828},
          'a_negative': {'alpha': 0.001, 'learning_rate_init': 0.01, 'RocAucScore': 0.6915968192563937},
          'a_relative': {'alpha': 0.0001, 'learning_rate_init': 0.01, 'RocAucScore': 0.6643406140531629},
          'a_topics': {'alpha': 0.0003, 'learning_rate_init': 0.01, 'RocAucScore': 0.6473959455651511},
          'a_wh_question': {'alpha': 0.001, 'learning_rate_init': 0.003, 'RocAucScore': 0.6318664763325446},
          'a_yn_question': {'alpha': 0.0003, 'learning_rate_init': 0.01, 'RocAucScore': 0.7141722707760443},
          'b_affirmative': {'alpha': 0.003, 'learning_rate_init': 0.01, 'RocAucScore': 0.7595775538857945},
          'b_conditional': {'alpha': 0.01, 'learning_rate_init': 0.01, 'RocAucScore': 0.7658497968334034},
          'b_doubt_question': {'alpha': 0.001, 'learning_rate_init': 0.01, 'RocAucScore': 0.7811996118516573},
          'b_emphasis': {'alpha': 0.0001, 'learning_rate_init': 0.003, 'RocAucScore': 0.7074909383809908},
          'b_negative': {'alpha': 0.0003, 'learning_rate_init': 0.01, 'RocAucScore': 0.7372542293019431},
          'b_relative': {'alpha': 0.0001, 'learning_rate_init': 0.01, 'RocAucScore': 0.8059416272414742},
          'b_topics': {'alpha': 0.001, 'learning_rate_init': 0.01, 'RocAucScore': 0.6770052961001445},
          'b_wh_question': {'alpha': 0.0001, 'learning_rate_init': 0.01, 'RocAucScore': 0.8151934951139722},
          'b_yn_question': {'alpha': 0.01, 'learning_rate_init': 0.003, 'RocAucScore': 0.7234257738267788}}

# Per-dataset bagging settings, keyed by dataset name.
#   'estimators'  -> passed to BaggingClassifier(n_estimators=...)
#   'RocAucScore' -> stored alongside the setting; not read by the training
#                    code in this notebook.
# NOTE(review): values presumably come from an earlier search over ensemble
# sizes — confirm and link the source notebook.
paramsBag = {'a_affirmative'  : {'estimators': 50, 'RocAucScore': 0.7802962088986867},
'a_conditional'  : {'estimators': 50, 'RocAucScore': 0.7332382014121144},
'a_doubt_question'  : {'estimators': 10, 'RocAucScore': 0.6248530629350615},
'a_emphasis'  : {'estimators': 50, 'RocAucScore': 0.6687425426741666},
'a_negative'  : {'estimators': 25, 'RocAucScore': 0.7044648613797551},
'a_relative'  : {'estimators': 50, 'RocAucScore': 0.7105254481763857},
'a_topics'  : {'estimators': 10, 'RocAucScore': 0.7673299739760315},
'a_wh_question'  : {'estimators': 50, 'RocAucScore': 0.6482704456880926},
'a_yn_question'  : {'estimators': 100, 'RocAucScore': 0.7432410356938658},
'b_affirmative'  : {'estimators': 100, 'RocAucScore': 0.7744665422060389},
'b_conditional'  : {'estimators': 50, 'RocAucScore': 0.820222782681799},
'b_doubt_question'  : {'estimators': 25, 'RocAucScore': 0.8274344999671679},
'b_emphasis'  : {'estimators': 50, 'RocAucScore': 0.7621224325412808},
'b_negative'  : {'estimators': 100, 'RocAucScore': 0.8080849621605088},
'b_relative'  : {'estimators': 10, 'RocAucScore': 0.8524036030037836},
'b_topics'  : {'estimators': 100, 'RocAucScore': 0.7710287273310865},
'b_wh_question'  : {'estimators': 25, 'RocAucScore': 0.8498059263466818},
'b_yn_question'  : {'estimators': 100, 'RocAucScore': 0.7652011466486295}}

In [4]:
# Result stores, each mapping dataset name -> value. They are filled in place
# by trainDefault (acc*Default*) and trainPre (acc*Pre*, time*Pre*).
# Re-running this cell resets all of them to empty dicts.
accDefaultMLP, accDefaultBagging = {}, {}   # test accuracy, ./SplitData/ inputs
accPreMLP, accPreBagging = {}, {}           # test accuracy, ./SplitPreprocessedData/ inputs
timePreMLP, timePreBag = {}, {}             # elapsed perf_counter() seconds

In [16]:
def trainDefault(name):
    """Fit a single MLP and a bagged MLP ensemble on the ``./SplitData/`` split.

    Reads ``./SplitData/<name>_{X,y}_{train,test}.csv``, trains both models
    with the hyperparameters stored for ``name`` in the module-level
    ``paramsMLP`` / ``paramsBag`` tables, and records the test-set accuracy.

    Parameters
    ----------
    name : str
        Dataset key; must exist in ``paramsMLP`` and ``paramsBag`` and have
        the four CSV files on disk.

    Returns
    -------
    None
        Results are written to ``accDefaultMLP[name]`` and
        ``accDefaultBagging[name]``.

    Raises
    ------
    KeyError
        If ``name`` has no entry in ``paramsMLP`` or ``paramsBag``.
    FileNotFoundError
        If one of the split CSV files is missing.
    """
    prefix = "./SplitData/" + name
    X_train = pd.read_csv(prefix + "_X_train.csv")
    X_test = pd.read_csv(prefix + "_X_test.csv")
    # The label files load as one-column frames; flatten them to 1-D arrays.
    y_train = column_or_1d(pd.read_csv(prefix + "_y_train.csv"), warn=False)
    y_test = column_or_1d(pd.read_csv(prefix + "_y_test.csv"), warn=False)

    # Hyperparameters come from the tables defined in this notebook so the
    # function does not depend on variables left over from earlier sessions.
    mlp_params = paramsMLP[name]
    mlp = MLPClassifier(
        random_state=42,
        hidden_layer_sizes=(30, 30),
        max_iter=50,
        activation="relu",
        solver="adam",
        alpha=mlp_params['alpha'],
        learning_rate_init=mlp_params['learning_rate_init'],
    )
    # Seed the ensemble as well: bootstrap sampling and the per-member seeds
    # are drawn from the bagging random state, so without it accuracies
    # change from run to run.
    bag = BaggingClassifier(
        mlp,
        n_estimators=paramsBag[name]['estimators'],
        random_state=42,
    )

    mlp.fit(X_train, y_train)
    bag.fit(X_train, y_train)

    accDefaultMLP[name] = accuracy_score(y_test, mlp.predict(X_test))
    accDefaultBagging[name] = accuracy_score(y_test, bag.predict(X_test))

def trainPre(name, evaluate_mlp=False):
    """Fit a bagged MLP ensemble on the ``./SplitPreprocessedData/`` split.

    Reads ``./SplitPreprocessedData/<name>_{X,y}_{train,test}.csv``, trains a
    ``BaggingClassifier`` of MLPs with the hyperparameters stored for ``name``
    in ``paramsMLP`` / ``paramsBag``, and records test accuracy and elapsed
    time. Optionally does the same for a single stand-alone MLP.

    Parameters
    ----------
    name : str
        Dataset key; must exist in ``paramsMLP`` and ``paramsBag`` and have
        the four CSV files on disk.
    evaluate_mlp : bool, default False
        When True, also fit one stand-alone MLP and record its accuracy and
        time. The default keeps the bagging-only behaviour.

    Returns
    -------
    None
        Results are written to ``accPreBagging[name]`` and
        ``timePreBag[name]`` (and to ``accPreMLP[name]`` / ``timePreMLP[name]``
        when ``evaluate_mlp`` is True). Times are ``perf_counter()`` seconds
        covering model construction, fit, predict and scoring; CSV loading
        is excluded.

    Raises
    ------
    KeyError
        If ``name`` has no entry in ``paramsMLP`` or ``paramsBag``.
    FileNotFoundError
        If one of the split CSV files is missing.
    """
    prefix = "./SplitPreprocessedData/" + name
    X_train = pd.read_csv(prefix + "_X_train.csv")
    X_test = pd.read_csv(prefix + "_X_test.csv")
    # The label files load as one-column frames; flatten them to 1-D arrays.
    y_train = column_or_1d(pd.read_csv(prefix + "_y_train.csv"), warn=False)
    y_test = column_or_1d(pd.read_csv(prefix + "_y_test.csv"), warn=False)

    mlp_params = paramsMLP[name]

    def make_mlp():
        # Fresh, unfitted MLP configured with this dataset's hyperparameters.
        return MLPClassifier(
            random_state=42,
            hidden_layer_sizes=(30, 30),
            max_iter=50,
            activation="relu",
            solver="adam",
            alpha=mlp_params['alpha'],
            learning_rate_init=mlp_params['learning_rate_init'],
        )

    if evaluate_mlp:
        # Timed separately so the MLP run does not inflate the bagging time.
        started = perf_counter()
        mlp = make_mlp()
        mlp.fit(X_train, y_train)
        accPreMLP[name] = accuracy_score(y_test, mlp.predict(X_test))
        timePreMLP[name] = perf_counter() - started

    started = perf_counter()
    # Seed the ensemble: bootstrap sampling and the per-member seeds are drawn
    # from the bagging random state, so without it the reported accuracies
    # change from run to run.
    bag = BaggingClassifier(
        make_mlp(),
        n_estimators=paramsBag[name]['estimators'],
        random_state=42,
    )
    bag.fit(X_train, y_train)
    accPreBagging[name] = accuracy_score(y_test, bag.predict(X_test))
    timePreBag[name] = perf_counter() - started

In [17]:
# Train and score every dataset on its preprocessed split. The run on the
# ./SplitData/ split (trainDefault) is currently switched off.
for i in datasets:
    # trainDefault(i)
    trainPre(i)

In [18]:
def printDict(d):
    """Print each entry of ``d`` as ``key  : value`` on its own line.

    Entries appear in the dict's iteration order, and a blank line is printed
    after the last one (also when ``d`` is empty).
    """
    for key, value in d.items():
        print(key, " :", value)
    print()

In [19]:
# Report bagging accuracy and elapsed time per dataset. The remaining result
# stores are not filled by the current run, so their printouts stay disabled.
# print(accDefaultMLP)
# print(accDefaultBagging)
# printDict(accPreMLP)
printDict(accPreBagging)
# printDict(timePreMLP)
printDict(timePreBag)

a_affirmative  : 0.7366255144032922
a_conditional  : 0.7058823529411765
a_doubt_question  : 0.6644518272425249
a_emphasis  : 0.8306709265175719
a_negative  : 0.627906976744186
a_relative  : 0.7270992366412213
a_topics  : 0.7936893203883495
a_wh_question  : 0.5466666666666666
a_yn_question  : 0.6273291925465838
b_affirmative  : 0.6735537190082644
b_conditional  : 0.7011494252873564
b_doubt_question  : 0.65625
b_emphasis  : 0.6212624584717608
b_negative  : 0.6737804878048781
b_relative  : 0.8166259168704156
b_topics  : 0.7819905213270142
b_wh_question  : 0.7755102040816326
b_yn_question  : 0.726790450928382

a_affirmative  : 12.688171799999964
a_conditional  : 24.17234099999996
a_doubt_question  : 3.4096673999999894
a_emphasis  : 15.869863899999928
a_negative  : 6.621265200000153
a_relative  : 24.439079300000003
a_topics  : 4.013929799999914
a_wh_question  : 15.474587400000019
a_yn_question  : 31.738706000000093
b_affirmative  : 24.581829599999992
b_conditional  : 21.01357810000013
b_dou