In [None]:
import numpy as np
import pandas as pd
import pickle

from joblib import dump, load
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from keras.utils import to_categorical
from random import sample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import model_selection
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
import seaborn as sns
import scipy.stats as st
from math import isnan
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

import seaborn as sns; sns.set()
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the training table from disk and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=3)

In [None]:
# Target: last column of the training table.
y = df.iloc[:, -1]
# Features: every column except the first and the last one.
X_train_nan = df.iloc[:, 1:-1]
X_train_nan.head(n=6)

In [None]:
# Read the test table; its features are every column except the first one.
df_test = pd.read_csv(filepath_or_buffer='test.csv')
X_test_nan = df_test.iloc[:, 1:]
X_test_nan.head(n=6)

In [None]:
# Stack train rows on top of test rows; the resulting frame gets default
# integer row and column labels because it is rebuilt from a bare array.
X_all_nan = pd.DataFrame(
    np.concatenate([X_train_nan, X_test_nan], axis=0)
)
print(X_all_nan.shape)

In [None]:
# Print how many training rows carry each label 0..9.
# A plain loop is used instead of a list comprehension so the cell does not
# also display a meaningless list of ``None`` values as its output.
y_arr = np.array(y)
for label in range(10):
    print(len(y_arr[y_arr == label]))

In [None]:
# Pearson correlation between all feature columns (train and test rows together),
# drawn as a square-celled heatmap with a diverging palette.
f, ax = plt.subplots(figsize=(22, 18))
corr = X_all_nan.corr(method='pearson')
cmap = sns.diverging_palette(10, 275, as_cmap=True)
sns.heatmap(
    corr,
    ax=ax,
    cmap=cmap,
    square=True,
    linewidths=0.5,
    cbar_kws=dict(shrink=0.5),
)

In [None]:
def plot_class(X, y, j):
    """Plot column ``j`` of ``X`` against ``y`` with quantile guides.

    Parameters
    ----------
    X : DataFrame; the column is selected by position with ``iloc``.
    y : values drawn on the vertical axis, one per row of ``X``.
    j : positional index of the column to plot.

    Draws blue ``+`` markers, adds red vertical lines at the 0.1 and 0.90
    quantiles of the column, and shows the figure. Returns nothing.
    """
    column = X.iloc[:, j]
    plt.plot(column, y, 'b+')
    for q in (0.1, 0.90):
        plt.axvline(x=column.quantile(q), color='r')
    plt.show()

In [None]:
# Inspect the training column at position 9 against the labels.
plot_class(X_train_nan, y_arr, j=9)

In [None]:
# Standardise each column: subtract its mean, divide by its standard deviation.
# Statistics are computed over train and test rows together; pandas skips NaN
# when computing them, so missing entries stay NaN in the result.
X_all_nan_scaled = X_all_nan.sub(X_all_nan.mean()).div(X_all_nan.std())
X_all_nan_scaled.head(n=3)

In [None]:
# Replace every missing entry with the mean of its (already scaled) column.
X_all = X_all_nan_scaled.fillna(value=X_all_nan_scaled.mean())
X_all.head(n=5)

In [None]:
# Build a 0/1 indicator frame: 1 where the raw (unscaled, unimputed) value is
# missing, 0 otherwise. ``isna()`` does this in one vectorised pass instead of
# visiting every cell through ``iloc`` in a Python double loop.
# Rebuilding the frame from the bare array gives default integer row and
# column labels, exactly like the loop-filled array did.
nan_factors = pd.DataFrame(X_all_nan.isna().to_numpy().astype(int))
nan_factors.head(5)

In [None]:
# Relabel the indicator columns so they start at 110 and therefore do not
# collide with the labels of the value columns they are later joined to.
# Assigning an explicit range (rather than adding 110 to the current labels)
# keeps the cell idempotent: re-running it does not shift the labels again.
nan_factors.columns = pd.RangeIndex(110, 110 + nan_factors.shape[1])
print(nan_factors.columns[0])

In [None]:
# Put the imputed values and the missing-value indicators side by side.
X_all_expanded = pd.concat(objs=[X_all, nan_factors], axis='columns')
X_all_expanded.head(n=6)

In [None]:
# Undo the earlier stacking: the first len(X_train_nan) rows are the training
# rows, everything after them is the test set. Deriving the boundary from the
# training frame avoids a hard-coded row count that would silently mix train
# and test rows if the input files change size.
n_train = len(X_train_nan)
X = X_all_expanded.iloc[:n_train, :]
X_test = X_all_expanded.iloc[n_train:, :]
print(X.shape)
print(X_test.shape)

In [None]:
def hist_class(X, y, i, j):
    """Show a histogram of column ``j`` of ``X`` for rows labelled ``i``.

    Parameters
    ----------
    X : DataFrame; the column is selected by position with ``iloc``.
    y : labels compared element-wise against ``i``.
    i : label value whose rows are kept.
    j : positional index of the column to plot.
    """
    selected_rows = np.where(y == i)
    column_values = np.array(X.iloc[:, j])
    plt.hist(column_values[selected_rows])
    plt.show()
# Distribution of the first column among rows labelled 6.
hist_class(X, y, i=6, j=0)

In [None]:
# Annotated Pearson correlation heatmap for the columns at positions 90..109,
# with the colour scale pinned to the full [-1, 1] range.
plt.figure(figsize=(32, 12))
sns.heatmap(
    X.iloc[:, 90:110].corr(method='pearson'),
    annot=True,
    vmin=-1,
    vmax=1,
)

In [None]:
# Pairwise scatter/density plots of columns '90' and '52', coloured by label.
# pairplot is a figure-level function: it creates its own figure and adds its
# own legend for ``hue``, so no separate plt.figure()/plt.legend() calls are
# made (they only produced an empty extra figure and an empty legend).
sns.pairplot(df[['90', '52', 'label']], hue='label')
plt.show()

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Boolean flag per column: True where the column is stored with an integer dtype.
discrete_features = X.dtypes == int


def make_mi_scores(X_mi, y, discrete_features):
    """Return mutual-information scores of each feature with the target.

    Parameters
    ----------
    X_mi : DataFrame of features; its column labels index the result.
    y : target labels.
    discrete_features : forwarded unchanged to ``mutual_info_classif``.

    Returns
    -------
    pandas Series named "MI Scores", sorted from highest to lowest score.
    """
    raw_scores = mutual_info_classif(X_mi, y, discrete_features=discrete_features)
    return pd.Series(
        raw_scores, name="MI Scores", index=X_mi.columns
    ).sort_values(ascending=False)


mi_scores = make_mi_scores(X, y, discrete_features=discrete_features)
mi_scores

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current figure.

    ``scores`` is a pandas Series; bars are ordered ascending by value and
    each bar is labelled on the y-axis with its index entry.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered, color='steelblue', edgecolor='black')
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

# Tall figure so that every feature gets its own readable bar.
plt.figure(figsize=(10, 50), dpi=100)
plot_mi_scores(scores=mi_scores)

In [None]:
# Five shuffled, label-stratified folds with a fixed seed.
KF = StratifiedKFold(
    n_splits=5,
    random_state=10,
    shuffle=True,
)
KF.get_n_splits(X, y)

In [None]:
# Materialise every fold once: for fold k, Xs_train[k]/ys_train[k] hold the
# training part and Xs_val[k]/ys_val[k] the held-out part.
# Each split yields one 4-tuple; zip(*) transposes them into four per-fold lists.
Xs_train, ys_train, Xs_val, ys_val = map(
    list,
    zip(*[
        (X.iloc[fit_rows, :], y[fit_rows], X.iloc[held_rows, :], y[held_rows])
        for fit_rows, held_rows in KF.split(X, y)
    ]),
)

In [None]:
# Base estimators. They are only configured here; nothing is fitted in this cell.

# Gaussian naive Bayes with default settings.
GNB = GaussianNB()

# Random forest: 500 trees, no bootstrap resampling, balanced class weights.
RF = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=2,
    bootstrap=False,
    class_weight='balanced',
    random_state=1,
    n_jobs=-1,
)

# L1-penalised multinomial logistic regression fitted with the saga solver.
LR = LogisticRegression(
    penalty='l1',
    C=0.2,
    solver='saga',
    dual=False,
    multi_class='multinomial',
    class_weight='balanced',
    random_state=1,
    n_jobs=-1,
)

# AdaBoost over class-balanced decision trees.
ADA = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=1, class_weight='balanced'),
    n_estimators=500,
    learning_rate=0.5,
    random_state=1,
)

# Bagging over class-balanced decision trees, without bootstrap resampling.
BC = BaggingClassifier(
    DecisionTreeClassifier(random_state=1, class_weight='balanced'),
    n_estimators=500,
    bootstrap=False,
    random_state=1,
    n_jobs=-1,
)

# Extra-trees: 500 trees, no bootstrap; note the seed differs from the others.
ETC = ExtraTreesClassifier(
    n_estimators=500,
    min_samples_split=2,
    bootstrap=False,
    class_weight='balanced',
    random_state=500,
    n_jobs=-1,
)

# Gradient boosting with 500 stages.
GB = GradientBoostingClassifier(
    n_estimators=500,
    validation_fraction=0.25,
    random_state=1,
)

# Distance-weighted 10-nearest-neighbours.
KNN = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    n_jobs=-1,
)

# Multi-layer perceptron with early stopping on a 25% validation split.
MLP = MLPClassifier(
    hidden_layer_sizes=(50, 20, 1),
    learning_rate_init=0.0001,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.25,
    n_iter_no_change=50,
    tol=0.000001,
    random_state=1,
)

In [None]:
# Stacked ensemble: the nine base estimators feed a random-forest meta-learner.
SC = StackingClassifier(
    estimators=[
        ('GNB', GNB),
        ('RF', RF),
        ('LR', LR),
        ('ADA', ADA),
        ('BC', BC),
        ('ETC', ETC),
        ('GB', GB),
        ('KNN', KNN),
        ('MLP', MLP),
    ],
    final_estimator=RandomForestClassifier(
        n_estimators=500,
        min_samples_split=2,
        bootstrap=False,
        class_weight='balanced',
        random_state=1,
    ),
    n_jobs=1,
)

In [None]:
def fit_model(X, y, model, pl):
    """Fit ``model`` on ``(X, y)`` in place and return that same object.

    Parameters
    ----------
    X, y : training features and targets, passed straight to ``model.fit``.
    model : any object exposing ``fit(X, y)``.
    pl : progress label; printed before the model itself so that long
        training runs show which call is currently executing.

    Returns
    -------
    The ``model`` object that was passed in (not the return value of ``fit``).
    """
    for item in (pl, model):
        print(item)
    model.fit(X, y)
    return model

In [None]:
# Every estimator to train: the nine base learners first, the stack last.
all_models = [
    GNB, RF, LR,
    ADA, BC, ETC,
    GB, KNN, MLP,
    SC,
]

In [None]:
# Train one independent set of estimators per cross-validation fold.
#
# Each estimator is cloned before fitting. fit_model() fits and returns the
# very object it is given, so fitting the shared objects in all_models fold
# after fold would make models_1..models_5 all point at the same estimators,
# left in the state of the last fold. Validation predictions for the earlier
# folds would then come from models that had seen those validation rows.
from sklearn.base import clone

models_1, models_2, models_3, models_4, models_5 = [
    [fit_model(Xs_train[fold], ys_train[fold], clone(m), pl=fold) for m in all_models]
    for fold in range(5)
]

In [None]:
# preds[m][k] = class probabilities of base model m (positions 0..8 of each
# fold's model list) on the validation part of fold k.
models_agg = [models_1, models_2, models_3, models_4, models_5]
preds = [[] for _ in range(9)]
for i in range(5):
    # zip stops after the nine prediction buckets, so any model stored after
    # position 8 in the fold's list is not queried here.
    for bucket, fitted in zip(preds, models_agg[i]):
        bucket.append(fitted.predict_proba(Xs_val[i]))

In [None]:
def predict_y(w):
    """Mean validation accuracy of the weighted blend of base-model probabilities.

    Reads the notebook-level ``preds`` (per-model, per-fold probability
    arrays) and ``ys_val`` (per-fold true labels).

    Parameters
    ----------
    w : sequence of weights, indexed like ``preds`` (one weight per model).

    Returns
    -------
    Accuracy of the arg-max blended prediction, averaged over the folds.

    NOTE(review): stacking ``preds`` into one array and taking argmax over
    axis 2 presumes all folds yield equally shaped probability arrays and
    that column positions equal the label values -- confirm.
    """
    stacked = np.array(preds)
    weighted = np.array([stacked[m] * w[m] for m in range(len(stacked))])
    blended = np.sum(weighted, axis=0)
    labels_hat = np.argmax(blended, axis=2)
    fold_accuracy = np.array([
        metrics.accuracy_score(truth, labels_hat[k])
        for k, truth in enumerate(ys_val)
    ])
    return np.mean(fold_accuracy, 0)

In [None]:
# Optimisation problem handed to the metaheuristic: maximise predict_y over
# nine weights, each bounded to [-10, 10].
problem_dict1 = dict(
    obj_func=predict_y,
    lb=[-10] * 9,
    ub=[10] * 9,
    minmax="max",
    verbose=True,
)

In [None]:
from mealpy.bio_based import SMA
from mealpy.evolutionary_based import DE

In [None]:
# Search for the blend weights with JADE (a differential-evolution variant
# from mealpy): 1000 epochs, population of 100.
model1 = DE.JADE(
    problem_dict1,
    epoch=1000,
    pop_size=100,
    miu_f=0.5,
    miu_cr=0.5,
    pt=0.1,
    ap=0.1,
)
model1.solve()

In [None]:
# Refit every estimator on the complete training set for the final predictions.
models_final = [
    fit_model(X, y, estimator, pl=0)
    for estimator in all_models
]

In [None]:
# Take the first element of the optimiser's solution as the blend weights.
# NOTE(review): presumably element 0 is the best position vector (one weight
# per base model) -- confirm against the installed mealpy version.
weights = model1.solution[0]

In [None]:
# Class probabilities of every final model on the training rows.
preds_train = [
    estimator.predict_proba(X)
    for estimator in models_final
]

In [None]:
# Weighted blend on the training rows: scale each model's probabilities by its
# weight, add them up, and pick the column with the largest blended score.
# Only as many models as there are weights take part in the blend.
probs_train_w = [preds_train[k] * weight for k, weight in enumerate(weights)]
probs_train_agg = np.sum(probs_train_w, axis=0)
y_train_hat = probs_train_agg.argmax(axis=1)

In [None]:
# Accuracy of the blended prediction on the rows the models were trained on.
metrics.accuracy_score(y_true=y, y_pred=y_train_hat)

In [None]:
# Class probabilities of every final model on the test rows.
preds_test = [
    estimator.predict_proba(X_test)
    for estimator in models_final
]

In [None]:
# Weighted blend on the test rows: scale each model's probabilities by its
# weight, add them up, and pick the column with the largest blended score.
# Only as many models as there are weights take part in the blend.
probs_test_w = [preds_test[k] * weight for k, weight in enumerate(weights)]
probs_test_agg = np.sum(probs_test_w, axis=0)
y_test_hat = probs_test_agg.argmax(axis=1)

In [None]:
# Write the predicted test labels, one per line. fmt="%d" keeps them as plain
# integers; savetxt's default format would write each label as a float in
# scientific notation (e.g. 3.000000000000000000e+00).
np.savetxt("test_data_w.csv", y_test_hat, fmt="%d", delimiter=",")