# Advanced Machine Learning
## Project 2
***Zuzanna Glinka, Nikola Miszalska, Malwina Wojewoda***

In [1]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV, RFE
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
from xgboost import XGBRFClassifier
from lightgbm import LGBMClassifier
from itertools import compress, combinations
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from boruta import BorutaPy
import optuna
import optuna.visualization as vis
import ast
import warnings

from eda import features_correlations, target_correlations, select_features_with_rfecv
from model_selection import explore_models, grid_search_CV, multiple_cv
from utils import *
# Seed NumPy's global RNG once, before any cell draws from it.
np.random.seed(0)
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides convergence/deprecation warnings from the models below.
warnings.filterwarnings('ignore') 

In [2]:
# Training features: space-separated values, no header row.
X = pd.read_csv('data/x_train.txt', sep=' ', header=None)
# Training labels: no header row.
y = pd.read_csv('data/y_train.txt', header=None)

In [None]:
# Hold out 30% of the rows for validation.
# random_state is passed explicitly so that re-running this cell always yields
# the same split instead of depending on how far the global NumPy RNG has advanced.
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Reduce the one-column label frames to Series with a fresh 0..n-1 index.
y_train = y_train.iloc[:, 0].reset_index(drop=True)
y_val = y_val.iloc[:, 0].reset_index(drop=True)

In [None]:
# Learn the scaling statistics on the training split only, then apply them to both splits.
scaler = StandardScaler().fit(x_train)
x_train_standardized = scaler.transform(x_train)
x_val_standardized = scaler.transform(x_val)

## EDA

#### Correlations between features

In [None]:
# features_correlations is a project helper; it is unpacked into three objects here.
correlation_outputs = features_correlations(X)
merged_corr_df, linear_correlations, non_linear_correlations = correlation_outputs
merged_corr_df.head()

In [None]:
# Heatmap of the non-linear feature-feature correlation matrix.
# The figure size is set once (500x500) instead of being created at 800x800
# and then overridden.
fig = go.Figure(data=go.Heatmap(z=non_linear_correlations.values))
fig.update_layout(
    title="Correlation heatmap",
    xaxis=dict(title="Feature"),
    yaxis=dict(title="Feature"),
    width=500,
    height=500
)
fig.show()

Variables 0 to 9 are highly correlated with each other.
Variables 0 to 9 are also more strongly correlated with variables 100 to 109 than with the rest.
The remaining variables are not correlated.

#### Pair plot for highly correlated variables

In [None]:
# Pairwise scatter plots for the first 15 features (columns 0-14).
pairplot = sns.pairplot(X.iloc[:, :15])
pairplot.fig.suptitle("Pairplot of features 0-14", fontsize=60, y=1.02)
# Re-apply each axis label with a larger font so it stays readable on the big grid.
for ax in pairplot.axes.flat:
    x_label, y_label = ax.get_xlabel(), ax.get_ylabel()
    ax.set_xlabel(x_label, fontsize=50)
    ax.set_ylabel(y_label, fontsize=50)
plt.show()

#### Mean and variance of features

In [None]:
# Per-feature variance and mean, rounded to one decimal, keyed by the feature index.
df_variance = X.var().round(1).to_frame(name='variance').reset_index()
df_mean = X.mean().round(1).to_frame(name='mean').reset_index()
# Join the two tables on the feature index and order rows by mean, then variance.
df_statistics = df_variance.merge(df_mean, on='index').sort_values(
    by=['mean', 'variance', 'index']
)
df_statistics

#### Correlations between features and target variable

In [None]:
# Pass the labels as a Series (the single column of y) to the project helper.
target_correlations(X, y.iloc[:, 0])

## Feature selection

In [None]:
# Write a header-only CSV with the three feature-selection result columns.
feature_selection_columns = ['Method', 'Selected features', 'Importances']
df = pd.DataFrame(columns=feature_selection_columns)
df.to_csv('feature_selection.csv', index=False)

#### RFECV: Recursive Feature Elimination with Cross-Validation
##### Random Forest

In [None]:
# RFECV driven by a seeded 100-tree random forest (slow to run).
rf = RandomForestClassifier(random_state=17, n_estimators=100)
selected_features, feature_importances = select_features_with_rfecv(
    X, y, model=rf
)

Selected features (saved just in case because it's time-consuming): [8, 100, 101, 102, 103, 104, 105]

##### LightGBM

In [None]:
# RFECV driven by a seeded LightGBM classifier with logging silenced.
lgbm = LGBMClassifier(random_state=17, n_estimators=100, verbosity=-1)
selected_features, feature_importances = select_features_with_rfecv(
    X, y, model=lgbm
)

Selected features: [0, 100, 101, 102, 103, 104, 105, 403]

##### Logistic Regression

In [None]:
# RFECV driven by a logistic regression with default settings.
lr = LogisticRegression()
selected_features, feature_importances = select_features_with_rfecv(
    X, y, model=lr
)

### Selection with boruta

In [None]:
# Workaround for an outdated package: BorutaPy still uses the NumPy scalar
# aliases np.int / np.float / np.bool, which were removed in NumPy 1.24.
# They were plain aliases of the Python builtins, so restore exactly those,
# and only when the attribute is really missing (NumPy 2 defines np.bool again).
for alias_name, builtin_type in (('int', int), ('float', float), ('bool', bool)):
    if not hasattr(np, alias_name):
        setattr(np, alias_name, builtin_type)

rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
boruta = BorutaPy(estimator=rf, n_estimators='auto', verbose=0, random_state=1)
# Pass the labels as a flat 1-D array rather than an (n, 1) column vector.
boruta.fit(X.to_numpy(), y.to_numpy().ravel())

selected_features = X.columns[boruta.support_].tolist()
print('Selected features: ', selected_features)
# Turn ranks into importances: rank 1 -> 1.0, rank 2 -> 0.5, ...
boruta_importance = [1/i for i in boruta.ranking_]

### Feature importance with RF
#### Gini

In [None]:
# Impurity-based importances from a seeded random forest using the Gini criterion.
rf_gini = RandomForestClassifier(criterion='gini', random_state=17)
rf_gini.fit(X, y)
rf_gini_importance = rf_gini.feature_importances_
# Ten features with the largest importance, most important first.
gini_top_positions = np.argsort(rf_gini.feature_importances_)[::-1][:10]
selected_features = X.columns[gini_top_positions]
print('Selected features: ', selected_features)

#### Entropy

In [None]:
# Impurity-based importances from a seeded random forest using the entropy criterion.
rf_entropy = RandomForestClassifier(criterion='entropy', random_state=17)
rf_entropy.fit(X, y)
rf_entropy_importance = rf_entropy.feature_importances_
# Ten features with the largest importance, most important first.
entropy_top_positions = np.argsort(rf_entropy.feature_importances_)[::-1][:10]
selected_features = X.columns[entropy_top_positions]
print('Selected features: ', selected_features)

### Ridge -  permutation importance

In [None]:
# Fit a (nearly unregularised) ridge regressor on the standardized training split.
model = Ridge(alpha=1e-5).fit(x_train_standardized, y_train)
# Report the validation score instead of computing it and discarding the value
# (Ridge.score is the R^2 coefficient of determination).
print('Validation score: ', model.score(x_val_standardized, y_val))
# Permutation importance on the validation split: 30 shuffles per feature, seeded.
r = permutation_importance(model, x_val_standardized, y_val,
                           n_repeats=30,
                           random_state=0)
r_permutation_importance = r.importances_mean
print('Selected features: ', r.importances_mean.argsort()[::-1][:10])

### XGBoost Random Forest - permutation importance

In [None]:
# Fit an XGBoost random-forest classifier on the standardized training split.
model = XGBRFClassifier().fit(x_train_standardized, y_train)
# Report the validation score instead of computing it and discarding the value.
print('Validation score: ', model.score(x_val_standardized, y_val))
# Permutation importance on the validation split: 30 shuffles per feature, seeded.
xgb = permutation_importance(model, x_val_standardized, y_val,
                             n_repeats=30,
                             random_state=0)
print('Selected features: ', xgb.importances_mean.argsort()[::-1][:10])
xgb_permutation_importance = xgb.importances_mean

### AdaBoost - permutation importance

In [None]:
# Fit an AdaBoost classifier (default settings) on the standardized training split.
model = AdaBoostClassifier().fit(x_train_standardized, y_train)
# Report the validation score instead of computing it and discarding the value.
print('Validation score: ', model.score(x_val_standardized, y_val))
# Permutation importance on the validation split: 30 shuffles per feature, seeded.
ada = permutation_importance(model, x_val_standardized, y_val,
                             n_repeats=30,
                             random_state=0)
print('Selected features: ', ada.importances_mean.argsort()[::-1][:10])
adaboost_permutation_importance = ada.importances_mean

### RFE

In [None]:
# Two-stage elimination: plain RFE first (default target: half of the features),
# then cross-validated RFE on the survivors.
y_train_1d = y_train.to_numpy().ravel()
n_features = x_train_standardized.shape[1]  # derived from the data, not hard-coded

etc = RandomForestClassifier(n_estimators=100, criterion='gini')
rfe = RFE(etc)
rfe.fit(x_train_standardized, y_train_1d)

x_train_dropped = x_train_standardized[:, rfe.support_]

etc = RandomForestClassifier(n_estimators=100, criterion='gini')
rfecv = RFECV(etc, cv=3)
rfecv.fit(x_train_dropped, y_train_1d)
x_train_final = x_train_dropped[:, rfecv.support_]

# Map the second-stage mask back to original column positions.
sel_f = [i for i, e in enumerate(rfe.support_) if e]
selected_columns_rfe = list(compress(sel_f, rfecv.support_))

# Equal share of importance for every selected column, zero for the rest.
selected_lookup = set(selected_columns_rfe)
importance_rfe = [1/len(selected_columns_rfe) if i in selected_lookup else 0
                  for i in range(n_features)]
print('Selected features: ',selected_columns_rfe)

### Voting ensemble

In [None]:
# Average the min-max-scaled importance vectors of all selection methods.
importance_sources = [
    boruta_importance,
    rf_gini_importance,
    rf_entropy_importance,
    r_permutation_importance,
    xgb_permutation_importance,
    adaboost_permutation_importance,
    importance_rfe,
]
weights = np.zeros((X.shape[1], 1))
for importance in importance_sources:
    # Scale each method to [0, 1] so that no single method dominates the sum.
    ss = MinMaxScaler()
    weights += ss.fit_transform(np.asarray(importance, dtype=float).reshape(-1, 1))
# Every method (RFE included) is counted exactly once, and weights stays a single
# (n_features, 1) column: adding a flat length-n list to it would broadcast to an
# n x n matrix.
weights = weights / len(importance_sources)
# Feature positions ordered from most to least important.
important_feature = weights[:, 0].argsort()[::-1]
weights_sorted = weights[important_feature]

In [None]:
# Bar chart of the 20 highest averaged importances; x labels are 1-based feature numbers.
top_20_features = [str(position + 1) for position in important_feature[:20]]
top_20_values = weights_sorted.T[0][:20]

importance_fig, importance_ax = plt.subplots(figsize=(10, 5))
importance_ax.bar(top_20_features, top_20_values)
importance_ax.set_xlabel('Feature')
importance_ax.set_ylabel('Mean importance')
importance_ax.set_title('Top 20 most important features')
importance_fig.tight_layout()
plt.show()

## Initial model exploration

In [None]:
# Run the project's explore_models helper on the raw (unscaled) train/validation split.
# NOTE(review): the next cell reads 'results/initial_models_results.csv' — presumably
# written by this helper; confirm in model_selection.py.
initial_exploration_results = explore_models(x_train, y_train, x_val, y_val)

In [None]:
# Load the header-less results file and give its first four columns readable names.
initial_result_names = {0: 'Model', 1: 'Features', 2: 'Score train', 3: 'Score valid'}
df_initial_models = pd.read_csv('results/initial_models_results.csv', header=None)
df_initial_models = df_initial_models.rename(columns=initial_result_names)
# Order the model categories by descending median validation score for plotting.
medians = (
    df_initial_models
    .groupby(['Model'])['Score valid']
    .median()
    .sort_values(ascending=False)
)
df_initial_models['Model'] = pd.Categorical(
    df_initial_models['Model'], categories=medians.index, ordered=True
)

In [None]:
# Validation-score distribution per model, in the categorical order set on 'Model'.
plt.figure(figsize=(14, 5))
models_ax = sns.boxplot(data=df_initial_models, x='Model', y='Score valid', width=0.5)
models_ax.set_title('Distribution of custom metric scores across models')
models_ax.set_xlabel('Model')
models_ax.set_ylabel('Custom metric value')
# Tilt the model names so that long labels do not overlap.
plt.setp(models_ax.get_xticklabels(), rotation=60)
plt.show()

## Testing AutoGluon 

In [None]:
# Evaluate every 3-feature combination of the six candidate features and append
# one row per combination to the results CSV.
csv_file = 'results_automl.csv'
candidate_features = [100, 102, 105, 103, 101, 104]
for features in combinations(candidate_features, 3):
    m, path = calculate_metric_automl(features, X, y)
    metric = np.mean(m)
    row = {'Features': [features], 'Mean metric': metric, 'paths': path}
    row_frame = pd.DataFrame(row)
    # Append mode without a header: the file accumulates rows across iterations and runs.
    row_frame.to_csv(csv_file, mode='a', header=False, index=False)

## Tuning hyperparameters

In [None]:
# Restrict the design matrix to the seven columns used for hyperparameter tuning.
X_ = X.loc[:, [100, 101, 102, 103, 104, 105, 8]]

### Grid search

#### QDA

In [None]:
# Regularisation strengths 0.0, 0.1, ..., 0.9.
qda_param_grid = {'reg_param': [i/10.0 for i in range(10)]}
# Store the outcome under a QDA-specific name (it used to be kept as `rf_results`,
# a leftover from another model) so it matches svc_results / rnc_results / mlp_results.
qda_results = grid_search_CV(QuadraticDiscriminantAnalysis, qda_param_grid, X_, y, range(2, 6))

#### SVC

In [None]:
# Candidate SVC settings; key order and value order are kept as originally listed.
svc_param_grid = dict(
    C=[0.001, 0.01, 0.1, 0.5, 1, 10, 20, 50, 100, 150],
    gamma=[1, 0.1, 0.01, 0.001, 'scale', 'auto'],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    degree=[1, 2, 3, 4, 5],
    coef0=[0, 0.1, 0.5, 1, 2, 5],
)

svc_results = grid_search_CV(SVC, svc_param_grid, X_, y, range(2, 6))

#### Radius Neighbors Classifier

In [None]:
# Candidate RadiusNeighborsClassifier settings; key and value order kept as listed.
rnc_param_grid = dict(
    radius=[1.8, 2, 3, 5, 10],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[10, 20, 30, 40, 50],
    p=[1, 2],
)

rnc_results = grid_search_CV(RadiusNeighborsClassifier, rnc_param_grid, X_, y, range(2, 6))

#### Multi-layer Perceptron

In [None]:
# Candidate MLP settings; key and value order kept as listed.
mlp_param_grid = dict(
    hidden_layer_sizes=[
        (20,), (50,), (100,), (200,),
        (50, 50), (100, 100), (200, 200),
        (100, 100, 100), (100, 100, 100, 100),
    ],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[0.0001, 0.001, 0.01, 0.1],
    learning_rate=['constant', 'invscaling', 'adaptive'],
    learning_rate_init=[0.001, 0.01, 0.1],
)
mlp_results = grid_search_CV(MLPClassifier, mlp_param_grid, X_, y, range(2, 6))

#### Gaussian Naive Bayes

In [None]:
# No hyperparameters are tuned for GaussianNB, hence the empty grid.
gnb_param_grid = dict()
gnb_results = grid_search_CV(GaussianNB, gnb_param_grid, X_, y, range(2, 6))

### Optuna
#### SVC

In [None]:
def objective(trial):
    """Optuna objective: mean custom score of an SVC over a seeded 5-fold CV.

    Samples C (log scale), kernel, gamma and, for the polynomial kernel only,
    degree and coef0. The model is fitted on the notebook-level ``X_`` / ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to store user attributes.

    Returns
    -------
    float
        Mean of ``custom_score`` over the five folds (the study maximises it).
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)
    kernel = trial.suggest_categorical('kernel', ['poly', 'rbf'])
    gamma = trial.suggest_float('gamma', 1e-3, 1e1)
    # degree and coef0 are only sampled for the polynomial kernel; otherwise fixed values are used.
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
    coef0 = trial.suggest_float('coef0', 0.0, 5.0) if kernel == 'poly' else 0.0

    svc = SVC(C=C, kernel=kernel, gamma=gamma, degree=degree, coef0=coef0, probability=True)
    cv = KFold(n_splits=5, shuffle=True, random_state=17)

    custom_scores = []
    for train_index, test_index in cv.split(X_, y):
        X_fold_train, X_fold_test = X_.iloc[train_index], X_.iloc[test_index]
        y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

        svc.fit(X_fold_train, y_fold_train)
        y_pred = svc.predict_proba(X_fold_test)

        # custom_score is a project helper: it receives the fold labels, the
        # positive-class probabilities and the number of features used.
        score = custom_score(y_fold_test, y_pred[:, 1], X_.shape[1])
        custom_scores.append(score)

    mean_score = np.mean(custom_scores)

    # Keep the full parameter set and the metric on the trial for later inspection.
    trial.set_user_attr('parameters', svc.get_params())
    trial.set_user_attr('metric', mean_score)
    return mean_score

# Maximise the objective; the five hand-picked configurations are queued first,
# so with n_trials=10 the sampler proposes the remaining five.
study = optuna.create_study(direction='maximize')
try_these_parameters_first = [
    dict(C=0.1, gamma=0.05, kernel='poly', degree=2, coef0=1),
    dict(C=0.001, gamma=0.5, kernel='poly', degree=3, coef0=2),
    dict(C=0.01, gamma=0.1, kernel='poly', degree=3, coef0=2),
    dict(C=0.1, gamma=1, kernel='poly', degree=2, coef0=1),
    dict(C=0.01, gamma=1, kernel='poly', degree=3, coef0=2),
]
for seed_parameters in try_these_parameters_first:
    study.enqueue_trial(seed_parameters)
study.optimize(objective, n_trials=10)

print(f"Best Score: {study.best_value}")
print(f"Best Parameters: {study.best_params}")

In [None]:
# Objective value per trial for the SVC study.
vis.plot_optimization_history(study)

In [None]:
# Estimated importance of each sampled hyperparameter for the objective.
vis.plot_param_importances(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameters against the objective value.
vis.plot_parallel_coordinate(study)

## Testing configurations with high score
#### SVC

In [None]:
def parse_column(s):
    """Convert the text of a Python literal (list, dict, tuple, number, ...) into its value.

    Uses ``ast.literal_eval``, so only literals are accepted and no code is executed.
    """
    parsed_value = ast.literal_eval(s)
    return parsed_value

In [None]:
# Load the header-less SVC results and name the first four columns.
svc_result_names = {0: 'Features', 1: 'Hyperparameters', 2: 'Score train', 3: 'Score valid'}
df = pd.read_csv('results/mean_results_svc.csv', header=None).rename(columns=svc_result_names)
# Both columns hold Python literals stored as text; turn them back into objects.
for literal_column in ('Hyperparameters', 'Features'):
    df[literal_column] = df[literal_column].apply(parse_column)
df['Score valid'] = pd.to_numeric(df['Score valid'])
# Keep only configurations whose validation score reaches 6900.
keep_rows = df['Score valid'] >= 6900
df = df[keep_rows]
config_sets = list(zip(df['Features'], df['Hyperparameters']))

In [None]:
# Re-evaluate the shortlisted (features, hyperparameters) pairs with the project's
# multiple_cv helper; probability=True is passed through as an extra keyword.
best_configs_results_svc = multiple_cv(X_, y, config_sets, SVC, probability=True)

#### QDA

In [None]:
# Load the header-less QDA results and name the first four columns.
qda_result_names = {0: 'Features', 1: 'Hyperparameters', 2: 'Score train', 3: 'Score valid'}
df = pd.read_csv('results/mean_results_qda.csv', header=None).rename(columns=qda_result_names)
# Both columns hold Python literals stored as text; turn them back into objects.
for literal_column in ('Hyperparameters', 'Features'):
    df[literal_column] = df[literal_column].apply(parse_column)
df['Score valid'] = pd.to_numeric(df['Score valid'])
# Keep only configurations whose validation score reaches 6900.
keep_rows = df['Score valid'] >= 6900
df = df[keep_rows]
config_sets = list(zip(df['Features'], df['Hyperparameters']))

In [None]:
# Re-evaluate the shortlisted QDA (features, hyperparameters) pairs with multiple_cv.
best_configs_results_qda = multiple_cv(X_, y, config_sets, QuadraticDiscriminantAnalysis)

## Testing the best configuration for each model
For best configs perform 5 times 5-fold cross-validation and take the mean score.

In [None]:
# Collects the scores of each final model, keyed by a display name.
all_results = dict()

In [None]:
# Degree-2 polynomial SVC with probability estimates, evaluated on four features.
svc_features = [101, 102, 105, 103]
svc = SVC(kernel='poly', degree=2, probability=True, random_state=42)
all_results['SVC'] = search(svc, svc_features, X, y)

In [None]:
# QDA with reg_param=0.5, evaluated on four features.
qda_features = [101, 102, 105, 103]
qda = QuadraticDiscriminantAnalysis(reg_param=0.5)
all_results['QDA'] = search(qda, qda_features, X, y)

In [None]:
# Three hidden layers of 50 units, adaptive learning rate, early stopping, seeded.
mlp_features = [8, 100, 102]
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50),
    alpha=1e-05,
    learning_rate='adaptive',
    early_stopping=True,
    random_state=0,
)
all_results['MLP'] = search(mlp, mlp_features, X, y)

In [None]:
# Gaussian naive Bayes with default settings, evaluated on four features.
nb_features = [101, 102, 103, 105]
nb = GaussianNB()
all_results['NB'] = search(nb, nb_features, X, y)

In [None]:
# Radius-neighbours classifier: radius 1.9, distance weighting, Manhattan metric (p=1).
rb_features = [100, 101, 102]
rb = RadiusNeighborsClassifier(radius=1.9, p=1, weights='distance')
all_results['RB'] = search(rb, rb_features, X, y)

### Soft voting ensemble

In [None]:
# Soft-voting ensemble: averages the class probabilities of the SVC, QDA and
# naive Bayes estimators defined in the cells above.
# The unused StratifiedKFold splitter was removed: it was never passed to anything,
# and StratifiedKFold is not imported by this notebook's import cell.
ensemble = VotingClassifier(estimators=[
    ('svc', svc),
    ('qda', qda),  # label corrected from 'gda'
    ('nb', nb)
], voting='soft')

all_results['Voting(SVC, QDA, NB)'] = search(ensemble, [101,102,103,105], X, y)

In [None]:
# One column per model; reorder the columns by descending median score.
df = pd.DataFrame(all_results)
median = df.median().sort_values(ascending=False)
df = df[median.index]

plt.figure(figsize=(12, 4))
scores_ax = sns.boxplot(data=df, width=0.5, color='#4C72B0')
scores_ax.set_xlabel('Models')
scores_ax.set_ylabel('Scores')
scores_ax.set_title('Distribution of custom metric scores across best models for 5 round of 5 fold cross validation')
plt.show()

## Altering dataset
#### Testing the approach to minimise false positives explained in the article: https://www.kaggle.com/discussions/general/376229

In [None]:
# This experiment uses its own split variables so that it does not overwrite the
# notebook-level x_train / y_train (all features, labels as a Series) that the
# feature-selection cells rely on when they are re-run.
alter_features = [101, 102, 103, 105]
x_alter_train, x_alter_valid, y_alter_train, y_alter_valid = train_test_split(
    X[alter_features], y, test_size=0.2, random_state=200
)

# Try several values of the alteration parameter accepted by alter_dataset
# (project helper; see utils for its exact meaning).
for i in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8]:
    new_model = alter_dataset(x_alter_train, y_alter_train, alter_features, i)
    evaluate(x_alter_train, x_alter_valid, y_alter_train, y_alter_valid, new_model)

Results are not better with this method.

## Final model evaluation

In [3]:
# Test features: space-separated values, no header row.
X_test = pd.read_csv('data/x_test.txt', sep=' ', header=None)

In [7]:
# Final model: degree-2 polynomial SVC fitted on all training rows and four features.
# NOTE(review): `vars` shadows the Python builtin; the name is kept because the
# following cells read it.
vars = [101, 102, 103, 105]
svc = SVC(kernel='poly', degree=2, probability=True, random_state=133)
svc.fit(X[vars], y)

In [8]:
# Score the fitted model on the same data it was trained on (not a held-out estimate).
train_probabilities = svc.predict_proba(X[vars])
proba_X = train_probabilities[:, 1]
score_X = custom_score(y, proba_X, len(vars))
print(f"X score: {score_X}")

X score: 7100.0


In [9]:
# Positive-class probability for each test row.
predictions = svc.predict_proba(X_test[vars])[:, 1]
# Rows in ascending order of probability; keep the last 1000 and shift to 1-based numbering.
ascending_order = np.argsort(predictions)
top_indices = ascending_order[-1000:] + 1

In [10]:
# Write the submission files, one integer per line: selected observations and the
# 1-based numbers of the variables used.
submission_files = (
    ('313546_obs.txt', top_indices),
    ('313546_vars.txt', np.array(vars) + 1),
)
for output_name, output_values in submission_files:
    np.savetxt(output_name, output_values, fmt='%d', delimiter='\n')