# Models with ANOVA feature selection

For this part, we applied ANOVA feature selection and kept the 15 features with the highest F-scores. We then trained gradient boosting, random forest and XGBoost models on the reduced dataset.

## Loading the data

In [1]:
import ast
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
random.seed(420)  # seeds Python's built-in `random` module only


# Test features: space-separated, no header row. Columns that are entirely NaN
# are dropped right after parsing (presumably an artefact of a trailing
# separator on each line -- TODO confirm against the raw file).
url = 'https://raw.githubusercontent.com/kozaka93/2023Z-AutoML/main/homeworks/homework2/artificial_test.data'
df_test = pd.read_csv(url, sep=' ', header=None).dropna(axis=1, how='all')

# Training features: parsed and cleaned exactly like the test features.
url = 'https://raw.githubusercontent.com/kozaka93/2023Z-AutoML/main/homeworks/homework2/artificial_train.data'
X_train = pd.read_csv(url, sep=' ', header=None).dropna(axis=1, how='all')

# Training labels: read without a header row, using the default separator.
url = 'https://raw.githubusercontent.com/kozaka93/2023Z-AutoML/main/homeworks/homework2/artificial_train.labels'
y_train = pd.read_csv(url, header=None)

## Loading the models' tuned hyperparameters

In [2]:
# Random-search results for the XGBoost pipeline (one row per sampled configuration).
xgb_results_rs = pd.read_csv('xgb_pipe_results_rs.csv')

# Index of the row with the highest 'a_scores'. idxmax returns the first such
# row (same choice as filtering on == max and taking element 0) and skips NaN.
best_row_xgb = xgb_results_rs['a_scores'].idxmax()

# The 'params' column stores each parameter dict as text. literal_eval parses
# Python literals only, so nothing in the CSV can execute code (eval could).
best_params_xgb = ast.literal_eval(xgb_results_rs.loc[best_row_xgb, 'params'])
best_params_xgb

{'xgb__learning_rate': 0.38721346767512266,
 'xgb__max_depth': 5,
 'xgb__min_child_weight': 0.6010107840065457,
 'xgb__n_estimators': 427}

In [3]:
# Highest value in the 'a_scores' column, i.e. the score of the row whose
# parameters were selected as best_params_xgb.
max(xgb_results_rs['a_scores'])

0.8364999999999998

In [4]:
# Random-search results for the gradient-boosting pipeline (one row per sampled configuration).
gbc_results_rs = pd.read_csv('gb_pipe_results_rs.csv')

# Index of the row with the highest 'a_scores'. idxmax returns the first such
# row (same choice as filtering on == max and taking element 0) and skips NaN.
best_row_gbc = gbc_results_rs['a_scores'].idxmax()

# The 'params' column stores each parameter dict as text. literal_eval parses
# Python literals only, so nothing in the CSV can execute code (eval could).
best_params_gbc = ast.literal_eval(gbc_results_rs.loc[best_row_gbc, 'params'])
best_params_gbc

{'gbc__learning_rate': 0.2891592793734936,
 'gbc__max_depth': 9,
 'gbc__max_leaf_nodes': 42,
 'gbc__min_samples_leaf': 4,
 'gbc__n_estimators': 435}

In [5]:
# Highest value in the 'a_scores' column, i.e. the score of the row whose
# parameters were selected as best_params_gbc.
max(gbc_results_rs['a_scores'])

0.858

In [6]:
# Random-search results for the random-forest pipeline (one row per sampled configuration).
rfc_results_rs = pd.read_csv('rf_pipe_results_rs.csv')

# Index of the row with the highest 'a_scores'. idxmax returns the first such
# row (same choice as filtering on == max and taking element 0) and skips NaN.
best_row_rfc = rfc_results_rs['a_scores'].idxmax()

# The 'params' column stores each parameter dict as text. literal_eval parses
# Python literals only, so nothing in the CSV can execute code (eval could).
best_params_rfc = ast.literal_eval(rfc_results_rs.loc[best_row_rfc, 'params'])
best_params_rfc

{'rf__bootstrap': False,
 'rf__max_features': 0.3732560346542585,
 'rf__min_samples_split': 0.06145870228764971,
 'rf__n_estimators': 627}

In [7]:
# Highest value in the 'a_scores' column, i.e. the score of the row whose
# parameters were selected as best_params_rfc.
max(rfc_results_rs['a_scores'])

0.7940000000000002

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# y_train is a DataFrame (read with pd.read_csv); LabelEncoder expects a 1-D
# array and otherwise emits a DataConversionWarning from column_or_1d.
# Flattening first yields the same encoded labels without the warning.
y_train2 = LabelEncoder().fit_transform(y_train.values.ravel())
y_train2

  y = column_or_1d(y, warn=True)


array([0, 0, 0, ..., 0, 1, 1])

## Model fitting and saving

In [None]:
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Random-forest pipeline: the ANOVA F-test keeps the 15 highest-scoring
# features, then the classifier is fit on that reduced feature set.
rfc = Pipeline([('anova', SelectKBest(score_func=f_classif,k=15)), ('rf', RandomForestClassifier(random_state=21))])
rfc.set_params(**best_params_rfc)  # tuned values loaded from rf_pipe_results_rs.csv
rfc.fit(X_train, y_train2)

# Score the test set once and reuse the result (the original ran predict_proba twice).
# Column i holds the probability of encoded class i.
rfc_test_proba = rfc.predict_proba(df_test)

# NOTE(review): rfc_proba keeps column 0 while the saved file below uses column 1 -- confirm this is intended.
rfc_proba = pd.DataFrame(rfc_test_proba[:,0])
# rfc_proba.to_csv("rfc_pipe_proba.csv",index='index')
np.savetxt('probs_rf_pipe.txt', rfc_test_proba[:,1], fmt = '%10.25f')

In [None]:
# Gradient-boosting pipeline: the ANOVA F-test keeps the 15 highest-scoring
# features, then the classifier is fit on that reduced feature set.
gbc = Pipeline([('anova', SelectKBest(score_func=f_classif,k=15)), ('gbc', GradientBoostingClassifier(random_state=21))])
gbc.set_params(**best_params_gbc)  # tuned values loaded from gb_pipe_results_rs.csv
gbc.fit(X_train, y_train2)

# Score the test set once and reuse the result.
# Column i holds the probability of encoded class i.
gbc_test_proba = gbc.predict_proba(df_test)

# NOTE(review): gbc_proba keeps column 0 while the saved file below uses column 1 -- confirm this is intended.
gbc_proba = pd.DataFrame(gbc_test_proba[:,0])
# gbc_proba.to_csv("gbc_pipe_proba.csv",index='index')

# Bug fix: this line previously saved rfc.predict_proba(...), so the file
# contained the random-forest probabilities instead of the gradient-boosting ones.
np.savetxt('probs_gb_pipe.txt', gbc_test_proba[:,1], fmt = '%10.25f')

In [None]:
# XGBoost pipeline: the ANOVA F-test keeps the 15 highest-scoring
# features, then the classifier is fit on that reduced feature set.
xgb = Pipeline([('anova', SelectKBest(score_func=f_classif,k=15)), ('xgb', XGBClassifier(random_state=21))])
xgb.set_params(**best_params_xgb)  # tuned values loaded from xgb_pipe_results_rs.csv
xgb.fit(X_train, y_train2)

# Score the test set once and reuse the result.
# Column i holds the probability of encoded class i.
xgb_test_proba = xgb.predict_proba(df_test)

# NOTE(review): xgb_proba keeps column 0 while the saved file below uses column 1 -- confirm this is intended.
xgb_proba = pd.DataFrame(xgb_test_proba[:,0])
# xgb_proba.to_csv("xgb_pipe_proba.csv",index='index')

# Bug fix: this line previously saved rfc.predict_proba(...), so the file
# contained the random-forest probabilities instead of the XGBoost ones.
np.savetxt('probs_xgb_pipe.txt', xgb_test_proba[:,1], fmt = '%10.25f')