<div class="alert alert-block alert-info">
    
# TOC<a class="anchor" id="toc"></a><br>
- [<font color='#E8800A'>Naive Bayes</font>](#first-bullet) <br>
- [<font color='#E8800A'>Logistic Regression</font>](#second-bullet)<br>
- [<font color='#E8800A'>KNN</font>](#third-bullet)<br>
- [<font color='#E8800A'>Support Vector Machines</font>](#fourth-bullet)<br>
- [<font color='#E8800A'>Decision Trees</font>](#fifth-bullet)<br>
- [<font color='#E8800A'>Random forest</font>](#sixth-bullet)<br>
- [<font color='#E8800A'>Boosted Trees</font>](#seventh-bullet)<br> 
- [<font color='#E8800A'>Neural Networks</font>](#eighth-bullet)<br> 
- [<font color='#E8800A'>Ensembles</font>](#ninth-bullet)<br>   
    
</div>

In [None]:
import pandas as pd
import numpy as np
import os
import re



import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler


from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,\
RandomizedSearchCV, KFold, StratifiedKFold

from sklearn.neural_network import MLPClassifier

from sklearn import metrics
from sklearn.metrics import classification_report, f1_score, make_scorer

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, \
GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, StackingClassifier

%pip install boruta
from boruta import BorutaPy

%pip install imblearn
from imblearn.over_sampling import SMOTE

%pip install pandas_profiling
from pandas_profiling import ProfileReport

%pip install openpyxl
import openpyxl


In [None]:
# definitions: input locations and output folders
datain_path = 'data/'
src_path = 'src/'


explorations_path = 'explorations/'
submissions_path = 'submissions/'
comparison_path = 'comparison/'

# Create the output folders. exist_ok=True replaces the separate existence
# check, so there is no gap between "check" and "create" and the cell can be
# re-run safely.
paths = [explorations_path, submissions_path, comparison_path]
for path in paths:
    os.makedirs(path, exist_ok=True)

In [None]:
# File name per dataset key. The 'both' entry is itself a dict holding the
# train and the test file, so that the loading cell can read and concatenate
# them into one frame.
datasets = {
    'train': 'Train.xlsx',
    'test':'Test.xlsx', 
    'both': {
        'train': 'Train.xlsx',   
        'test':'Test.xlsx',
    } 
}

#datasets = pd.DataFrame(datasets, columns=['name', 'path']).set_index('name')

# key of `datasets` to load
dataset_name = 'both'


In [None]:
if dataset_name == 'both': 
    # Read every file of the combined set first and concatenate once:
    # growing a frame inside the loop re-copies all rows on each iteration.
    frames = []
    for dataset_path in datasets[dataset_name].values(): 
        frames.append(pd.read_excel(os.path.join(datain_path, dataset_path)))
    # ignore_index gives every row a unique label; without it each file
    # contributes its own 0..n-1 labels and the combined index has duplicates.
    data = pd.concat(frames, ignore_index=True)

else:    
    dataset_path = datasets[dataset_name]
    data = pd.read_excel(os.path.join(datain_path, dataset_path))
data.head()

In [None]:
data.isna().sum()

# Explorations

## profile report

profile = ProfileReport(
    data,
    title='Raw data',
    minimal=False, 
    correlations={
    "pearson": {"calculate": True},
    "spearman": {"calculate": False},
    "kendall": {"calculate": False},
    "phi_k": {"calculate": False},
    "cramers": {"calculate": False},
    }
)
profile.to_file(os.path.join(explorations_path, 'profile_data_raw.html'))


In [None]:
n_feature=10

def calc_elbowdata(data, base_col): 
    """Cumulative share of labelled observations per level of ``base_col``.

    Rows with a missing ``Income`` are ignored. Levels are ordered from most
    to least frequent; the returned one-column frame (``nobs_rel``) holds the
    running share, so its last value is 1.
    """
    labelled = data.loc[data.Income.notna(), :]
    level_counts = labelled.groupby(base_col).size().sort_values(ascending=False)
    level_shares = level_counts / level_counts.sum()
    return level_shares.cumsum().to_frame(name='nobs_rel')

def get_feature_imp_by_expl(data, base_col, n_feature=n_feature): 
    """Count observations per (level of ``base_col``, Income class).

    Parameters
    ----------
    data : DataFrame with the columns ``base_col`` and ``Income``.
    base_col : name of the categorical column to analyse.
    n_feature : number of most frequent levels to keep in the result.

    Returns
    -------
    (a, value_cols)
        ``a`` has one row per kept (level, Income) pair with the columns
        ``base_col``, ``Income``, ``nobs`` (count) and ``nobs_rel`` (share of
        the Income class within its level), sorted by ``nobs`` descending.
        ``value_cols`` is the list ``['nobs', 'nobs_rel']``.
    """
    # make sure every combination of levels exist, fill with 0 if no obs
    base = data[base_col].unique()
    Income = [0, 1]
    idx = pd.MultiIndex.from_product(
        [base, Income],
        names=[base_col, 'Income']
    )

    pd1 = pd.DataFrame(index=idx)

    a = data.groupby([base_col, 'Income']).size().to_frame().rename(columns={0:'nobs'})

    a = pd.concat([pd1, a], axis=1)
    a.loc[a.nobs.isna(), 'nobs'] = 0

    # Share of each Income class within its level. The group total is taken
    # with a sum instead of x[0] + x[1]: integer keys on a non-integer index
    # only work through a deprecated positional fallback, and on an
    # integer-valued level they would be treated as labels.
    a['nobs_rel'] = a['nobs'] / a.groupby(level=base_col)['nobs'].transform('sum')
    value_cols = a.columns.to_list()
    a.reset_index(inplace=True)

    # top Features by nobs: 
    topFeat = data.groupby(base_col).size().to_frame().rename(columns={0:'nobs'})\
        .sort_values('nobs', ascending=False).iloc[0:n_feature,:]\
        .index.to_list()

    a.sort_values(['nobs', base_col], ascending=False, inplace=True)

    # tell the reader when levels are cut off
    if len(topFeat) < a[base_col].nunique(): 
        print(f'***Features Filtered to top_{n_feature} by nobs!***')

    a = a.loc[a[base_col].isin(topFeat),:]
    return a, value_cols

def plot_feature_imp_by_expl(data, base_col, saveplots=False): 
    """Plot the level distribution of ``base_col`` and its relation to Income.

    Draws two figures: a line plot of the cumulative share of observations
    per level, and a row of bar plots (one per relative value column, split
    by Income) followed by a count plot of Income itself.

    Parameters
    ----------
    data : DataFrame with the columns ``base_col`` and ``Income``.
    base_col : name of the categorical column to analyse.
    saveplots : if True, both figures are written as PNG files into
        ``explorations_path``. Defaults to False like the sibling plot helpers.
    """
    a, value_cols = get_feature_imp_by_expl(data, base_col)

    # only the relative counts are plotted as bars
    value_cols.remove('nobs')
    print(value_cols)
    n_plots = len(value_cols) + 1

    # figure 1: cumulative share of observations per level
    fig, ax = plt.subplots()
    plotdata = calc_elbowdata(data, base_col)
    sns.lineplot(data=plotdata, y=plotdata.index, x='nobs_rel', ax=ax)
    plt.tight_layout()

    if saveplots: 
        plt.savefig(os.path.join(explorations_path, 'feature_imp_by_expl_abs.png'), dpi=200)

    plt.show()

    # figure 2: wide panel(s) for the bar plots, one narrow panel for the
    # target distribution; the ratios follow the number of panels
    fig, ax = plt.subplots(
        ncols=n_plots,
        gridspec_kw={'width_ratios': [3] * len(value_cols) + [1]}
    )
    for i, col in enumerate(value_cols): 
        sns.barplot(data=a, x=base_col, y=col, hue='Income', ax=ax[i])
        ax[i].tick_params(labelrotation=45)

    sns.countplot(data=data, x='Income', ax=ax[n_plots-1])
    plt.tight_layout()
    if saveplots: 
        plt.savefig(os.path.join(explorations_path, 'feature_imp_by_expl_rel.png'), dpi=200)
    plt.show()
    
def get_target_ratio(data):  
    """Return the share of rows with ``Income == 1`` among all labelled rows.

    Rows with a missing ``Income`` are ignored. If class 1 does not occur the
    ratio is 0.0 (the lookup-based original raised a KeyError whenever one of
    the two classes was absent).

    Raises
    ------
    ValueError
        If ``data`` contains no row with a non-missing ``Income``.
    """
    class_counts = data.groupby('Income').size()
    n_labelled = class_counts.sum()
    if n_labelled == 0:
        raise ValueError('Cannot compute the target ratio: no rows with a non-missing Income.')
    # boolean selection works for int and float class labels alike
    n_positive = class_counts[class_counts.index == 1].sum()
    return float(n_positive) / float(n_labelled)


def get_feature_imp_by_target_ratio(data, base_col, weighted=False, saveplots=False): 
    """Rank the levels of ``base_col`` by how far their class-1 share is from the overall one.

    For every level the share of ``Income == 1`` is compared with the overall
    target ratio. The absolute difference (optionally multiplied by the cube
    root of the level's share of observations) is sorted descending,
    accumulated and min-max scaled.

    Parameters
    ----------
    data : DataFrame with the columns ``base_col`` and ``Income``.
    base_col : name of the categorical column to analyse.
    weighted : if True, weight the difference by ``nobs_rel ** (1/3)``.
    saveplots : if True, the printed summary table is also written to
        ``diff_to_target_df.xlsx`` in ``explorations_path``.

    Returns
    -------
    DataFrame indexed by level (index named ``base_col``) with the columns
    class1_ratio, nobs, nobs_rel, diff_to_target, diff_to_target_dir,
    diff_to_target_abs, diff_to_target_abs_cumsum and
    diff_to_target_abs_cumsum_scaled.
    """
    overall_ratio = get_target_ratio(data)

    # per (level, Income) counts; up to 100 levels are considered
    counts_by_class, _ = get_feature_imp_by_expl(data, base_col, n_feature=100)

    level_sizes = data.groupby([base_col]).size().to_frame().rename(columns={0: 'nobs'})

    class1_share = counts_by_class.loc[counts_by_class.Income == 1, [base_col, 'nobs_rel']]
    class1_share = class1_share.set_index(base_col)
    class1_share = class1_share.rename(columns={'nobs_rel': 'class1_ratio'})

    level_stats = pd.concat([class1_share, level_sizes], axis=1)
    level_stats['nobs_rel'] = level_stats.nobs / sum(level_stats.nobs)

    level_stats['diff_to_target'] = level_stats['class1_ratio'] - overall_ratio
    level_stats['diff_to_target_dir'] = [
        'neg' if delta < 0 else 'pos' for delta in level_stats['diff_to_target']
    ]

    weights = np.power(level_stats['nobs_rel'], 1./3) if weighted else 1
    level_stats['diff_to_target_abs'] = level_stats['diff_to_target'].abs() * weights

    # largest deviation first, then accumulate and scale
    level_stats = level_stats.sort_values('diff_to_target_abs', ascending=False)
    level_stats['diff_to_target_abs_cumsum'] = level_stats.diff_to_target_abs.cumsum()

    cumsum_column = level_stats['diff_to_target_abs_cumsum'].values.reshape(-1, 1)
    level_stats['diff_to_target_abs_cumsum_scaled'] = MinMaxScaler().fit_transform(cumsum_column)

    print('TargetClass1_ratio', overall_ratio)
    diff_to_target_df = level_stats[['diff_to_target_dir', 'diff_to_target_abs']].copy()
    diff_to_target_df['diff_to_target_abs'] = diff_to_target_df['diff_to_target_abs'].round(4)
    print(diff_to_target_df)
    if saveplots:
        diff_to_target_df.to_excel(os.path.join(explorations_path, 'diff_to_target_df.xlsx'))

    return level_stats.rename_axis(index=base_col)


def plot_feature_imp_by_target_ratio(data, base_col, weighted=False, saveplots=False): 
    """Line plot of the scaled cumulative deviation from the target ratio per level.

    The values come from ``get_feature_imp_by_target_ratio``. With
    ``saveplots=True`` the figure is saved as
    ``feature_imp_by_target_ratio.png`` in ``explorations_path``.
    """
    level_stats = get_feature_imp_by_target_ratio(data, base_col, weighted, saveplots=saveplots)

    sns.lineplot(data=level_stats, x='diff_to_target_abs_cumsum_scaled', y=level_stats.index)
    plt.tight_layout()

    if saveplots: 
        target_file = os.path.join(explorations_path, 'feature_imp_by_target_ratio.png')
        plt.savefig(target_file, dpi=200)
    plt.show()
    
    
def plot_feature_imp_by_tree(data, base_col, n_feature=n_feature, saveplots=False): 
    """Plot decision-tree feature importances of the dummy-encoded ``base_col``.

    A ``DecisionTreeClassifier`` is fitted on the dummy columns of
    ``base_col`` with ``Income`` as target; its training score is printed and
    the ``n_feature`` most important dummy columns are shown as a bar plot.

    Parameters
    ----------
    data : DataFrame with the columns ``base_col`` and ``Income``.
    base_col : name of the categorical column to analyse.
    n_feature : number of dummy columns shown in the plot.
    saveplots : if True, save the figure as ``feature_imp_by_tree.png`` in
        ``explorations_path``.
    """
    # prepare: one dummy column per level. (A OneHotEncoder result that was
    # built here before was overwritten by this frame without being used, and
    # it relied on get_feature_names(), which newer scikit-learn removed.)
    X_train = pd.get_dummies(data[base_col], prefix=base_col)
    y_train = data.Income

    # train
    dt_gini = DecisionTreeClassifier(random_state = 1)
    dt_gini.fit(X_train, y_train)
    print('Score:', dt_gini.score(X_train, y_train))

    # most important dummy columns first
    sorted_idx = dt_gini.feature_importances_.argsort()
    plotdata = pd.DataFrame({
        'Feature': X_train.columns[sorted_idx], 
        'Importance': dt_gini.feature_importances_[sorted_idx]}).sort_values('Importance', ascending=False)

    sns.barplot(data=plotdata.iloc[0:n_feature,:], x='Importance', y='Feature')
    plt.xlabel("Feature Importance")
    plt.tight_layout()

    if saveplots: 
        plt.savefig(os.path.join(explorations_path, 'feature_imp_by_tree.png'), dpi=200)
    plt.show()

    
def plot_feature_imp(data, base_col, force_barplot=True, weighted=False, saveplots=False): 
    """Run all feature-importance explorations for one categorical column.

    Prints a heading before each step. The class-distribution bar plots are
    drawn when ``base_col`` has fewer than 6 distinct values or when
    ``force_barplot`` is set; the target-ratio plot and the decision-tree
    plot are always drawn.
    """
    print('Class distributions')
    show_barplot = (data[base_col].nunique() < 6) | force_barplot
    if show_barplot:
        plot_feature_imp_by_expl(data, base_col, saveplots=saveplots)

    print('\nElbow')        
    plot_feature_imp_by_target_ratio(data, base_col, weighted, saveplots=saveplots)

    print('\nDecision Tree')
    plot_feature_imp_by_tree(data, base_col, saveplots=saveplots)
    
def levels_equal(data, base_col):
    """Report whether ``base_col`` has the same levels in train and test rows.

    Rows with a non-missing ``Income`` count as train, rows with a missing
    ``Income`` as test. Prints 'Levels ok' when the sorted level lists are
    equal; otherwise prints the levels found on one side only, followed by a
    message. Always returns None.

    NOTE(review): np.sort raises a TypeError when a column mixes strings and
    NaN, so columns with missing values cannot be checked as-is.
    """
    has_label = data.Income.notna()
    train_levels = np.sort(data.loc[has_label, base_col].unique()).tolist()
    test_levels = np.sort(data.loc[~has_label, base_col].unique()).tolist()

    if train_levels == test_levels:
        print('Levels ok')
        return None

    # outer merge: a level missing on one side shows up as NaN in that column
    compare = pd.merge(
        left=pd.DataFrame({'train_levels': train_levels}),
        right=pd.DataFrame({'test_levels': test_levels}),
        how='outer', left_on='train_levels', right_on='test_levels')
    one_sided = (compare.train_levels.isna()) | (compare.test_levels.isna())
    print(compare.loc[one_sided])

    print(f'Levels of "{base_col}" differ between test and train set')
    return None


In [None]:
# 
data.info()

In [None]:
# init: bookkeeping lists that the cleaning cells append to
cols_to_drop = []      # raw columns to drop once cleaning is done
cols_to_onehot = []    # cleaned categorical columns
cols_numeric = []      # cleaned numeric columns
cols_bool = []         # cleaned 0/1 columns



# prep
pred_config = {
    'cardinality': 'original', # low, medium, high, original
    'rmoutlier': True
} 

# shorthand read by the per-feature cleaning cells
cardinality = pred_config['cardinality']
print('cardinality:', cardinality)

# exceptions caught during cleaning are collected here, keyed by stage
error_log = {'cleaning': []}


In [None]:
# extract gender from name?!
base_col = 'Name'
target_col = 'male'

# the first whitespace-separated token of the name is the salutation
salutation = data[base_col].str.split(' ', n=1, expand=True)[0]
if salutation.nunique() != 3: 
    raise ValueError('Unexpected levels of salutation')

print(salutation.value_counts())

# 1 = 'Mr.', 0 = 'Mrs.' / 'Miss', NaN for any other salutation
male = [1 if s == 'Mr.' else 0 if s in ['Mrs.', 'Miss'] else np.nan for s in salutation]
data[target_col] = male

# every salutation has to be covered by the mapping above
if data[target_col].isna().sum() > 0: 
    raise Warning('NAs introduced')


# refer to the hue column by name, like the other cleaning cells do
sns.countplot(data=data, hue='Income', x=target_col)
plt.show()

cols_to_drop.append(base_col)
cols_bool.append(target_col)


In [None]:
# Compute age from Birthday

# clean whitespaces
data.Birthday = data.Birthday.str.replace(' ', '')
# define date format
dob_format = '%B%d,%Y'

# transform Birthday to datetime, catching the leap year error 

## helper fct to subtract one day from datetime if error occurs
def subone(obj):
    """Replacement callback for ``re.sub``: return the matched number minus one as a string."""
    return str(int(obj.group(0)) - 1)

## init and loop over dates
# NOTE(review): the fallback below is keyed on the exact text of the
# ValueError raised by strptime; confirm the wording on the Python version
# in use.
reference_date = datetime(2048, 12, 31).date()  # ages are computed as of this date

dob = []
warn_log = []  # (raw string, corrected date) for every birthday that was adjusted
for d in data.Birthday: 
    try: 
        dob.append(datetime.strptime(d, dob_format).date())

    except ValueError as e: 
        if str(e) == 'day is out of range for month': 
            # retry with the first number (the day) reduced by one;
            # raw string so that \d is a regex token, not a string escape
            dt = datetime.strptime(re.sub(r'\d{1,2}', subone, d, count=1), dob_format).date()
            warn_log.append((d, dt))
            dob.append(dt)
        else: 
            # keep the original parsing error attached for debugging
            raise NotImplementedError('Do not know how to deal with that error!') from e

# add age column: completed years between birthday and the reference date
data['age'] = [np.floor((reference_date - d).days / 365.2425) for d in dob]

sns.histplot(data, x='age')
plt.show()
print('Min age:' , min(data.age))

# drop date col 
cols_to_drop.append('Birthday')
cols_numeric.append('age')


In [None]:
data.info()

In [None]:
data.loc[~data.Income.isna()]

In [None]:

# 'Native Continent' to bin 
base_col = 'Native Continent'
#sns.countplot(data=data, hue=data.Income, x=base_col)#.set_title(col)
#plt.show()

# explore on the labelled rows only, then compare train/test levels
plot_feature_imp(data.loc[~data.Income.isna()], base_col, weighted=False, saveplots=False)
levels_equal(data, base_col)

#low, medium, high, original
# ('high' is not handled here and ends in the NotImplementedError branch)
try: 
    if cardinality in ['low', 'medium']:
        # binary flag: 1 for Europe or Asia, 0 for every other continent
        target_col = 'from_europe_or_asia'
        #data['from_europe'] = [1 if a == 'Europe' else 0 for a in data[base_col]]
        data[target_col] = [1 if a in ['Europe', 'Asia'] else 0 for a in data[base_col]]
        cols_bool.append(target_col)
    elif cardinality == 'original':
        # keep all levels and one-hot encode them later
        target_col = 'native_continent'
        data[target_col] = data[base_col]
        cols_to_onehot.append(target_col)
    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)
    

sns.countplot(data=data, x=target_col, hue='Income')
plt.show()

cols_to_drop.append(base_col)



In [None]:
# Marital Status
base_col = 'Marital Status'
#target_col = 'marital_status'

# A bare expression in the middle of a cell is not displayed, so print the
# raw level counts explicitly.
print(data[base_col].value_counts())

plot_feature_imp(data.loc[~data.Income.isna()], base_col, weighted=False)
levels_equal(data, base_col)

try: 
    if cardinality == 'low': 
        # binary flag: married (including spouse in the army) vs. everything else
        target_col = 'maritalStatus_married'
        data[target_col] = [1 if a in ['Married', 'Married - Spouse in the Army'] else 0 for a in data[base_col]]
        cols_bool.append(target_col)

    elif cardinality in ['medium', 'original']: 
        target_col = 'maritalStatus'
        # 'medium' folds "spouse in the army" into 'Married'; 'original'
        # keeps it as its own level. All other levels are shared.
        mapping = {
            'Married':'Married',
            'Single':'Single',
            'Divorced':'Divorced',
            'Separated':'Separated',
            'Widow':'Widow',
            'Married - Spouse Missing':'SpouseMissing',
            'Married - Spouse in the Army':'Married' if cardinality == 'medium' else 'MarriedArmy'
        }

        data[target_col] = data[base_col].map(mapping)
        cols_to_onehot.append(target_col)

    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)
    
    
#sns.countplot(data=data, x=target_col)
sns.countplot(data=data, x=target_col, hue='Income')
plt.show()

cols_to_drop.append(base_col)


In [None]:
# Lives with
base_col = 'Lives with'
print(data[base_col].value_counts())

# explore on the labelled rows only; saveplots=True writes the exploration
# figures to explorations_path
plot_feature_imp(data.loc[~data.Income.isna()], base_col, weighted=False, saveplots=True)
levels_equal(data, base_col)

# ('high' is not handled here and ends in the NotImplementedError branch)
try: 
    if(cardinality == 'low'): 
        # binary flag: lives with wife/husband vs. everything else
        target_col = 'household_livesWithPartner'
        data[target_col] = [1 if a in ['Wife', 'Husband'] else 0 for a in data[base_col]]
        cols_bool.append(target_col)
    elif(cardinality == 'medium'): 
        # merge Wife/Husband into 'Partner' and both family levels into 'Family'
        target_col = 'household'
        mapping = {
            'Wife': 'Partner',
            'Other Family': 'Family',
            'Children': 'Children',
            'Alone': 'Alone',
            'Husband': 'Partner',
            'Other relatives': 'Family'
        }

        print(mapping)

        # levels missing from the mapping become NaN
        data[target_col] = data[base_col].map(mapping)
        cols_to_onehot.append(target_col)
    elif(cardinality == 'original'): 
        # keep every level, only shorten the labels
        target_col = 'household'
        mapping = {
            'Wife': 'Wife',
            'Other Family': 'Family',
            'Children': 'Children',
            'Alone': 'Alone',
            'Husband': 'Husband',
            'Other relatives': 'Other'
        }

        print(mapping)

        # levels missing from the mapping become NaN
        data[target_col] = data[base_col].map(mapping)
        cols_to_onehot.append(target_col)
    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)

sns.countplot(data=data, x=target_col, hue='Income')
plt.show()

cols_to_drop.append(base_col)


In [None]:

# 'Base Area' to bin 
base_col = 'Base Area'

# Areas that keep their own level in the reduced encodings. Defined before
# the branching because both the 'low' and the 'medium' branch use them
# (they used to exist only inside the 'low' branch, so 'medium' failed with
# a NameError).
target_val = 'Fanfoss'
target_val_alt = 'Northbury'

plot_feature_imp(data.loc[~data.Income.isna()], base_col, weighted=False)
levels_equal(data, base_col)

try: 
    if cardinality == 'low': 
        target_col = 'basearea_fanfoss' # basearea_northbury
        target_col_alt = 'basearea_northbury'

        # binary flag for the primary area; it is plotted after the try block
        print('\nResult:')
        data[target_col] = [1 if a == target_val else 0 for a in data[base_col]]

        # the alternative flag is only plotted, not stored in `data`
        print('\nAlternative result:')
        test = data[['Income', base_col]].copy()
        test[target_col_alt] = [1 if a == target_val_alt else 0 for a in test[base_col]]
        sns.countplot(data=test, x=target_col_alt, hue='Income')
        plt.show()
        cols_bool.append(target_col)
    elif cardinality == 'medium':
        # three levels: the two named areas and 'Rest'
        target_col = 'basearea'
        data[target_col] = [
            target_val if a == target_val 
            else target_val_alt if a == target_val_alt 
            else 'Rest' for a in data[base_col]]
        cols_to_onehot.append(target_col)

    elif cardinality == 'original':
        target_col = 'basearea'
        data[target_col] = data[base_col]
        cols_to_onehot.append(target_col)
    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)

    
sns.countplot(data=data, x=target_col, hue='Income')
plt.show()

cols_to_drop.append(base_col)

In [None]:
# Education Level 
base_col = 'Education Level'
target_col = 'education'
print(data.columns)

plot_feature_imp(data.loc[~data.Income.isna()], base_col, weighted=False)
levels_equal(data, base_col)

# lookup table: one row per raw education name, one column per encoding
edu_mapping = pd.read_csv(os.path.join(src_path, 'edu_mapping.csv'), sep=';')
mapping_options = ['level_0', 'level_1', 'numeric', 'original', 'low']


#low, medium, high, original
# Per cardinality: which mapping column to use, how to plot the result and
# which bookkeeping list the new column belongs to.
try: 
    if cardinality == 'low':
        m_option = mapping_options[4]
        plot_fct = sns.countplot
        target_list = cols_to_onehot
    elif cardinality == 'medium': 
        # NOTE(review): uses the same 'low' mapping as the branch above - confirm this is intended
        m_option = mapping_options[4]
        plot_fct = sns.countplot
        target_list = cols_to_onehot
    elif cardinality == 'high': 
        # numeric encoding: register as a numeric column instead of one-hot encoding it
        m_option = mapping_options[2]
        plot_fct = sns.histplot
        target_list = cols_numeric
    elif cardinality == 'original':
        m_option = mapping_options[3]
        plot_fct = sns.countplot       
        target_list = cols_to_onehot
    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)
    


mapping = edu_mapping[['name', m_option]].rename(columns={m_option: target_col})
print(mapping)

# drop if reruning the cell 
if target_col in data.columns: 
    data.drop(columns=[target_col], inplace=True)

# NOTE(review): a left merge adds rows if 'name' is not unique in the mapping
# file - verify against edu_mapping.csv
data = data.merge(mapping, left_on=base_col, right_on='name', how='left')
data.drop(columns=['name'], inplace=True)  

# plot target col against prediction classes
fig, ax = plt.subplots(figsize=(15,7))
plot_fct(data=data, x=target_col, hue='Income', ax=ax)
#plt.xticks(rotation=45)
plt.show()

cols_to_drop.append(base_col)
target_list.append(target_col)


data[[base_col, target_col]]

In [None]:
# years of education: copied unchanged under a new column name
base_col = 'Years of Education'
target_col = 'education_years'
#data.rename(columns={base_col: target_col}, inplace=True)
data[target_col] = data[base_col]

cols_to_drop.append(base_col)
cols_numeric.append(target_col)
# NOTE(review): not the last expression of the cell, so this head() is not displayed
data.head()
sns.histplot(data=data, x=target_col, hue='Income')
plt.show()

In [None]:
# Employment Sector
base_col = 'Employment Sector'
target_col = 'empl_sector'

plot_feature_imp(data.loc[~data.Income.isna()], base_col, weighted=False)
#levels_equal(data, base_col)

print(data[base_col].value_counts())

#low, medium, high, original
# NOTE(review): the key 'Private Sector - Services ' ends with a space in both
# mappings - presumably it matches the raw data; verify.
try: 
    if cardinality == 'deprecated':
        # coarse mapping (private / self / public), only used when
        # cardinality is set to 'deprecated'
        mapping = {
            'Private Sector - Services ': 'private',
            'Self-Employed (Individual)': 'self',
            'Public Sector - Others': 'public',
            '?': 'unknown',
            'Private Sector - Others': 'private',
            'Self-Employed (Company)': 'self',
            'Public Sector - Government': 'public',
            'Unemployed': 'delete',
            'Never Worked': 'delete'
            }

    elif cardinality in ['low', 'medium', 'original']: 
        # keeps every sector; 'Unemployed' and 'Never Worked' share one level
        mapping = {
            'Private Sector - Services ': 'private_services',
            'Self-Employed (Individual)': 'self_individual',
            'Public Sector - Others': 'public_others',
            '?': 'unknown',
            'Private Sector - Others': 'private_others',
            'Self-Employed (Company)': 'self_company',
            'Public Sector - Government': 'public_gov',
            'Unemployed': 'unemployed',
            'Never Worked': 'unemployed'
            }
    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)


print(mapping)
    
# levels missing from the mapping become NaN
data[target_col] = data[base_col].map(mapping)
# compare train/test levels on the mapped column
levels_equal(data, target_col)

fig, ax = plt.subplots(figsize=(15,7))
sns.countplot(data=data, x=target_col, hue='Income', ax=ax)
plt.show()

cols_to_drop.append(base_col)
cols_to_onehot.append(target_col)

In [None]:
# role
base_col = 'Role'
target_col = 'empl_role'

plot_feature_imp(data.loc[~data.Income.isna()], base_col, weighted=False)
levels_equal(data, base_col)

#low, medium, high, original
# ('high' is not handled here and ends in the NotImplementedError branch)
try: 
    if cardinality in ['low', 'medium']:
        # reduced mapping: several roles share a level
        # (Operational, Operational_low, IT_Security)
        mapping = {
            'Professor': 'Professor',
            'Management': 'Management',
            'Repair & constructions': 'Operational_low',
            'Administratives': 'Operational',
            'Sales': 'Sales',
            'Other services': 'Services',
            'Machine Operators & Inspectors': 'Operational',
            '?': 'unknown',
            'Transports': 'Operational_low',
            'Cleaners & Handlers': 'Cleaners',
            'Agriculture and Fishing': 'Operational',
            'IT': 'IT_Security',
            'Security': 'IT_Security',
            'Household Services': 'Household',
            'Army': 'Operational_low'
        }
    elif cardinality == 'original':       
        # one level per role, only the labels are shortened
        mapping = {
            'Professor': 'Professor',
            'Management': 'Management',
            'Repair & constructions': 'Constructions',
            'Administratives': 'Administratives',
            'Sales': 'Sales',
            'Other services': 'Services',
            'Machine Operators & Inspectors': 'Operator',
            '?': 'unknown',
            'Transports': 'Transports',
            'Cleaners & Handlers': 'Cleaners',
            'Agriculture and Fishing': 'Agriculture',
            'IT': 'IT', 
            'Security': 'Security',
            'Household Services': 'Household',
            'Army': 'Army'
        }
    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)

print(data[base_col].value_counts())

print(mapping)
    
# levels missing from the mapping become NaN
data[target_col] = data[base_col].map(mapping)


fig, ax = plt.subplots(figsize=(15,7))
sns.countplot(data=data, x=target_col, hue='Income', ax=ax)
plt.show()

cols_to_drop.append(base_col)
cols_to_onehot.append(target_col)

In [None]:
 

# Working Hours per week: copied under a new name, then zeroed for the unemployed
base_col = 'Working Hours per week'
target_col = 'working_hrs_week'

data[target_col] = data[base_col]
#data.rename(columns={base_col: target_col}, inplace=True)

#setting working hours per week of those that have never worked or are unemployed to 0
# (requires the 'empl_sector' column created by the Employment Sector cell)
data.loc[data['empl_sector'] == 'unemployed', 'working_hrs_week'] = 0

sns.histplot(data=data, x=target_col, hue='Income', bins=30)
plt.show()

cols_to_drop.append(base_col)
cols_numeric.append(target_col)
data.head()



In [None]:
# Money Received
base_col = 'Money Received'
target_col = 'group_b_received_money'


#low, medium, high, original
# NOTE(review): 'medium' and 'high' are not handled here and end in the
# NotImplementedError branch - confirm that this is intended.
try: 
    if cardinality == 'deprecated':
        # binary flag: 1 if any money was received
        data[target_col] = [1 if v != 0 else 0 for v in data[base_col]]
        plot_fct = sns.countplot
        cols_bool.append(target_col)
    elif cardinality in ['original', 'low']:
        # keep the amount as a numeric feature
        data[target_col] = data[base_col]
        plot_fct = sns.histplot
        cols_numeric.append(target_col)
    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)

cols_to_drop.append(base_col)


# count plot for the flag, histogram for the amount (chosen above)
plot_fct(data=data, x=target_col, hue='Income')
plt.show()

#data[[base_col, target_col]]



In [None]:
   

# Ticket Price
base_col = 'Ticket Price'
target_col = 'group_c_payed'

#low, medium, high, original
# NOTE(review): 'medium' and 'high' are not handled here and end in the
# NotImplementedError branch - confirm that this is intended.
try: 
    if cardinality == 'deprecated':
        # binary flag: 1 if any price was paid
        data[target_col] = [1 if v != 0 else 0 for v in data[base_col]]
        sns.countplot(data=data, x=target_col, hue='Income')
        cols_bool.append(target_col)
    elif cardinality in ['original', 'low']:
        # keep the price as a numeric feature
        data[target_col] = data[base_col]
        sns.histplot(data=data, x=target_col, hue='Income', bins = 30)
        cols_numeric.append(target_col)
    else: 
        raise NotImplementedError(f'Can not interpret cardinality "{cardinality}" for base feature "{base_col}"!')
except Exception as e:
    # record the problem, then stop the cell
    error_log['cleaning'].append(e)
    raise Warning(e)
    

cols_to_drop.append(base_col)


# shows the plot drawn inside the try block
plt.show()

#data[[base_col, target_col]]

# Outliers

In [None]:
if pred_config['rmoutlier'] :

    columns_to_treat = ['education_years' , 'age' , 'working_hrs_week']

    # limits and replacement values are derived from the labelled (train) rows only
    train_rows = data.Income.notnull()

    for base_col in columns_to_treat:    
        q25 = data.loc[train_rows, base_col].quantile(.25)
        q75 = data.loc[train_rows, base_col].quantile(.75)
        iqr = (q75 - q25)

        # 1.5 * IQR fence for education and age, a wider 5.5 * IQR fence for
        # working hours
        iqr_factor = 1.5 if base_col in ['education_years' , 'age'] else 5.5
        upper_lim = q75 + iqr_factor * iqr
        lower_lim = q25 - iqr_factor * iqr

        if base_col == 'working_hrs_week':
            # blank out values outside the fence ...
            is_outlier = (data[base_col] < lower_lim) | (data[base_col] > upper_lim)
            data[base_col] = data[base_col].mask(is_outlier)
            # ... and refill them with the train mean of the row's employment
            # sector. Selecting the column before .mean() keeps string columns
            # out of the aggregation; .map() yields NaN (instead of a KeyError)
            # for sectors without a train mean.
            m = data.loc[train_rows].groupby('empl_sector')[base_col].mean()
            data[base_col] = data[base_col].fillna(data['empl_sector'].map(m))
            # sectors whose mean is NaN (e.g. the unemployed, whose zero hours
            # can fall below the lower limit) are set back to 0
            data[base_col] = data[base_col].fillna(0)
        else:
            # replace with lower or upper limit accordingly
            data[base_col] = data[base_col].apply(lambda x: int(lower_lim) if x < lower_lim else int(upper_lim) if x > upper_lim else x )     
              
            
    

In [None]:
# Check for errors: stop the notebook if any cleaning step logged an exception
for name, log in error_log.items():
    if len(log) > 0: 
        print(f'{name}:\n {log}')
        raise Warning('Errors occurred! See above.')

In [None]:
error_log

In [None]:
# drop cols
#cols_to_drop.append('CITIZEN_ID')
col_with_index = ['CITIZEN_ID']
data.drop(columns=cols_to_drop, inplace=True)


In [None]:
combined_data = data.copy().set_index(col_with_index)


In [None]:
## profile report

create_cleaning_report = False
if create_cleaning_report: 
    profile = ProfileReport(
        data,
        title=f'Cleaned data {dataset_name}' ,
        minimal=False, 
        correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": False},
        "kendall": {"calculate": False},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
        }
    )
    profile.to_file(os.path.join(explorations_path, f'profile_data_cleaned_{dataset_name}.html'))


In [None]:
combined_data.shape

In [None]:
# rows without a label form the test set, labelled rows the training set
is_test = combined_data.Income.isna()

# drop() returns a new frame, so test_data is independent of combined_data
test_data = combined_data.loc[is_test].drop(columns='Income')
test_data.info()

# copy the selection (not the full frame) so that the assignment below works
# on an independent frame
data = combined_data.loc[~is_test].copy()
data['Income'] = data['Income'].astype(int)
data.info()




# Explorations

In [None]:
plot_explorations = True

In [None]:
data.isna().sum()

In [None]:
data.info()

In [None]:
print(data.shape[1])
data.shape[1] - (len(cols_numeric) + len(cols_bool) + len(cols_to_onehot))

In [None]:
# check colmuns
expected_ncols = len(cols_numeric) + len(cols_bool)
for cat_col in cols_to_onehot: 
    expected_ncols += data[cat_col].nunique()
    
expected_ncols

In [None]:
print(len(cols_to_drop))
cols_to_drop

In [None]:
# target distribution

sns.countplot(data=data, x='Income')
plt.show()

In [None]:
if plot_explorations: 
    # Prepare figure
    my_dpi = 200
    fig = plt.figure(
        figsize=(
            #10, 8
            1000/my_dpi, 1000/my_dpi
        )
    ) 

    # Obtain correlation matrix, rounded to 2 decimals. Only numeric/boolean
    # columns are correlated: the frame also holds string-valued categorical
    # columns, which newer pandas no longer drops silently in corr().
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    corr = np.round(numeric_data.corr(method="pearson"), decimals=2)

    # Build annotation matrix (values above |0.5| will appear annotated in the plot)
    mask_annot = np.absolute(corr.values) >= 0.5
    # annotation text: the value where the mask is True, an empty string elsewhere
    annot = np.where(mask_annot, corr.values, np.full(corr.shape,""))

    # Plot heatmap of the correlation matrix
    sns.heatmap(data=corr, annot=annot, cmap=sns.diverging_palette(220, 10, as_cmap=True), 
                fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5)

    # Layout
    fig.subplots_adjust(top=0.95)
    # fig.suptitle("Correlation Matrix", fontsize=20)

    plt.tight_layout()

    plt.savefig(
        os.path.join(explorations_path, 'correlation_matrix.png')
        ,dpi=my_dpi
    )
    plt.show()

In [None]:
# distributions 
if plot_explorations: 
    plotdata = data.loc[:,cols_numeric + cols_bool]
    ncols = 3
    n_plots = plotdata.shape[1]
    nrows = int(np.ceil(n_plots/ncols))


    my_dpi = 200

    # squeeze=False keeps `ax` two-dimensional even when there is one row
    fig, ax = plt.subplots(
        ncols=ncols, nrows=nrows, 
        figsize=(
            # 15,13
            1200/my_dpi, 1000/my_dpi
        ),
        squeeze=False
    )
    # ax.flat walks the grid row by row; zip stops after the last column, so
    # surplus axes stay empty
    for col, cur_ax in zip(plotdata.columns, ax.flat):
        print(col)
        # histogram for numeric columns, count plot for everything else
        # (np.float / np.int no longer exist in current NumPy releases)
        is_numeric = (pd.api.types.is_numeric_dtype(data[col])
                      and not pd.api.types.is_bool_dtype(data[col]))
        if is_numeric: 
            sns.histplot(data=plotdata, hue=data.Income, x=col, ax=cur_ax, bins=30).set_title(col)
        else : 
            sns.countplot(data=plotdata, hue=data.Income, x=col, ax=cur_ax, dodge=True).set_title(col)
        cur_ax.tick_params(labelrotation=45)

    fig.tight_layout()

    plt.savefig(os.path.join(explorations_path, 'distributions.png'), dpi=my_dpi)
    plt.show()


# Data Preparation

In [None]:
# prep config

prep_config = {
    'overSampling': False, 
    'scale': 'standard', # standard, minmax,
    'featureselection': False #['boruta', False]
   # ,'stratify': 'y' # ['y', 'None']
}

# validate config: fail early, naming the offending option and its value

if type(prep_config['overSampling']) is not bool: 
    raise ValueError(
        f"prep_config['overSampling'] must be a bool, got {prep_config['overSampling']!r}"
    )

if prep_config['scale'] not in ['standard', 'minmax']: 
    raise ValueError(
        f"prep_config['scale'] must be 'standard' or 'minmax', got {prep_config['scale']!r}"
    )

if prep_config['featureselection'] not in ['boruta', False]: 
    raise ValueError(
        f"prep_config['featureselection'] must be 'boruta' or False, got {prep_config['featureselection']!r}"
    )
    
    
def merge_two_dicts(x, y):
    """Return a shallow copy of ``x`` updated with the entries of ``y``.

    Neither argument is modified. Where both contain the same key,
    the value from ``y`` ends up in the result.
    """
    merged = x.copy()
    merged.update(y)  # works in place on the copy; update() itself returns None
    return merged


# One dict with every setting; entries of prep_config win over pred_config on equal keys
config_all = merge_two_dicts(pred_config, prep_config)

# Compact tag "<key><Value>_<key><Value>_..." used as a prefix for exported file names
config_str = '_'.join(
    f'{key}{str(value).capitalize()}' for key, value in config_all.items()
)
print('config_str: ', config_str)

config_all

In [None]:
# Separate the cleaned training data into features and target.
# test_data only provides features, so it is copied as-is.
X_train = data.drop(columns=['Income'])      # drop() already returns a new frame
y = data['Income'].copy().values
X_test = test_data.copy()

print('X_train.shape', X_train.shape)
print('y.shape', y.shape)
print('X_test.shape', X_test.shape)

In [None]:
# Column overview (dtypes, non-null counts) of both feature frames before encoding
X_train.info()
X_test.info()

## One-hot encode and scale

In [None]:
# https://towardsdatascience.com/feature-selection-with-borutapy-f0ea84c9366

def onehot_scale(X_train, X_test, scaler, verbose=False): 
    """One-hot encode the categorical columns and scale the numeric columns.

    The encoder and the scaler are fitted on ``X_train`` only and then applied
    to ``X_test``. Which columns are treated as numeric, boolean or categorical
    is taken from the notebook-level lists ``cols_numeric``, ``cols_bool`` and
    ``cols_to_onehot``.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Frames that contain all columns named in the three column lists.
    scaler : {'standard', 'minmax'}
        Selects ``StandardScaler`` or ``MinMaxScaler`` for the numeric columns.
    verbose : bool, default False
        Print the shapes of the intermediate and final matrices.

    Returns
    -------
    X : np.ndarray
        Training matrix, columns ordered as [scaled numeric | boolean | one-hot].
    X_test_ : np.ndarray
        Test matrix with the same column order.
    X_features : list
        Column names of ``X``.

    Raises
    ------
    ValueError
        If ``scaler`` is neither 'standard' nor 'minmax'.
    """
    # Resolve the scaler first so that a bad option fails before any work is done
    if scaler == 'standard':
        ss = StandardScaler()
    elif scaler == 'minmax':
        ss = MinMaxScaler()
    else:
        raise ValueError(f'Can not interpret {scaler} as scaler!')

    # Boolean columns are passed through unchanged
    X_train_bool = X_train[cols_bool]
    X_test_bool = X_test[cols_bool]

    # One-hot encoding: fit on train, reuse the learned categories for test
    ohe = OneHotEncoder()
    X_train_cat = X_train[cols_to_onehot]
    X_test_cat = X_test[cols_to_onehot]
    X_train_ohe = ohe.fit_transform(X_train_cat)
    X_test_ohe = ohe.transform(X_test_cat)

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; use whichever the installed version offers
    if hasattr(ohe, 'get_feature_names_out'):
        columns = ohe.get_feature_names_out(list(X_train_cat.columns))
    else:
        columns = ohe.get_feature_names(input_features=X_train_cat.columns)

    # toarray() yields a plain ndarray (todense() would give the legacy np.matrix type)
    X_train_processed = pd.DataFrame(X_train_ohe.toarray(), columns=columns, index=X_train_bool.index)
    X_test_processed = pd.DataFrame(X_test_ohe.toarray(), columns=columns, index=X_test_bool.index)

    # Scaling: numeric columns as float, fit on train, apply to both
    X_train_cont = X_train[cols_numeric].astype(float)
    X_test_cont = X_test[cols_numeric].astype(float)
    X_train_scaled = ss.fit_transform(X_train_cont)
    X_test_scaled = ss.transform(X_test_cont)

    # Put the three parts side by side, identically ordered for train and test
    X = pd.concat(
        [
            pd.DataFrame(X_train_scaled, index=X_train_bool.index, columns=X_train_cont.columns), 
            X_train_bool, 
            X_train_processed
        ], axis=1
    )
    X_features = X.columns.to_list()
    X = X.values
    X_test_ = pd.concat(
        [
            pd.DataFrame(X_test_scaled, index=X_test_bool.index, columns=X_test_cont.columns),
            X_test_bool,
            X_test_processed
        ], axis=1
    ).values

    if verbose: 
        # only objects owned by this function are reported (no reliance on a global `y`)
        print('X_train_scaled.shape', X_train_scaled.shape)
        print('X_train_bool.shape', X_train_bool.shape)
        print('X_train_processed.shape', X_train_processed.shape)

        print('X.shape', X.shape)
        print('X_test.shape', X_test_.shape)

    return X, X_test_, X_features

# Encode and scale train/test features with the scaler chosen in prep_config
X, X_test_, X_features = onehot_scale(X_train, X_test, scaler=prep_config['scale'])


## Feature selection

In [None]:
# (rows, encoded feature columns) of the training matrix before optional feature selection
X.shape

In [None]:
# Optional Boruta feature selection, switched on via prep_config['featureselection'].
# https://github.com/scikit-learn-contrib/boruta_py
# https://towardsdatascience.com/feature-selection-with-borutapy-f0ea84c9366


# Loading example from the BorutaPy README, kept for reference only (not used here):
# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
# X = pd.read_csv('examples/test_X.csv', index_col=0).values
# y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
# y = y.ravel()

if prep_config['featureselection'] == 'boruta': 

    print('X.shape:', X.shape)

    # random forest used by Boruta to judge feature importance:
    # all cores, class weights balanced by label frequency, shallow trees
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

    # define Boruta feature selection method
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=1)

    # find all relevant features
    feat_selector.fit(X, y)

    # boolean mask of the features Boruta selected
    print(feat_selector.support_)
    # ranking of all features
    print(feat_selector.ranking_)

    # reduce train and test matrices to the selected features
    # (overwrites X and X_test_ in place, so re-running this cell filters again)
    X = feat_selector.transform(X)
    X_test_ = feat_selector.transform(X_test_)
    
elif prep_config['featureselection'] == False: 
    # feature selection disabled: keep all encoded features
    pass
else: 
    raise NotImplementedError()

In [None]:
if prep_config['featureselection'] == 'boruta':
    # One row per encoded feature: its name, whether Boruta kept it, and its rank
    X_metadata = pd.DataFrame(
        dict(
            Features=X_features,
            support=feat_selector.support_,
            ranking=feat_selector.ranking_,
        )
    )

    print(X_metadata.support.sum())
    print(X_metadata)

    # Store the selection next to the other comparison outputs, tagged with the config
    featselector_filename = f'{config_str}_featselector.csv'
    X_metadata.to_csv(os.path.join(comparison_path, featselector_filename), sep=';')


## Oversampling to cope with class imbalance

In [None]:

# https://imbalanced-learn.org/stable/generated/imblearn.over_sampling.SMOTE.html
if prep_config['overSampling']: 
    # NOTE(review): X/y are resampled before the train/validation split and before any
    # cross-validation below, so synthetic rows may be interpolated from rows that later
    # land in a validation fold - scores with overSampling=True are likely optimistic.
    # n_jobs is not passed: it is deprecated in recent imbalanced-learn releases.
    sm = SMOTE(random_state=2, k_neighbors=5, sampling_strategy='auto')
    # fit_resample() is the current name of the former fit_sample()
    X, y = sm.fit_resample(X, y)
    # seaborn expects the data vector as keyword argument
    sns.countplot(x=y)
    plt.show()

In [None]:
# Disabled: would resolve the optional 'stratify' setting to the variable it names.
# The 'stratify' key is commented out in prep_config, so enabling this block as-is
# raises a KeyError; the split below stratifies on y unconditionally.
if False: 
    if prep_config['stratify'] == 'None': 
        stratify = None
    elif prep_config['stratify']:
        # look up the global variable whose name is stored in the config (e.g. 'y')
        stratify = globals()[prep_config['stratify']]

    stratify

In [None]:
## Train/validation split on the prepared X (75% / 25%, class proportions preserved)
X_t, X_val, y_t, y_val = train_test_split(
    X, y, random_state=42, 
    stratify=y, #stratify, 
    test_size=0.25
)
# class balance of the training part; seaborn expects the vector as keyword argument
sns.countplot(x=y_t)
plt.show()

# Modelling

In [None]:
# define which models to run: each key is checked by the cell that trains that model;
# the model objects themselves are always instantiated, only fitting/evaluation is gated
modelling_config = {
    'naive_bayes1': False,
    'naive_bayes2': False,
    'naive_bayes3': False,
    'logistic_regression_1': False,
    'logistic_regression_gs': False,
    'logistic_regression_after_gs': False ,
    'logistic_regression_rs' : False,
    'logistic_regression_after_rs' : False,
    'knn_1': False,
    'knn_gs': False,
    'knn_after_gs': False,
    'bagging_knn': False,
    'svc_1': False,
    'svc_gs': False,
    'svc_after_gs': False,
    'dt': False,
    'random_forest': False, 
    'boosted_tree': False, 
    'boosted_tree_gs': False,
    'boosted_tree_after_gs': False,
    'boosted_tree_after_gs_complete':False,
    'adaboost': False,
    'mlp': False,
    'mlp_gs': False,
    'mlp_after_gs': False, 
    'ensemble1': False,
    'ensemble2': False,
    'stacking1': False,
    'stacking2' : False,
    'stacking3' : True, 
    'stacking3_complete': True
}

# Names of the model variables evaluated fold by fold in the comparison section.
# The grid/randomized search objects (*_gs, *_rs) are commented out and thus excluded.
model_comparison = [
    'naive_bayes1',
    'naive_bayes2',
    'naive_bayes3',   
    'logistic_regression_1',
    #'logistic_regression_gs',
    'logistic_regression_after_gs',
    #'logistic_regression_rs',
    'logistic_regression_after_rs',
    'knn_1',
    #'knn_gs',
    'knn_after_gs',
    'bagging_knn',
    'svc_1',
    #'svc_gs',
    'svc_after_gs',
    'dt',
    'random_forest', 
    'boosted_tree', 
    #'boosted_tree_gs',
    'boosted_tree_after_gs',
    'adaboost',
    'mlp',
    #'mlp_gs',
    'mlp_after_gs', 
    'ensemble1',
    'ensemble2',
    'stacking1',
    'stacking2',
    'stacking3'
]



In [None]:
cross_validate = True      # additionally cross-validate the models that are run
models_to_run = 'manual'   # 'manual' | 'none' | 'all_but_gsrs'

if models_to_run == 'manual':
    # keep the hand-edited flags in modelling_config as they are
    pass
elif models_to_run == 'none':
    modelling_config = dict.fromkeys(modelling_config, False)
elif models_to_run == 'all_but_gsrs':
    # enable everything except the searches themselves (names containing _gs / _rs);
    # models built from search results (_after_gs / _after_rs) stay enabled
    modelling_config = {
        m: ('_after_gs' in m or '_after_rs' in m) or not ('_gs' in m or '_rs' in m)
        for m in modelling_config
    }
else:
    raise NotImplementedError()


In [None]:
def save_submission_csv(model, X_test_, test_data, file_no): 
    """Predict the test set with ``model`` and write the result as a submission CSV.

    Parameters
    ----------
    model : fitted estimator offering ``predict``
    X_test_ : feature matrix handed to ``model.predict``
    test_data : pd.DataFrame
        Only its index is used; it becomes the first column of the file.
    file_no : value inserted into the file name ``Group26_Version<file_no>.csv``

    The file is written into the notebook-level ``submissions_path``.
    """
    submission_df = pd.DataFrame(
        {'Income': model.predict(X_test_)}, index=test_data.index
    ).reset_index()
    target_file = os.path.join(submissions_path, f'Group26_Version{file_no}.csv')
    submission_df.to_csv(target_file, index=False)

def train_model(model, params, X_t, y_t, X_val, y_val):
    """Build ``model`` with a fixed seed, fit it and report validation metrics.

    ``model`` is an estimator class; it is instantiated with ``random_state=0``,
    ``verbose=False`` and the keyword arguments in ``params``. After fitting on
    ``(X_t, y_t)`` the classification report, the micro-averaged F1 score and
    the estimator's own score on ``(X_val, y_val)`` are printed.

    Returns the fitted estimator.
    """
    estimator = model(random_state=0, verbose=False, **params)
    estimator.fit(X_t, y_t)

    val_pred = estimator.predict(X_val)
    print(classification_report(y_val, val_pred))
    print('f1_micro:', f1_score(y_val, val_pred, average='micro'))
    print('Acc:', estimator.score(X_val, y_val))

    return estimator


In [None]:
def train_model_simple(model, params, X_t, y_t, X_val, y_val):
    """Build ``model`` from ``params`` only, fit it and report validation metrics.

    Same procedure as ``train_model`` but without forcing ``random_state`` and
    ``verbose``, for estimator classes that do not accept those arguments.

    Returns the fitted estimator.
    """
    estimator = model(**params)
    estimator.fit(X_t, y_t)

    val_pred = estimator.predict(X_val)
    print(classification_report(y_val, val_pred))
    print('f1_micro:', f1_score(y_val, val_pred, average='micro'))
    print('Acc:', estimator.score(X_val, y_val))

    return estimator

# <font color='#E8800A'>Naive Bayes</font> <a class="anchor" id="first-bullet"></a>
  [Back to TOC](#toc)

In [None]:
###Instantiating gaussian naive bayes
naive_bayes1 = GaussianNB()

if modelling_config['naive_bayes1']: 
    ###Fitting naive bayes to the training split
    naive_bayes1.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = naive_bayes1.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', naive_bayes1.score(X_t, y_t))
    print('Val acc:', naive_bayes1.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.48178571428571426
    # f1_micro: 0.44875

In [None]:

###Instantiating multinomial naive bayes
naive_bayes2 = MultinomialNB()

###MultinomialNB rejects negative feature values, so it only runs on minmax-scaled data
if modelling_config['naive_bayes2'] and prep_config['scale'] == 'minmax': 

    ###Fitting naive bayes to the training split
    naive_bayes2.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = naive_bayes2.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', naive_bayes2.score(X_t, y_t))
    print('Val acc:', naive_bayes2.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.8044642857142859
elif modelling_config['naive_bayes2']:
    ###make the skip visible instead of silently doing nothing
    print("naive_bayes2 skipped: requires prep_config['scale'] == 'minmax'")


###Instantiating complement naive bayes
naive_bayes3 = ComplementNB()

###ComplementNB, like MultinomialNB, rejects negative feature values; standard-scaled
###data contains negatives, so this model is only run on minmax-scaled data
if modelling_config['naive_bayes3'] and prep_config['scale'] == 'minmax': 

    ###Fitting naive bayes to the training split
    naive_bayes3.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = naive_bayes3.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', naive_bayes3.score(X_t, y_t))
    print('Val acc:', naive_bayes3.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.7673214285714286

    if cross_validate: 
        scores = cross_val_score(naive_bayes3, X, y, cv=10, scoring='f1_micro')
        print(f'{scores.mean()} ~ {scores.std()}')
        #0.7610267857142856 ~ 0.005905863622498532
elif modelling_config['naive_bayes3']:
    ###make the skip visible instead of failing inside fit()
    print("naive_bayes3 skipped: requires prep_config['scale'] == 'minmax'")



# <font color='#E8800A'>Logistic Regression</font> <a class="anchor" id="second-bullet"></a>
  [Back to TOC](#toc)

In [None]:
 ###Instantiating logistic regression
logistic_regression_1 = LogisticRegression()

if modelling_config['logistic_regression_1']: 

    ###Fitting logistic regression to the training split
    logistic_regression_1.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = logistic_regression_1.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', logistic_regression_1.score(X_t, y_t))
    print('Val acc:', logistic_regression_1.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.8528571428571429

In [None]:
# micro-averaged F1 as the model-selection metric
f1w = make_scorer(f1_score, average='micro')

# NOTE(review): not every penalty/solver pair in this grid is supported by
# LogisticRegression; presumably those candidates fail to fit and are reported
# without a usable score - confirm against the search output.
parameter_space = {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'C': list(range(1,10)),
        'class_weight':[{0:1.0, 1:1.0},{0:1.0, 1:50.0}],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'multi_class':['auto','ovr','multinomial']
        #dual
        #tolerance for stopping criteria
        #max_iter
        #refit
        #intercept_scaling
        #l1_ratios
    }

logistic_regression_gs = GridSearchCV(
    LogisticRegression(random_state=0, verbose=False),
    parameter_space, n_jobs=-1, scoring=f1w, cv=5, verbose=10
)

if modelling_config['logistic_regression_gs']:   

    # fixed: the search object was misspelled (`logistic_regression_gss`) -> NameError
    logistic_regression_gs.fit(X_t, y_t)
    print(f'{logistic_regression_gs.scoring}: {logistic_regression_gs.best_score_}')
    # make_scorer(f1_score, average=micro): 0.85


In [None]:
# Best parameters reported by the grid search (lr_gs.best_params_):
#   C=1, class_weight={0: 1.0, 1: 1.0}, multi_class='auto', penalty='l2', solver='liblinear'
lr_params_short = dict(
    C=1,
    class_weight={0: 1.0, 1: 1.0},
    penalty='l2',
    solver='liblinear',
    multi_class='auto',
)

logistic_regression_after_gs = LogisticRegression(random_state=0, verbose=False, **lr_params_short)

if modelling_config['logistic_regression_after_gs']:
    # fit on the training split, evaluate on the validation split
    logistic_regression_after_gs.fit(X_t, y_t)
    predicted = logistic_regression_after_gs.predict(X_val)

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro'))
    print('Acc:', logistic_regression_after_gs.score(X_val, y_val))
    # f1_micro: 0.8530357142857142

    if cross_validate:
        scores = cross_val_score(logistic_regression_after_gs, X, y, cv=10, scoring='f1_micro')
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8510714285714286 ~ 0.008151937293223808

In [None]:
###Randomized Search over the logistic-regression parameter_space defined above
###(random_state fixes which 100 candidates are drawn, so a re-run is reproducible)
logistic_regression_rs = RandomizedSearchCV(
    estimator=LogisticRegression(random_state=0, verbose=False),
    param_distributions=parameter_space,
    n_iter=100,
    random_state=0
)
if modelling_config['logistic_regression_rs']:     
    logistic_regression_rs.fit(X_t, y_t)
    
# Best parameters reported by the randomized search (rsearch.best_params_):
#   {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'ovr', 'class_weight': {0: 1.0, 1: 1.0}, 'C': 6}
# rsearch.best_score_: 0.8498809523809523
lr_params_short = dict(
    C=6,
    class_weight={0: 1.0, 1: 1.0},
    penalty='l2',
    solver='saga',
    multi_class='ovr',
)

logistic_regression_after_rs = LogisticRegression(random_state=0, verbose=False, **lr_params_short)

if modelling_config['logistic_regression_after_rs']:
    # fit on the training split, evaluate on the validation split
    logistic_regression_after_rs.fit(X_t, y_t)
    predicted = logistic_regression_after_rs.predict(X_val)

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro'))
    print('Acc:', logistic_regression_after_rs.score(X_val, y_val))
    # f1_micro: 0.8528571428571429

    if cross_validate:
        scores = cross_val_score(logistic_regression_after_rs, X, y, cv=10, scoring='f1_micro')
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8514285714285714 ~ 0.008329931278350444

# <font color='#E8800A'>KNN</font> <a class="anchor" id="third-bullet"></a>
  [Back to TOC](#toc)

In [None]:
###Instantiating k nearest neighbors 
knn_1 = KNeighborsClassifier()

if modelling_config['knn_1']: 

    ###Fitting k nearest neighbors to the training split
    knn_1.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = knn_1.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', knn_1.score(X_t, y_t))
    print('Val acc:', knn_1.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.8351785714285714

In [None]:
# micro-averaged F1 as the model-selection metric
f1w = make_scorer(f1_score, average='micro')

# candidate settings for the k-nearest-neighbours grid search
parameter_space = dict(
    n_neighbors=list(range(1, 15)),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[5, 10, 15, 30, 45],
)

knn_gs = GridSearchCV(
    KNeighborsClassifier(), parameter_space,
    n_jobs=-1, scoring=f1w, cv=5, verbose=10,
)

if modelling_config['knn_gs']:
    knn_gs.fit(X_t, y_t)
    print(f'{knn_gs.scoring}: {knn_gs.best_score_}')
    # make_scorer(f1_score, average=micro): 0.8419047619047619


In [None]:
# Best parameters reported by the grid search (knn_gs.best_params_):
#   {'algorithm': 'brute', 'leaf_size': 15, 'n_neighbors': 11, 'weights': 'uniform'}
# leaf_size is not carried over into the final model below.
knn_params_short = dict(
    algorithm='brute',
    n_neighbors=11,
    weights='uniform',
)
knn_after_gs = KNeighborsClassifier(**knn_params_short)

if modelling_config['knn_after_gs']:
    # fit on the training split, evaluate on the validation split
    knn_after_gs.fit(X_t, y_t)
    predicted = knn_after_gs.predict(X_val)

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro'))
    print('Acc:', knn_after_gs.score(X_val, y_val))
    # f1_micro: 0.8483928571428572

    if cross_validate:
        scores = cross_val_score(knn_after_gs, X, y, cv=10, scoring='f1_micro')
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8396428571428572 ~ 0.008096988606253283

In [None]:
# bagging knn

###Instantiating a bagging ensemble of k nearest neighbors classifiers.
###The base model is passed positionally because its keyword was renamed between
###scikit-learn releases (base_estimator -> estimator); positional works with both.
bagging_knn = BaggingClassifier(knn_1, random_state=0)

if modelling_config['bagging_knn']: 

    ###Fitting the bagging ensemble to the training split
    bagging_knn.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = bagging_knn.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', bagging_knn.score(X_t, y_t))
    print('Val acc:', bagging_knn.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.8469642857142857

# <font color='#E8800A'>Support Vector Machine</font> <a class="anchor" id="fourth-bullet"></a>
  [Back to TOC](#toc)

In [None]:
###Instantiating support vector classifier
svc_1 = SVC(gamma='scale')

if modelling_config['svc_1']: 

    ###Fitting support vector classifier to the training split
    svc_1.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = svc_1.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', svc_1.score(X_t, y_t))
    print('Val acc:', svc_1.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.8625

In [None]:
# micro-averaged F1 as the model-selection metric
f1w = make_scorer(f1_score, average='micro')

# candidate settings for the support-vector grid search
parameter_space = dict(
    C=[0.1, 0.5, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)

svc_gs = GridSearchCV(
    SVC(), parameter_space,
    n_jobs=-1, scoring=f1w, cv=5, verbose=10,
)

if modelling_config['svc_gs']:
    svc_gs.fit(X_t, y_t)
    print(f'{svc_gs.scoring}: {svc_gs.best_score_}')
    # make_scorer(f1_score, average=micro): 0.85375

In [None]:
# Best parameters reported by the grid search (svc_gs.best_params_):
#   {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
svc_params_short = dict(C=1, gamma='scale', kernel='rbf')

# probability=True additionally makes predict_proba available on this model
svc_after_gs = SVC(random_state=0, verbose=False, probability=True, **svc_params_short)

if modelling_config['svc_after_gs']:
    # fit on the training split, evaluate on the validation split
    svc_after_gs.fit(X_t, y_t)
    predicted = svc_after_gs.predict(X_val)

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro'))
    print('Acc:', svc_after_gs.score(X_val, y_val))
    # f1_micro:0.8625

    if cross_validate:
        scores = cross_val_score(svc_after_gs, X, y, cv=10, scoring='f1_micro')
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8551785714285713 ~ 0.012511474325430974

# <font color='#E8800A'>Decision Trees</font> <a class="anchor" id="fifth-bullet"></a>
  [Back to TOC](#toc)

In [None]:
###Instantiating Decision Tree Classifier
dt = DecisionTreeClassifier()

if modelling_config['dt']: 
    ###Fitting Decision Tree Classifier to the training split
    dt.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = dt.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', dt.score(X_t, y_t))
    print('Val acc:', dt.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.8139285714285716

# <font color='#E8800A'>Random Forest</font> <a class="anchor" id="sixth-bullet"></a>
  [Back to TOC](#toc)

In [None]:
###Instantiating Random Forest Classifier
random_forest = RandomForestClassifier(n_jobs=-1, n_estimators=500, oob_score=True, max_depth=6, random_state=42)

if modelling_config['random_forest']: 
    ###Fitting Random Forest Classifier to the training split
    random_forest.fit(X_t, y_t)
    ###Predicting on the validation split
    predicted = random_forest.predict(X_val)
    ###Accuracy on training and validation split
    ###(printed explicitly: a bare expression inside an if-block is never displayed)
    print('Train acc:', random_forest.score(X_t, y_t))
    print('Val acc:', random_forest.score(X_val, y_val))

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.8482142857142857


# <font color='#E8800A'>Boosted Trees</font> <a class="anchor" id="seventh-bullet"></a>
  [Back to TOC](#toc)

In [None]:
# Gradient boosted trees with early stopping: training halts once the score on the
# internal 10% validation fraction has not improved for 30 iterations.
boosted_tree = GradientBoostingClassifier(
    learning_rate=.1,
    random_state=0, verbose=False,
    subsample=1.0,
    n_estimators=1000, max_depth=7, min_impurity_decrease = 0.1,
    # 'sqrt' is what the former 'auto' option meant for this classifier;
    # 'auto' is no longer accepted by recent scikit-learn releases
    max_features='sqrt',
    n_iter_no_change=30, validation_fraction=0.1,
    warm_start=False
)

if modelling_config['boosted_tree']: 
    boosted_tree.fit(X_t, y_t)
    predicted = boosted_tree.predict(X_val)

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    # f1_micro: 0.8755357142857143
    print('Acc:', boosted_tree.score(X_val, y_val))

    # submission is predicted by the model fitted on the training split only
    save_submission_csv(boosted_tree, X_test_, test_data, '12')

    if cross_validate:
        scores = cross_val_score(
            boosted_tree, X, y, cv=10, scoring='f1_micro'
        )
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8708482142857144 ~ 0.006533890504710872

In [None]:
# micro-averaged F1 as the model-selection metric
f1w = make_scorer(f1_score, average='micro')

# candidate settings for the gradient-boosting grid search
parameter_space = {
        'learning_rate':[.1, .05],
        'subsample':[1, .9],
        'n_estimators':[1000], 
        'min_samples_split': [2,3],
        'max_depth': list(range(5,9)), 
        'min_impurity_decrease': [0.0, 0.1],
        # 'sqrt' is what the former 'auto' option meant for this classifier;
        # 'auto' is no longer accepted by recent scikit-learn releases
        'max_features': ['sqrt', None],
        'n_iter_no_change': [30],
        'validation_fraction': [0.1],
        'warm_start': [False], 
        'ccp_alpha': [0.0]
}

boosted_tree_gs = GridSearchCV(
    GradientBoostingClassifier(random_state=0, verbose=False),
    parameter_space, n_jobs=-1, scoring=f1w, cv=5, verbose=10
)

if modelling_config['boosted_tree_gs']: 

    # NOTE(review): this search is fitted on the full X, y, whereas the logistic
    # regression, KNN and SVC searches use the training split X_t, y_t.
    boosted_tree_gs.fit(X, y)

    print(f'{boosted_tree_gs.scoring}: {boosted_tree_gs.best_score_}')
    # make_scorer(f1_score, average=micro): 0.8710267857142856


In [None]:
# Parameters taken over from the gradient-boosting grid search (clf3.get_params())
clf3_params_short = {
        'ccp_alpha': 0.0,
        'learning_rate': 0.1,
        'max_depth': 6,
        # 'sqrt' is what the former 'auto' option meant for this classifier;
        # 'auto' is no longer accepted by recent scikit-learn releases
        'max_features': 'sqrt',
        'min_impurity_decrease': 0.1,
        'min_samples_split': 3,
        'n_estimators': 1000,
        'n_iter_no_change': 30,
        'subsample': 1.0,
        'validation_fraction': 0.1,
        'warm_start': False
    }

boosted_tree_after_gs = GradientBoostingClassifier(random_state=0, verbose=False, **clf3_params_short)

if modelling_config['boosted_tree_after_gs']: 

    # fit on the training split, evaluate on the validation split
    boosted_tree_after_gs.fit(X_t, y_t)

    predicted = boosted_tree_after_gs.predict(X_val)

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro')) 
    print('Acc:', boosted_tree_after_gs.score(X_val, y_val))

    if cross_validate:
        scores = cross_val_score(
            boosted_tree_after_gs , X, y, cv=5, scoring='f1_micro'
        )
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8706249999999999 ~ 0.004483441047058505

    # submission is predicted by the model fitted on the training split only
    save_submission_csv( boosted_tree_after_gs , X_test_, test_data, '11')


In [None]:
# Final model: refit the tuned gradient boosting on all labelled data (training plus
# validation split) and write its test predictions as submission version 18.
# This refits the boosted_tree_after_gs object in place.
if modelling_config['boosted_tree_after_gs_complete']: 
    boosted_tree_after_gs.fit(X, y)

    # save
    save_submission_csv( boosted_tree_after_gs , X_test_, test_data, '18')

### Adaboost

In [None]:

# AdaBoost with scikit-learn's default base learner and a fixed seed
adaboost = AdaBoostClassifier(random_state = 5)

if modelling_config['adaboost']:     

    # fit on the training split, evaluate on the validation split
    adaboost.fit(X_t, y_t)
    predicted = adaboost.predict(X_val)

    print(classification_report(y_val, predicted))

    # printed explicitly: a bare expression inside an if-block is never displayed
    print('Acc:', adaboost.score(X_val, y_val))

    if cross_validate:
        scores = cross_val_score(
            adaboost, X, y, cv=5, scoring='f1_micro'
        )
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8580357142857145 ~ 0.004430677062785573

# <font color='#E8800A'>Neural Networks</font> <a class="anchor" id="eighth-bullet"></a>
  [Back to TOC](#toc)

In [None]:
# Baseline multi-layer perceptron; the bracketed lists are the alternatives
# that were considered for the respective setting.
mlp = MLPClassifier(
    alpha=0.000001,
    activation='relu',              # [tanh, relu]
    random_state=0, verbose=False,
    hidden_layer_sizes=(100, ),     # [(100,), (50,50), (200,)]
    max_iter=1000, early_stopping=True,
    learning_rate_init=0.001,       # [0.05, 0.001, 0.005]
    learning_rate='constant',       # ['invscaling', 'constant']
    momentum=0.9,                   # [0.8, 0.9]
    solver='adam',                  # ['adam', 'sgd']
    beta_1=.9,                      # [.7, .8, .9]
    beta_2=.999,                    # [.9, .99, .8]
)

if modelling_config['mlp']:
    # fit on the training split, evaluate on the validation split
    mlp.fit(X_t, y_t)
    predicted = mlp.predict(X_val)

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro'))
    # f1_micro: 0.8614285714285714
    print('Acc:', mlp.score(X_val, y_val))

    if cross_validate:
        scores = cross_val_score(mlp, X, y, cv=5, scoring='f1_micro')
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8549107142857144 ~ 0.004171181652427915
            

In [None]:
# micro-averaged F1 as the model-selection metric
f1w = make_scorer(f1_score, average='micro')

# candidate settings for the MLP grid search
params = dict(
    alpha=(10.0 ** -np.arange(1, 7)).tolist(),
    activation=['tanh', 'relu'],
    hidden_layer_sizes=[(100,), (50,50), (200,), (50,50, 50 )],
    learning_rate_init=[0.05, 0.001, 0.005],
    learning_rate=['adaptive', 'constant'],
    momentum=[0.8, 0.9],
    solver=['adam'],   # ['adam', 'sgd']
    beta_1=[.7, .8, .9],
    beta_2=[.9, .99, .8],
)

mlp_gs = GridSearchCV(
    MLPClassifier(random_state=0, verbose=False, max_iter=1000, early_stopping=True),
    params,
    n_jobs=-1, scoring=f1w, cv=5, verbose=10,
)

if modelling_config['mlp_gs']:
    # the search runs on the full X, y
    mlp_gs.fit(X, y)

    print(f'{mlp_gs.scoring}: {mlp_gs.best_score_}')
    #make_scorer(f1_score, average=micro): 0.8584375


In [None]:
# Settings taken over from the MLP grid search
mlp_gs_best_params = dict(
    activation='tanh',
    alpha=0.0001,
    beta_1=0.9,
    beta_2=0.8,
    hidden_layer_sizes=(50, 50, 50),
    learning_rate='adaptive',
    learning_rate_init=0.001,
    momentum=0.8,
    solver='adam',
)

mlp_after_gs = MLPClassifier(
    random_state=0, verbose=False, max_iter=1000, early_stopping=True,
    **mlp_gs_best_params
)

if modelling_config['mlp_after_gs']:
    # fit on the training split, evaluate on the validation split
    mlp_after_gs.fit(X_t, y_t)
    predicted = mlp_after_gs.predict(X_val)

    print(classification_report(y_val, predicted))
    print('f1_micro:', f1_score(y_val, predicted, average='micro'))
    # f1_micro: 0.8546428571428571
    print('Acc:', mlp_after_gs.score(X_val, y_val))
        

# <font color='#E8800A'>Ensembles</font> <a class="anchor" id="ninth-bullet"></a>
  [Back to TOC](#toc)

### VotingClassifier

In [None]:
# Soft-voting ensemble of random forest, gradient boosting and the tuned MLP
ensemble1 = VotingClassifier(
    estimators=[('rf', random_forest), ('gbt', boosted_tree), ('mlp', mlp_after_gs)],
    voting='soft'
)

if modelling_config['ensemble1']: 
    # fit on the training split, evaluate on the validation split
    ensemble1.fit(X_t, y_t)
    predicted = ensemble1.predict(X_val)

    print(classification_report(y_val, predicted))

    # printed explicitly: a bare expression inside an if-block is never displayed
    print('Acc:', ensemble1.score(X_val, y_val))

    if cross_validate:
        scores = cross_val_score(
            ensemble1, X, y, cv=5, scoring='f1_micro'
        )
        print(f'{scores.mean()} ~ {scores.std()}')
        #0.8653125000000002 ~ 0.0036202026665559423

In [None]:
# Soft-voting ensemble of gradient boosting and a support vector classifier.
# Soft voting averages predict_proba of its members, which SVC only offers when built
# with probability=True - hence svc_after_gs (same C/kernel/gamma as svc_1, but with
# probability=True) is used here instead of svc_1.
ensemble2 = VotingClassifier(
        estimators=[('gbt', boosted_tree), ('svc', svc_after_gs)],
        voting='soft')

if modelling_config['ensemble2']:     

    # fit on the training split, evaluate on the validation split
    ensemble2.fit(X_t, y_t)
    predicted = ensemble2.predict(X_val)

    print(classification_report(y_val, predicted))

    # printed explicitly: a bare expression inside an if-block is never displayed
    print('Acc:', ensemble2.score(X_val, y_val))

    if cross_validate:
        scores = cross_val_score(
            ensemble2, X, y, cv=5, scoring='f1_micro'
        )
        print(f'{scores.mean()} ~ {scores.std()}')
        #0.8668750000000001 ~ 0.004359484148476345

### Stacking

In [None]:
# Stacking: gradient boosting and the tuned MLP as base models,
# a logistic regression as the model combining their outputs
estimators = [('gbt', boosted_tree), ('mlp', mlp_after_gs)]
stacking1 = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

if modelling_config['stacking1']: 

    # fit on the training split, evaluate on the validation split
    stacking1.fit(X_t, y_t)
    predicted = stacking1.predict(X_val)

    print(classification_report(y_val, predicted))

    # printed explicitly: a bare expression inside an if-block is never displayed
    print('Acc:', stacking1.score(X_val, y_val))

    if cross_validate:
        scores = cross_val_score(
            stacking1, X, y, cv=5, scoring='f1_micro'
        )
        print(f'{scores.mean()} ~ {scores.std()}')
        #0.8653125000000002 ~ 0.0036202026665559423

    # submission is predicted by the model fitted on the training split only
    save_submission_csv(stacking1, X_test_, test_data, '13')

In [None]:
# Stacking: gradient boosting and the tuned MLP as base models,
# the default support vector classifier as the model combining their outputs
stacking2 = StackingClassifier(
        estimators=[('gbt', boosted_tree), ('mlp', mlp_after_gs)],
        final_estimator=svc_1)

if modelling_config['stacking2']:     

    # fit on the training split, evaluate on the validation split
    stacking2.fit(X_t, y_t)
    predicted = stacking2.predict(X_val)

    print(classification_report(y_val, predicted))

    # printed explicitly: a bare expression inside an if-block is never displayed
    print('Acc:', stacking2.score(X_val, y_val))

    if cross_validate:
        scores = cross_val_score(
            stacking2, X, y, cv=5, scoring='f1_micro'
        )
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8708482142857144 ~ 0.003944266537698527

In [None]:
# Stacking: random forest, gradient boosting, tuned MLP and tuned KNN as base models,
# the default support vector classifier as the model combining their outputs
stacking3 = StackingClassifier(
    n_jobs=-1,
    estimators=[('rf', random_forest), ('gbt', boosted_tree), ('mlp', mlp_after_gs), ('knn', knn_after_gs)],
    final_estimator=svc_1)

if modelling_config['stacking3']: 

    # fit on the training split, evaluate on the validation split
    stacking3.fit(X_t, y_t)
    predicted = stacking3.predict(X_val)

    print(classification_report(y_val, predicted))

    # printed explicitly: a bare expression inside an if-block is never displayed
    print('Acc:', stacking3.score(X_val, y_val))

    if cross_validate:
        scores = cross_val_score(
            stacking3, X, y, cv=5, scoring='f1_micro'
        )
        print(f'{scores.mean()} ~ {scores.std()}')
        # 0.8701339285714285 ~ 0.0036306474305977565
        # 0.86969 ~ 0.00289 rmoutliers=True

In [None]:
# Final model: refit the stacking3 ensemble on all labelled data (training plus
# validation split) and write its test predictions as submission version 20.
# This refits the stacking3 object in place.
if modelling_config['stacking3_complete']: 
    stacking3.fit(X, y)

    # save
    save_submission_csv(stacking3 , X_test_, test_data, '20')

## Comparison

In [None]:
# compare f1 score over classes
# Disabled by default. When enabled, every model named in model_comparison is refitted
# on each of 5 stratified folds and its per-class and micro F1 are written to a CSV.
if False: 
    if models_to_run != 'none': 
        raise Warning('For comp mode "models_to_run" should be set to "none"')

    # one tuple (model, class0 F1, class1 F1, micro F1, fold) per model and fold
    class_scores = []

    kf = StratifiedKFold(n_splits=5)

    for fold_no, (train_index, test_index) in enumerate(kf.split(X, y)):
        # announce every fold (was printed only once, before the loop)
        print('***fold_no: ', fold_no)

        # rows assigned to the training and validation part of this fold
        X_tt, X_vv = X[train_index], X[test_index]
        y_tt, y_vv = y[train_index], y[test_index]

        for model in model_comparison:#[0:4]:
            print(model)
            try: 
                # look up the notebook-level model object by its variable name;
                # fit() refits that shared object in place
                model_inst = globals()[model]
                model_inst.fit(X_tt, y_tt)
                predicted = model_inst.predict(X_vv)

                # per-class F1 (rounded to 2 decimals) and micro-averaged F1
                f1 = f1_score(y_vv, predicted, average=None).round(2).tolist()
                f1_0 = f1[0]
                f1_1 = f1[1]
                f1_micro = f1_score(y_vv, predicted, average='micro')
            except Exception as e:
                # a failing model must not abort the comparison: report it with
                # context and record NaN scores for this fold
                print(f'{model} failed in fold {fold_no}: {e}')
                f1_0 = np.nan
                f1_1 = np.nan
                f1_micro = np.nan

            class_scores.append((model, f1_0, f1_1, f1_micro, fold_no))

    # build df         
    class_scores_per_fold_df = pd.DataFrame(class_scores, columns=['model', 'class0', 'class1', 'micro', 'fold_no'])

    classScores_filename = f'{config_str}_classScores.csv'
    class_scores_per_fold_df.to_csv(os.path.join(comparison_path, classScores_filename), sep=';', index=False)



In [None]:
# plot F1 score over classes
#
# Reads the per-fold class scores of one preparation config, averages them per
# model and draws two stacked panels sharing the x axis:
#   top    - mean F1 of class 0 and class 1 per model
#   bottom - relative gap (class0 - class1) / class0 per model
# The figure is saved as 'f1_per_class.png' in `explorations_path`.
if True:
    # prepare plot data
    classScores_filename = 'cardinalityOriginal_rmoutlierFalse_overSamplingFalse_scaleStandard_featureselectionFalse_classScores.csv'
    class_scores_per_fold_df = pd.read_csv(os.path.join(comparison_path, classScores_filename), sep=';')
    print(class_scores_per_fold_df)

    # mean over the folds, one row per model
    class_scores_df = class_scores_per_fold_df.groupby('model')[['class0', 'class1', 'micro']].mean()

    # absolute and relative gap between the two classes (column arithmetic
    # instead of a row-wise apply)
    class_scores_df['diff'] = class_scores_df['class0'] - class_scores_df['class1']
    class_scores_df['perc_diff'] = class_scores_df['diff'] / class_scores_df['class0']
    class_scores_df = class_scores_df.reset_index()
    print(class_scores_df)

    # one row per (model, class) so that `class` can be used as hue
    class_scores_df_wide = class_scores_df.melt(
        id_vars='model', value_vars=['class0', 'class1'], var_name='class', value_name='f1'
    )

    # plot
    fig, ax = plt.subplots(nrows=2, sharex=True)

    sns.pointplot(data=class_scores_df_wide, x='model', y='f1', hue='class', ax=ax[0], scale=.5, order=model_comparison)
    sns.pointplot(data=class_scores_df, x='model', y='perc_diff', color='black', ax=ax[1], scale=.5, order=model_comparison)

    # One call is enough: plt.xticks acts on the current axes only, and
    # neither pointplot call changes which axes is current.
    plt.xticks(rotation=90)

    plt.savefig(os.path.join(explorations_path, 'f1_per_class.png'), dpi=200, bbox_inches = "tight")

    plt.show()


In [None]:
# compare all models for the defined preparation config
#
# Cross validates every model named in `model_comparison` (5 folds, micro F1)
# and writes two CSV files to `comparison_path`:
#   '<config_str>_overview.csv'   - model, mean, sd, specs
#   '<config_str>_exceptions.csv' - model, exception message

if False:

    # check setup
    print('models_to_run:',models_to_run)

    if models_to_run != 'none':
        raise Warning('For comp mode "models_to_run" should be set to "none"')

    # cross validate models
    overview = []
    exceptions = []

    for model in model_comparison:
        print(model)
        try:
            # the estimator is looked up by its notebook-level variable name
            model_inst = globals()[model]

            # estimator repr collapsed onto a single, single-spaced line
            specs = str(model_inst).replace('\n', '')
            specs = ' '.join(specs.split())

            scores = cross_val_score(
                model_inst, X, y, cv=5, scoring='f1_micro'
            )
            mean = scores.mean()
            sd = scores.std()
            print(f'{mean} ~ {sd}')

        except Exception as e:
            # keep going with the remaining models; the failure is recorded
            # in the exceptions table and the scores are left missing
            mean = np.nan
            sd = np.nan
            specs = ''
            exceptions.append((model, str(e)))

        # Appended after (not in a `finally` of) the try block, so an
        # interrupt propagates instead of recording stale or unbound values.
        overview.append((model, mean, sd, specs))

    # prepare dfs and save to csv
    overview_filename = f'{config_str}_overview.csv'
    exceptions_filename = f'{config_str}_exceptions.csv'

    # NOTE: the row index is written on purpose - the cell that collects the
    # overview files drops the resulting 'Unnamed: 0' column after reading.
    overview_df = pd.DataFrame(overview, columns=['model', 'mean', 'sd', 'specs'])
    overview_df.to_csv(os.path.join(comparison_path, overview_filename), sep=';')

    exceptions_df = pd.DataFrame(exceptions, columns=['model', 'exception'])
    exceptions_df.to_csv(os.path.join(comparison_path, exceptions_filename), sep=';')

In [None]:
# Compare scores over all configs and models based on the csv files generated in the cell above

#-----------------------------------------------------------------------

# collect scores from csv  files

if True: 
    # Gather every '<config>_overview.csv' found in `comparison_path` into one
    # long table `scores` with an additional 'config' column.
    identifier = '_overview.csv'
    configs = {}
    frames = []

    # sorted(): os.listdir returns the entries in arbitrary order, which would
    # make the config order (and with it the plot colours) differ between runs.
    for i, path in enumerate(sorted(os.listdir(comparison_path))):
        # endswith instead of a substring test, so that only genuine overview
        # files are read (not e.g. '..._overview.csv.bak')
        if not path.endswith(identifier):
            continue

        config_desc = path[:-len(identifier)]
        configs[i] = config_desc

        tmp = pd.read_csv(os.path.join(comparison_path, path), sep=';')
        # 'Unnamed: 0' is the row index that was written along with the overview
        tmp = tmp.drop(columns=['Unnamed: 0'])
        tmp['config'] = config_desc
        frames.append(tmp)

    # concatenate once instead of growing the frame inside the loop
    scores = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    # keeps the per-file row number as column 'index' (dropped again further
    # down when the stacking3 table is built)
    scores = scores.reset_index()
    
#-----------------------------------------------------------------------

    # plot scores 
    def plot_scores(scores, explorations_path, filename, width, height, dodge): 
        """Plot the mean score per model, one point per config, with +/- sd error bars.

        Parameters
        ----------
        scores : DataFrame
            Needs the columns 'model', 'config', 'mean' and 'sd'.
        explorations_path : str
            Directory the figure is written to.
        filename : str
            Name of the PNG file, without extension.
        width, height : number
            Figure size in pixels at 200 dpi (before the tight bounding-box crop).
        dodge : bool or float
            Forwarded to ``sns.pointplot`` to separate the configs along the x axis.

        The figure is saved as '<filename>.png' and then shown; nothing is returned.
        """
        my_dpi = 200
        plt.figure(figsize=(width / my_dpi, height / my_dpi))

        # Pin the drawing order explicitly (order of first appearance), so the
        # points read back below can be mapped to their (config, model) pair.
        model_order = list(pd.unique(scores['model']))
        config_order = list(pd.unique(scores['config']))

        ax = sns.pointplot(
            data=scores, x='model', y='mean', hue='config',
            order=model_order, hue_order=config_order,
            alpha=.7, dodge=dodge, join=False, scale=.5,
        )

        # standard deviation per (config, model)
        sd_lookup = dict(zip(zip(scores['config'], scores['model']), scores['sd']))

        # Read back the dodged x/y position of every drawn point and pair it
        # with the sd of the same (config, model), instead of relying on the
        # row order of `scores` matching the drawing order.
        # NOTE(review): assumes one collection per config with one offset per
        # model, as drawn by the seaborn versions that accept `join`/`scale`.
        x_coords = []
        y_coords = []
        errors = []
        for config, point_pair in zip(config_order, ax.collections):
            for model, (x, y) in zip(model_order, point_pair.get_offsets()):
                x_coords.append(x)
                y_coords.append(y)
                errors.append(sd_lookup.get((config, model), np.nan))

        # error bars only (no markers, no connecting line), drawn behind the points
        ax.errorbar(x_coords, y_coords, yerr=errors, fmt=' ', zorder=-1, color='black', capsize=2)

        # legend above the axes, spanning the full width
        plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left',
                   ncol=1, mode="expand", borderaxespad=0., prop={'size': 6})
        plt.tight_layout()
        plt.xticks(rotation=90)
        ax.set(ylabel='mean micro f1 score')

        plt.savefig(os.path.join(explorations_path, f'{filename}.png'), dpi=my_dpi, bbox_inches = "tight")

        plt.show()
       
    # generate and save plot
    plot_scores(scores, explorations_path, 'comp_all', 1200, 1400, .4)
    
#-----------------------------------------------------------------------

    # save additional infos
    best_model_scores = scores[scores.model == 'stacking3'].drop(columns=['specs', 'model', 'index'])
    best_model_scores[['mean', 'sd']] = best_model_scores[['mean', 'sd']].apply(lambda x: np.round(x,5))
    best_model_scores.to_excel(os.path.join(explorations_path, 'stacking3.xlsx'), index=False)

    models_overview =  pd.DataFrame(scores['model'].unique(), columns=['model'])
    models_overview.to_excel(os.path.join(explorations_path, 'models_overview.xlsx'), index=False)

    configs_overview = pd.DataFrame(scores['config'].unique(), columns=['config'])
    configs_overview.to_excel(os.path.join(explorations_path, 'configs_overview.xlsx'), index=False)
    
#-----------------------------------------------------------------------

    # TABLE scores_comp
    # every (model, config) result, best mean first; 'sd/mean' is derived from
    # the already rounded values
    scores_comp = scores[['model', 'mean', 'sd', 'config']].sort_values(by='mean', ascending=False)
    scores_comp[['mean', 'sd']] = scores_comp[['mean', 'sd']].round(5)
    scores_comp['sd/mean'] = scores_comp['sd'].div(scores_comp['mean'])
    scores_comp.to_excel(os.path.join(explorations_path, 'scores_comp.xlsx'), index=False)

#-----------------------------------------------------------------------

    # PLOT stableTop10
    # the ten (model, config) rows with the smallest sd/mean ratio
    stableTop10 = scores_comp.sort_values('sd/mean').head(10)

    # number of those rows contributed by each config
    stableTop10Configs = stableTop10.groupby('config').size().rename('nobs').reset_index()

    # left-merge onto the full list of configs, so that configs without any
    # row in the top 10 still show up, with a count of 0
    configs = pd.DataFrame(scores['config'].unique(), columns=['config'])
    stableTop10Configs = configs.merge(stableTop10Configs, how='left', on='config')
    stableTop10Configs['nobs'] = stableTop10Configs['nobs'].fillna(0).astype(int)
    stableTop10Configs = stableTop10Configs.sort_values('nobs', ascending=False)

    # 1500 x 1000 px at 200 dpi
    my_dpi = 200
    plt.figure(figsize=(1500 / my_dpi, 1000 / my_dpi))
    sns.barplot(data=stableTop10Configs, x='nobs', y='config')
    plt.yticks(fontsize=8)

    plt.tight_layout()
    plt.savefig(os.path.join(explorations_path, 'stableTop10Configs.png'), dpi=200)

    # show the figure, as the other plotting cells do after saving
    plt.show()

#-----------------------------------------------------------------------

    # split config description
    # one column per '_'-separated part of the config string; the shape is
    # printed first, then every part except the last one
    splits = configs['config'].str.split(pat='_', expand=True)
    print(splits.shape)
    all_but_last_part = splits.iloc[:, :-1]
    print(all_but_last_part)

    #['_'.join(line) for line in splits[0:3]]

#-----------------------------------------------------------------------

    # config with highest score per model
    # Row mask: True where a row's mean equals the best mean of its model.
    # 'max' is passed by name; handing the builtin `max` to transform is deprecated.
    idx = scores_comp.groupby('model')['mean'].transform('max') == scores_comp['mean']
    # printed explicitly - a bare expression inside an `if` block is not echoed
    print(scores_comp[idx])

#-----------------------------------------------------------------------
