In [None]:
import pandas as pd
import numpy as np
import os
import re


import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler


from datetime import datetime
from pandas_profiling import ProfileReport


In [None]:
# definitions
# Folder that holds the raw input workbooks.
datain_path = 'data/'

# Folder that receives generated reports and figures. exist_ok=True makes the
# creation idempotent and avoids the check-then-create race of os.path.exists().
explorations_path = 'explorations/'
os.makedirs(explorations_path, exist_ok=True)

In [None]:
# Registry of available workbooks: index = dataset name, column 'path' = file name.
datasets = pd.DataFrame(
    {'path': ['Train.xlsx', 'Test.xlsx']},
    index=pd.Index(['train', 'test'], name='name'),
)

# Dataset processed by this run of the notebook.
dataset_name = 'train'
dataset_path = datasets.loc[dataset_name, 'path']


In [None]:

# Load the workbook selected via dataset_name and preview its first rows.
raw_file = os.path.join(datain_path, dataset_path)
data = pd.read_excel(raw_file)
data.head()

# Explorations

## profile report

# Only the Pearson correlation is computed for the report; the others are switched off.
correlation_names = ("pearson", "spearman", "kendall", "phi_k", "cramers")
correlation_settings = {
    name: {"calculate": name == "pearson"} for name in correlation_names
}

profile = ProfileReport(
    data,
    title='Raw data',
    minimal=False,
    correlations=correlation_settings,
)
# Write the HTML report next to the other exploration artefacts.
report_file = os.path.join(explorations_path, 'profile_data_raw.html')
profile.to_file(report_file)


In [None]:
def get_feature_imp_by_expl(data, base_col):
    """Tabulate observations per (feature level, Income class).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``base_col`` and an ``Income`` column coded 0/1.
    base_col : str
        Name of the feature whose levels are tabulated.

    Returns
    -------
    table : pandas.DataFrame
        Columns ``[base_col, 'Income', 'nobs', 'nobs_rel']``. ``nobs`` is the
        number of rows for the combination (0 when the combination does not
        occur) and ``nobs_rel`` is that count divided by the total count of
        the feature level. Rows are sorted by ``nobs`` then ``base_col``,
        both descending.
    value_cols : list of str
        ``['nobs', 'nobs_rel']``, the columns holding values to plot.
    """
    # Missing feature values are excluded: groupby drops them anyway, so they
    # would only produce empty 0/0 rows.
    levels = data[base_col].dropna().unique()

    # make sure every combination of levels exist, fill with 0 if no obs
    full_index = pd.MultiIndex.from_product(
        [levels, [0, 1]],
        names=[base_col, 'Income']
    )
    counts = (
        data.groupby([base_col, 'Income'])
        .size()
        .reindex(full_index, fill_value=0)
        .to_frame('nobs')
    )

    # Share within each feature level. Summing the group avoids integer
    # indexing (x[0] + x[1]), which is a label lookup -- and therefore a
    # KeyError -- whenever the feature levels themselves are integers.
    level_totals = counts.groupby(level=base_col)['nobs'].transform('sum')
    counts['nobs_rel'] = counts['nobs'] / level_totals

    value_cols = counts.columns.to_list()
    counts = counts.reset_index()
    counts = counts.sort_values(['nobs', base_col], ascending=False)
    return counts, value_cols

def plot_feature_imp_by_expl(data, base_col):
    """Plot the per-level Income breakdown of ``base_col`` next to the overall Income counts.

    One bar chart is drawn per value column returned by
    ``get_feature_imp_by_expl`` (split by Income), followed by a narrow count
    plot of Income over the whole frame. The figure is shown immediately.
    """
    table, value_cols = get_feature_imp_by_expl(data, base_col)
    n_plots = len(value_cols) + 1

    fig, axes = plt.subplots(
        ncols=n_plots,
        figsize=(20, 7),
        gridspec_kw={'width_ratios': [3, 3, 1]},
    )

    # One wide panel per value column, tick labels rotated for readability.
    for axis, col in zip(axes, value_cols):
        sns.barplot(data=table, x=base_col, y=col, hue='Income', ax=axis)
        axis.tick_params(labelrotation=45)

    # Last (narrow) panel: overall class distribution.
    sns.countplot(data=data, x='Income', ax=axes[-1])
    plt.show()
    
def get_target_ratio(data):
    """Return the share of observations whose ``Income`` equals 1.

    Rows with a missing ``Income`` are ignored. The ratio is computed directly
    from the column, so it is also defined when only one of the two classes is
    present (0.0 or 1.0); the previous count-table lookup raised KeyError then.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``Income`` column coded 0/1.

    Returns
    -------
    float
        Fraction of non-missing rows with ``Income == 1`` (NaN for an empty frame).
    """
    income = data['Income'].dropna()
    return float((income == 1).mean())


def get_feature_imp_by_target_ratio(data, base_col, weighted=False):
    """Rank the levels of ``base_col`` by how far their class-1 share is from the overall share.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``base_col`` and ``Income``.
    base_col : str
        Feature whose levels are ranked.
    weighted : bool, default False
        When true, each absolute deviation is multiplied by the cube root of
        the level's share of all observations.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature level and sorted by ``diff_to_target_abs``
        (descending), with the cumulative sum of that column and its
        min-max scaled version. A summary is also printed.
    """
    target_ratio = get_target_ratio(data)
    expl_table, _ = get_feature_imp_by_expl(data, base_col)

    # Total number of observations per feature level.
    level_sizes = data.groupby([base_col]).size().to_frame().rename(columns={0: 'nobs'})

    # Share of class-1 observations within each feature level.
    class1_rows = expl_table.loc[expl_table.Income == 1, [base_col, 'nobs_rel']]
    class1_share = class1_rows.set_index(base_col).rename(columns={'nobs_rel': 'class1_ratio'})

    ratio_per_level = pd.concat([class1_share, level_sizes], axis=1)
    ratio_per_level['nobs_rel'] = ratio_per_level.nobs / sum(ratio_per_level.nobs)

    # Signed deviation from the overall class-1 share, plus its direction label.
    ratio_per_level['diff_to_target'] = ratio_per_level['class1_ratio'] - target_ratio
    ratio_per_level['diff_to_target_dir'] = [
        'neg' if obs < 0 else 'pos' for obs in ratio_per_level['diff_to_target']
    ]

    weights = np.power(ratio_per_level['nobs_rel'], 1./3) if weighted else 1
    ratio_per_level['diff_to_target_abs'] = abs(ratio_per_level['diff_to_target']) * weights

    # Largest deviations first, then accumulate and rescale the running total.
    ratio_per_level.sort_values('diff_to_target_abs', ascending=False, inplace=True)
    ratio_per_level['diff_to_target_abs_cumsum'] = ratio_per_level.diff_to_target_abs.cumsum()

    cumsum_column = ratio_per_level['diff_to_target_abs_cumsum'].values.reshape(-1, 1)
    scaler = MinMaxScaler()
    ratio_per_level['diff_to_target_abs_cumsum_scaled'] = scaler.fit_transform(cumsum_column)

    print('TargetClass1_ratio', target_ratio)
    print(ratio_per_level[['diff_to_target_dir', 'diff_to_target_abs']])

    return ratio_per_level


def plot_feature_imp_by_target_ratio(data, base_col, weighted=False):
    """Line-plot the scaled cumulative deviation per level of ``base_col`` and show it."""
    ratios = get_feature_imp_by_target_ratio(data, base_col, weighted)

    # Feature levels on the y axis, scaled cumulative deviation on the x axis.
    sns.lineplot(data=ratios, x='diff_to_target_abs_cumsum_scaled', y=ratios.index)
    plt.show()
    
    
def plot_feature_imp_by_tree(data, base_col):
    """Fit a decision tree on the one-hot encoded ``base_col`` and plot its feature importances.

    The tree is trained and scored on the same rows, so the printed score is a
    training accuracy. Importances are drawn as a horizontal bar chart in
    ascending order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``base_col`` and the target column ``Income``.
    base_col : str
        Categorical feature to evaluate.
    """
    # One indicator column per level, named "<base_col>_<level>".
    # The former OneHotEncoder pass was dropped: its result was overwritten by
    # this call, and it used get_feature_names(), which newer scikit-learn
    # releases no longer provide.
    X_train = pd.get_dummies(data[base_col], prefix=base_col)
    y_train = data.Income

    # train
    dt_gini = DecisionTreeClassifier(random_state=1)
    dt_gini.fit(X_train, y_train)
    print('Score:', dt_gini.score(X_train, y_train))

    # Plot importances from least to most important.
    sorted_idx = dt_gini.feature_importances_.argsort()
    plt.barh(X_train.columns[sorted_idx], dt_gini.feature_importances_[sorted_idx])
    plt.xlabel("Feature Importance")

    plt.show()

    
def plot_feature_imp(data, base_col, force_barplot=True, weighted=False):
    """Show all feature-importance views for ``base_col``.

    Always draws the exploratory bar charts and the tree-based importances.
    The target-ratio plot is drawn when the feature has fewer than six
    distinct values or when ``force_barplot`` is set.
    """
    plot_feature_imp_by_expl(data, base_col)

    few_levels = data[base_col].nunique() < 6
    if few_levels | force_barplot:
        plot_feature_imp_by_target_ratio(data, base_col, weighted)

    plot_feature_imp_by_tree(data, base_col)

In [None]:
# Quick check of the plotting helper. `base_col` is only assigned in later
# cells, so referring to it here raised NameError on a fresh kernel; the
# column is therefore named explicitly.
# NOTE(review): 'Native Continent' is the first feature analysed below -- confirm
# this is the column the check was meant for.
plot_feature_imp_by_expl(data, 'Native Continent')

In [None]:
# Scratch check: row-slice [0:10] of a 5-row frame.
# NOTE(review): looks like a leftover experiment -- consider deleting this cell.
pd.DataFrame({'s': list(range(5))})[0:10]

In [None]:
# Column dtypes and non-null counts of the data as loaded.
data.info()

In [None]:
# init: bookkeeping lists filled by the feature cells below
cols_to_drop = list()      # source columns to remove once their derived feature exists
cols_to_onehot = list()    # derived categorical columns to one-hot encode later

# prep: how aggressively categorical features are collapsed
pred_config = dict(cardinality='low')  # low, medium, high

cardinality = pred_config['cardinality']
cardinality

In [None]:
# extract gender from name?!
# The first whitespace-separated token of Name is taken as the salutation.
salutation = data.Name.str.split(' ', n=1, expand=True)[0]
if salutation.nunique() != 3:
    raise ValueError('Unexpected levels of salutation')

print(salutation.value_counts())

# 1 for 'Mr.', 0 for 'Mrs.' / 'Miss'; any other (or missing) salutation becomes NaN.
data['male'] = salutation.map({'Mr.': 1, 'Mrs.': 0, 'Miss': 0})

# Fail loudly if a salutation could not be mapped. A ValueError is raised
# (instead of the Warning class) to match the check above.
if data.male.isna().any():
    raise ValueError('NAs introduced')

# Guarded so that re-running the cell does not register the column twice.
if 'Name' not in cols_to_drop:
    cols_to_drop.append('Name')


In [None]:

# Overall distribution of the derived 'male' flag.
sns.countplot(x='male', data=data)
plt.show()

# Same distribution, split by the Income target.
sns.countplot(x='male', hue=data.Income, data=data)
plt.show()

In [None]:
# Compute age from Birthday

# clean whitespaces
data.Birthday = data.Birthday.str.replace(' ', '')
# define date format: full month name, day, four-digit year (no separators but the comma)
dob_format = '%B%d,%Y'
# reference date against which ages are computed (parsed once, not per row)
age_reference_date = datetime.strptime('2048-12-31', '%Y-%m-%d').date()

# transform Birthday to datetime, catching the leap year error


## helper fct to subtract one day from datetime if error occurs
def subone(obj):
    """Regex replacement callback: return the matched number minus one, as text."""
    val = int(obj.group(0))
    return str(val - 1)


def parse_birthday(text, fmt=dob_format, max_shift=3):
    """Parse ``text`` with ``fmt``, moving an impossible day back to the last valid one.

    Returns
    -------
    (datetime.date, bool)
        The parsed date and whether the day had to be shifted.

    Raises
    ------
    NotImplementedError
        For any parsing error other than an out-of-range day, or if the date
        is still invalid after ``max_shift`` one-day corrections.
    """
    candidate = text
    for attempt in range(max_shift + 1):
        try:
            return datetime.strptime(candidate, fmt).date(), attempt > 0
        except ValueError as err:
            # NOTE(review): relies on the exact wording of the ValueError
            # message -- confirm against the Python version in use.
            if str(err) != 'day is out of range for month':
                raise NotImplementedError('Do not know how to deal with that error!') from err
            # The month is spelled out, so the first number in the text is the day.
            candidate = re.sub(r'\d{1,2}', subone, candidate, count=1)
    raise NotImplementedError('Do not know how to deal with that error!')


## init and loop over dates
dob = []
error_log = []  # (original text, corrected date) for every shifted birthday
for d in data.Birthday:
    dt, shifted = parse_birthday(d)
    if shifted:
        error_log.append((d, dt))
    dob.append(dt)

# add age column: completed years between birthday and the reference date
data['age'] = [np.floor((age_reference_date - d).days / 365.2425) for d in dob]

# inspect
sns.histplot(data, x='age')
plt.show()
print('Min age:' , min(data.age))

# drop date col
cols_to_drop.append('Birthday')


In [None]:
# Re-inspect dtypes and non-null counts after the 'male' and 'age' columns were added.
data.info()

In [None]:
# 'Native Continent' to bin
base_col = 'Native Continent'
target_col = 'from_europe_or_asia'

plot_feature_imp(data, base_col, weighted=False)

# Binary flag: 1 when the native continent is Europe or Asia, 0 otherwise.
flagged_continents = ['Europe', 'Asia']
data[target_col] = data[base_col].isin(flagged_continents).astype('int64')

sns.countplot(x=target_col, hue='Income', data=data)
plt.show()

cols_to_drop.append(base_col)


In [None]:
# Marital Status
base_col = 'Marital Status'

# Not displayed: only the last expression of a cell is rendered.
data[base_col].value_counts()

plot_feature_imp(data, base_col, weighted=False)

if cardinality != 'low':
    # Keep a multi-level status; "spouse in the army" is folded into "Married".
    target_col = 'maritalStatus'
    mapping = dict([
        ('Married', 'Married'),
        ('Single', 'Single'),
        ('Divorced', 'Divorced'),
        ('Separated', 'Separated'),
        ('Widow', 'Widow'),
        ('Married - Spouse Missing', 'SpouseMissing'),
        ('Married - Spouse in the Army', 'Married'),
    ])

    data[target_col] = data[base_col].map(mapping)
    cols_to_onehot.append(target_col)
else:
    # Low cardinality: a single married / not-married flag.
    target_col = 'maritalStatus_married'
    married_levels = ['Married', 'Married - Spouse in the Army']
    data[target_col] = data[base_col].isin(married_levels).astype('int64')

sns.countplot(x=target_col, hue='Income', data=data)
plt.show()

cols_to_drop.append(base_col)


In [None]:
# Lives with
base_col = 'Lives with'
print(data[base_col].value_counts())
plot_feature_imp(data, base_col, weighted=False)

if cardinality != 'low':
    # Collapse the household composition into four groups.
    target_col = 'household'
    mapping = dict([
        ('Wife', 'Partner'),
        ('Other Family', 'Family'),
        ('Children', 'Children'),
        ('Alone', 'Alone'),
        ('Husband', 'Partner'),
        ('Other relatives', 'Family'),
    ])

    print(mapping)

    data[target_col] = data[base_col].map(mapping)
    cols_to_onehot.append(target_col)
else:
    # Low cardinality: flag whether the person lives with a partner.
    target_col = 'household_livesWithPartner'
    partner_levels = ['Wife', 'Husband']
    data[target_col] = data[base_col].isin(partner_levels).astype('int64')

sns.countplot(x=target_col, hue='Income', data=data)
plt.show()

cols_to_drop.append(base_col)


In [None]:
# 'Base Area' to bin
base_col = 'Base Area'

plot_feature_imp(data, base_col, weighted=False)

# Kept feature: 1 when the base area is Northbury, 0 otherwise.
data['basearea_northbury'] = (data[base_col] == 'Northbury').astype('int64')
sns.countplot(x='basearea_northbury', hue='Income', data=data)
plt.show()

# Exploration only: the Fanfoss flag lives on a copy and is not added to `data`.
test = data[['Income', base_col]].copy()
test['basearea_fanfoss'] = (test[base_col] == 'Fanfoss').astype('int64')
sns.countplot(x='basearea_fanfoss', hue='Income', data=test)
plt.show()

cols_to_drop.append(base_col)

In [None]:
# Education Level
base_col = 'Education Level'
target_col = 'education'

# Lookup table translating each raw education level ('name') into coarser
# codings; the sheet is selected by keyword because newer pandas releases
# deprecate passing it positionally.
edu_mapping = pd.read_excel(
    os.path.join(datain_path, 'edu_mapping_2.xlsx'),
    sheet_name='Tabelle2',
)
mapping_options = ['level_0', 'level_1', 'numeric']
m_option = mapping_options[0]

# Two-column lookup: raw level name -> chosen coding, renamed to the target column.
mapping = edu_mapping[['name', m_option]].rename(columns={m_option: target_col})

print(mapping)

# validate='many_to_one' makes the merge fail instead of silently duplicating
# rows of `data` when a level name appears more than once in the lookup.
data = data.merge(
    mapping,
    left_on=base_col,
    right_on='name',
    how='left',
    validate='many_to_one',
)
data.drop(columns=['name'], inplace=True)

sns.histplot(data=data, x=target_col)
plt.show()

cols_to_drop.append(base_col)
#cols_to_onehot.append(target_col)


data[[base_col, target_col]]

In [None]:
# years of education: kept as is, only renamed to a code-friendly column name
base_col = 'Years of Education'
target_col = 'education_years'
data = data.rename(columns={base_col: target_col})

data.head()
#sns.histplot(data=data, y=target_col)

In [None]:
# Employment Sector
# Collapse the raw employment sectors into a few groups; levels not listed in
# the mapping become NaN through Series.map.
base_col = 'Employment Sector'
target_col = 'empl_sector'

print(data[base_col].value_counts())

# NOTE(review): the first key ends with a trailing space -- confirm it matches
# the raw value exactly, otherwise that level is mapped to NaN.
# NOTE(review): 'delete' is only a label here; no row removal is visible in
# this notebook -- confirm where those rows are meant to be dropped.
mapping = {
    'Private Sector - Services ': 'private',
    'Self-Employed (Individual)': 'self',
    'Public Sector - Others': 'public',
    '?': 'unknown',
    'Private Sector - Others': 'private',
    'Self-Employed (Company)': 'self',
    'Public Sector - Government': 'public',
    'Unemployed': 'delete',
    'Never Worked': 'delete'
    }

print(mapping)
    
data[target_col] = data[base_col].map(mapping)

sns.countplot(data=data, x=target_col)
plt.show()

# The raw column is replaced by the grouped one, which is one-hot encoded later.
cols_to_drop.append(base_col)
cols_to_onehot.append(target_col)

In [None]:
# role
# Collapse the raw job roles into a few groups; levels not listed in the
# mapping become NaN through Series.map.
base_col = 'Role'
target_col = 'empl_role'

# NOTE(review): 'blue_collor' looks like a misspelling of 'blue_collar'. It is a
# runtime category label, so renaming it changes downstream column names --
# change it only together with every consumer.
mapping = {
    'Professor': 'Professor',
    'Management': 'Management',
    'Repair & constructions': 'blue_collor',
    'Administratives': 'Administratives',
    'Sales': 'Administratives',
    'Other services': 'Services',
    'Machine Operators & Inspectors': 'blue_collor',
    '?': 'unknown',
    'Transports': 'blue_collor',
    'Cleaners & Handlers': 'blue_collor',
    'Agriculture and Fishing': 'blue_collor',
    'IT': 'Administratives',
    'Security': 'blue_collor',
    'Household Services': 'blue_collor',
    'Army': 'blue_collor'
}

print(data[base_col].value_counts())

print(mapping)
    
data[target_col] = data[base_col].map(mapping)

sns.countplot(data=data, x=target_col)
plt.show()

# The raw column is replaced by the grouped one, which is one-hot encoded later.
cols_to_drop.append(base_col)
cols_to_onehot.append(target_col)

In [None]:
# Working Hours per week: kept as is, only renamed to a code-friendly column name
base_col = 'Working Hours per week'
target_col = 'working_hrs_week'

data = data.rename(columns={base_col: target_col})

sns.histplot(x=target_col, data=data)
plt.show()

data.head()



In [None]:
# Money Received
base_col = 'Money Received'
target_col = 'group_b_received_money'

# Binary flag: 1 for every value different from 0, 0 otherwise.
data[target_col] = (data[base_col] != 0).astype('int64')

cols_to_drop.append(base_col)

sns.countplot(x=target_col, data=data)
plt.show()

data[[base_col, target_col]]



In [None]:
# Ticket Price
base_col = 'Ticket Price'
target_col = 'group_c_payed'

# Binary flag: 1 for every value different from 0, 0 otherwise.
data[target_col] = (data[base_col] != 0).astype('int64')

cols_to_drop.append(base_col)

sns.countplot(x=target_col, data=data)
plt.show()

data[[base_col, target_col]]

In [None]:
# drop cols: remove every source column registered by the feature cells above
data = data.drop(columns=cols_to_drop)

In [None]:
## profile report

# Only the Pearson correlation is computed for the report; the others are switched off.
correlation_names = ("pearson", "spearman", "kendall", "phi_k", "cramers")
correlation_settings = {
    name: {"calculate": name == "pearson"} for name in correlation_names
}

profile = ProfileReport(
    data,
    title=f'Cleaned data {dataset_name}',
    minimal=False,
    correlations=correlation_settings,
)
# One report per dataset name, written next to the other exploration artefacts.
report_file = os.path.join(explorations_path, f'profile_data_cleaned_{dataset_name}.html')
profile.to_file(report_file)


# Explorations

In [None]:
# Number of missing values per column after cleaning.
data.isna().sum()

In [None]:
# target distribution
# Count plot of the Income classes.

sns.countplot(data=data, x='Income')
plt.show()

In [None]:
# Prepare figure
fig = plt.figure(figsize=(10, 8))

# Obtain correlation matrix, rounded to 2 decimals. Only numeric / boolean
# columns are used: the frame still holds string categoricals, on which
# DataFrame.corr() raises in pandas versions that no longer drop them silently.
numeric_data = data.select_dtypes(include=['number', 'bool'])
corr = np.round(numeric_data.corr(method="pearson"), decimals=2)

# Build annotation matrix (values with |r| >= 0.5 will appear annotated in the plot).
# np.where keeps the correlation where the mask is true and an empty string
# elsewhere, yielding an array of strings -- hence fmt='s' below.
mask_annot = np.absolute(corr.values) >= 0.5
annot = np.where(mask_annot, corr.values, np.full(corr.shape, ""))

# Plot heatmap of the correlation matrix
sns.heatmap(data=corr, annot=annot, cmap=sns.diverging_palette(220, 10, as_cmap=True),
            fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5)

# Layout
fig.subplots_adjust(top=0.95)
fig.suptitle("Correlation Matrix", fontsize=20)

plt.savefig(os.path.join(explorations_path, 'correlation_matrix.png'), dpi=200)
plt.show()

In [None]:
# distributions: one histogram per column, split by Income, laid out on a grid

ncols = 4
n_plots = data.shape[1]
nrows = int(np.ceil(n_plots/ncols))

# squeeze=False always yields a 2-D array of axes, so the layout also works
# when all columns fit into a single row.
fig, ax = plt.subplots(ncols=ncols, nrows=nrows, figsize=(15,13), squeeze=False)

# ax.flat walks the grid row by row, matching the column order of the frame.
for axis, col in zip(ax.flat, data.columns):
    print(col)
    sns.histplot(data=data, hue=data.Income, x=col, ax=axis, bins=30).set_title(col)
    axis.tick_params(labelrotation=45)

# Hide the unused trailing panels of the grid.
for axis in ax.flat[n_plots:]:
    axis.set_visible(False)

fig.tight_layout()

plt.savefig(os.path.join(explorations_path, 'distributions.png'), dpi=200)
plt.show()


# Feature Engineering ideas
- age + household: age diff to mean of hh group
- 

### Imputations: 
- empl_sector == unknown

In [None]:


# one hot encode 
#pd.get_dummies(data=data, columns=cols_to_onehot)

# Data Preparation

In [None]:
# prep config: switches for the (not yet implemented) preparation steps below

prep_config = dict(
    upscaling=False,
    normalize=False,
)

## upscaling to cope with class imbalance 

## Normalizing data 