In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import precision_score, recall_score, accuracy_score, auc, f1_score, confusion_matrix, log_loss
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import normalize, RobustScaler
import missingno as msn

In [None]:
# Load the three competition tables from the shared input directory.
DATA_DIR = '/kaggle/input/icr-identify-age-related-conditions'
train, test, greek = (
    pd.read_csv(f'{DATA_DIR}/{table}.csv') for table in ('train', 'test', 'greeks')
)

In [None]:
# Preview the first five rows of the raw training table.
train.head(5)

In [None]:
# Drop the row identifier so it does not end up among the model features.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because 'Id' has already been removed.
train = train.drop(columns=['Id'], errors='ignore')

In [None]:
# The target classes are imbalanced.
# Possible remedies: upsample the minority class, downsample the majority class,
# or pass class weights to the models.
train['Class'].hist()

In [None]:
# missingno bar chart of the training table; most of the missing values belong to EL and BQ.
msn.bar(train)

In [None]:
# Median-impute every column that contains missing values, printing its name
# and dtype before filling and a confirmation afterwards.
cols_with_gaps = [col for col in train.columns if train[col].isna().sum() > 0]
for col in cols_with_gaps:
    print(col, train[col].dtype)
    train[col] = train[col].fillna(train[col].median())
    print('Filled')

In [None]:
# Sanity check: after imputation the per-column missing-value counts should all be zero.
missing_per_column = train.isna().sum()
missing_per_column.hist()

In [None]:
# The per-feature means and standard errors differ widely from each other,
# so the features have to be brought onto a common scale before modelling.
# numeric_only=True skips the categorical 'EJ' column: recent pandas versions
# raise TypeError when asked for the mean/SEM of a non-numeric column.
fig, ax = plt.subplots(2)
ax[0].plot(train.sem(axis=0, numeric_only=True))
ax[0].set_title('Standard error of the mean per feature')
ax[1].plot(train.mean(axis=0, numeric_only=True))
ax[1].set_title('Mean per feature')
fig.tight_layout()

In [None]:
# Box plot of every column except 'EJ'. The data contains outliers, so an
# outlier-resistant standardization must be used.
box_columns = list(train.drop(['EJ'], axis=1))
n_grid_cols = 5
# Ceiling division: the grid always has enough cells for every column instead
# of relying on a hard-coded row count.
n_grid_rows = max(1, -(-len(box_columns) // n_grid_cols))
fig = plt.figure(figsize=(15, 35))
for position, column in enumerate(box_columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    plt.ylabel(column)
    sns.boxplot(data=train, y=column)
plt.show()

In [None]:
# Split the columns into the numerical features and the categorical part
# ('EJ' together with the target 'Class').
train_numerical = train.drop(['EJ', 'Class'], axis=1)
# .copy() makes this an independent frame, so overwriting its 'EJ' column with
# the encoded values later on does not raise SettingWithCopyWarning.
train_categorial = train[['EJ', 'Class']].copy()

In [None]:
# RobustScaler removes the median and scales the data according to the quantile
# range (defaults to the IQR: the range between the 25th and 75th percentiles),
# which makes it resistant to outliers.

# Keep the fitted scaler in a variable so the identical transformation can be
# re-applied to other data, and keep the original row index so the later
# index-based join with the categorical columns lines up row by row.
scaler = RobustScaler()
scaled_train_numerical = pd.DataFrame(
    scaler.fit_transform(train_numerical),
    columns=list(train_numerical),
    index=train_numerical.index,
)

In [None]:
# Summary statistics of the scaled numerical features.
scaled_train_numerical.describe()

In [None]:
# Dummy-encode 'EJ'; drop_first=True discards the first level's indicator
# (presumably leaving a single 0/1 column - confirm 'EJ' has exactly two levels).
ej_dummies = pd.get_dummies(train_categorial['EJ'], drop_first=True)
train_categorial['EJ'] = ej_dummies

In [None]:
# Preview the encoded categorical frame ('EJ' and 'Class').
train_categorial.head(5)

In [None]:
# Re-assemble the modelling table: scaled numerical features joined (on the row index)
# with the encoded 'EJ' column and the target 'Class'.
train_final = scaled_train_numerical.join(train_categorial)

In [None]:
def cross_validation(model, _X, _y, _cv=5):
    '''Run K-fold cross-validation and collect train/validation metrics.

    Parameters
    ----------
    model : estimator
        The machine learning algorithm to evaluate.
    _X : array-like
        Feature matrix.
    _y : array-like
        Target variable.
    _cv : int, default=5
        Number of cross-validation folds.

    Returns
    -------
    dict
        Per-fold scores and their means for 'accuracy', 'precision', 'recall'
        and 'f1', first for the training folds and then for the validation
        folds. The mean accuracies are expressed in percent (multiplied by
        100); every other mean is left as a fraction.
    '''
    # scoring name -> label used in the result keys (insertion order matters).
    metric_labels = {'accuracy': 'Accuracy', 'precision': 'Precision',
                     'recall': 'Recall', 'f1': 'F1'}
    cv_out = cross_validate(estimator=model,
                            X=_X,
                            y=_y,
                            cv=_cv,
                            scoring=list(metric_labels),
                            return_train_score=True)

    report = {}
    for split_key, split_label in (('train', 'Training'), ('test', 'Validation')):
        for metric, label in metric_labels.items():
            fold_scores = cv_out[f'{split_key}_{metric}']
            report[f'{split_label} {label} scores'] = fold_scores

            mean_value = fold_scores.mean()
            if metric == 'accuracy':
                # Only accuracy is reported as a percentage.
                mean_value = mean_value * 100
            mean_key = f'Mean {split_label} {label}'
            if metric == 'f1':
                # The F1 mean key carries an extra ' Score' suffix.
                mean_key += ' Score'
            report[mean_key] = mean_value
    return report

In [None]:
def _fold_label(position):
    '''Return the tick label for a 1-based fold number, e.g. 1 -> "1st Fold".'''
    if 10 <= position % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return f'{position}{suffix} Fold'


def plot_result(x_label, y_label, plot_title, train_data, val_data):
    '''Plot a grouped bar chart of per-fold training and validation scores.

    Parameters
    ----------
    x_label : str
        Name of the algorithm used for training, e.g. 'Decision Tree'.
    y_label : str
        Name of the metric being visualized, e.g. 'Accuracy'.
    plot_title : str
        Title of the chart.
    train_data : list or array
        Per-fold training scores (one value per fold).
    val_data : list or array
        Per-fold validation scores (one value per fold).

    Returns
    -------
    None
        The chart is drawn and shown with matplotlib.

    Raises
    ------
    ValueError
        If train_data and val_data do not contain the same number of folds.
    '''
    n_folds = len(train_data)
    if len(val_data) != n_folds:
        raise ValueError('train_data and val_data must have the same number of folds')

    # One label per fold actually supplied, so any number of folds can be plotted.
    labels = [_fold_label(k) for k in range(1, n_folds + 1)]
    X_axis = np.arange(n_folds)

    plt.figure(figsize=(8, 4))
    # The y-axis starts at 0.4 to magnify the differences between folds.
    plt.ylim(0.40000, 1)
    plt.bar(X_axis - 0.2, train_data, 0.4, color='blue', label='Training')
    plt.bar(X_axis + 0.2, val_data, 0.4, color='red', label='Validation')
    plt.title(plot_title, fontsize=30)
    plt.xticks(X_axis, labels)
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Calculate class weights as inverse class frequencies: n_samples / (n_classes * count).
# np.bincount returns the counts ordered by label value, so index 0 is the
# negative class (label 0) and index 1 the positive class (label 1). Unpacking
# them in the opposite order would swap the weights and up-weight the majority class.
neg, pos = np.bincount(train_final.Class)
n_samples = len(train_final.Class)
class_weights = {0: n_samples / (2 * neg), 1: n_samples / (2 * pos)}
print(class_weights)

In [None]:
# Separate the feature matrix from the target column.
y = train_final['Class']
X = train_final.drop(columns=['Class'])

In [None]:
# Hyper-parameter grid for logistic regression: L2 penalty, seven values of C
# spaced logarithmically from 1e-3 to 1e3, and computed vs. equal class weights.
log_param = {
    'penalty': ['l2'],
    'C': np.logspace(-3, 3, num=7),
    'class_weight': [class_weights, {0: 0.5, 1: 0.5}],
}

In [None]:
# 5-fold grid search over the logistic-regression grid.
log_reg_base = LogisticRegression(solver='lbfgs', max_iter=3000)
log_reg_search = GridSearchCV(estimator=log_reg_base, param_grid=log_param, cv=5)
log_reg = log_reg_search.fit(X, y)

In [None]:
# Cross-validate the best logistic-regression configuration and show all metrics.
log_reg_res = cross_validation(model=log_reg.best_estimator_, _X=X, _y=y)
log_reg_res

In [None]:
# Per-fold accuracy of the logistic regression.
plot_result(
    x_label='log_reg',
    y_label='Accuracy',
    plot_title='Accuracy by Folds',
    train_data=log_reg_res['Training Accuracy scores'],
    val_data=log_reg_res['Validation Accuracy scores'],
)

In [None]:
# Per-fold recall of the logistic regression - recall is very low.
plot_result(
    x_label='log_reg',
    y_label='Recall',
    plot_title='Recall by Folds',
    train_data=log_reg_res['Training Recall scores'],
    val_data=log_reg_res['Validation Recall scores'],
)

In [None]:
# Hyper-parameter grid for the random forest.
rf_param = dict(
    n_estimators=range(100, 300, 50),
    criterion=['gini', 'entropy'],
    max_depth=range(3, 7),
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3],
    class_weight=[class_weights, {0: 0.5, 1: 0.5}],
)

In [None]:
# Grid search for the random forest.
# random_state makes the (otherwise randomly seeded) forests reproducible across
# runs; cv=5 states the fold count explicitly, like the other searches in this
# notebook; n_jobs=-1 spreads the large grid over all available cores.
rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=rf_param,
    cv=5,
    n_jobs=-1,
).fit(X, y)

In [None]:
# Best random-forest hyper-parameters found by the grid search.
rf.best_params_

In [None]:
# Cross-validate the best random-forest configuration and show all metrics.
rf_res = cross_validation(model=rf.best_estimator_, _X=X, _y=y)
rf_res

In [None]:
# Per-fold accuracy of the random forest.
plot_result(
    x_label='rf_res',
    y_label='Accuracy',
    plot_title='Accuracy by Folds',
    train_data=rf_res['Training Accuracy scores'],
    val_data=rf_res['Validation Accuracy scores'],
)

In [None]:
# Per-fold recall of the random forest - recall is very low.
plot_result(
    x_label='rf_res',
    y_label='Recall',
    plot_title='Recall by Folds',
    train_data=rf_res['Training Recall scores'],
    val_data=rf_res['Validation Recall scores'],
)

In [None]:
# Hyper-parameter grid for the KNN classifier.
# 'p' (the Minkowski power) is listed exactly once: the dict literal used to
# contain the key twice, so the first value (1, 2) was silently discarded in
# favour of [1, 2, 3, 4, 5].
# NOTE(review): 'leaf_size' holds the three literal values 20, 40 and 1 - it is
# not a range(start, stop, step). Confirm that this is the intended grid.
knn_param = {
    'n_neighbors': [1, 3, 5, 7, 9],
    'leaf_size': [20, 40, 1],
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3, 4, 5],
}

In [None]:
# 5-fold grid search over the KNN grid.
knn_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_param, cv=5)
knn = knn_search.fit(X, y)

In [None]:
# Best KNN hyper-parameters found by the grid search.
knn.best_params_

In [None]:
# Cross-validate the best KNN configuration and show all metrics.
knn_res = cross_validation(model=knn.best_estimator_, _X=X, _y=y)
knn_res

In [None]:
# Per-fold accuracy of the KNN classifier.
plot_result(
    x_label='knn_res',
    y_label='Accuracy',
    plot_title='Accuracy by Folds',
    train_data=knn_res['Training Accuracy scores'],
    val_data=knn_res['Validation Accuracy scores'],
)

In [None]:
# Per-fold recall of the KNN classifier. This cell used to repeat the accuracy
# chart verbatim; recall is shown instead, matching the accuracy/recall pair
# plotted for the other models.
plot_result('knn_res', 'Recall', 'Recall by Folds',
             knn_res['Training Recall scores'],
             knn_res['Validation Recall scores'])

In [None]:
# Hyper-parameter grid for XGBoost.
xgb_param = dict(
    max_depth=range(3, 7),
    n_estimators=range(50, 200, 50),
    gamma=[0, 0.1, 0.2],
    learning_rate=[0.1, 0.01, 0.05],
    reg_alpha=[0, 1e-2, 1, 1e1],
    reg_lambda=[0, 1e-2, 1, 1e1],
)

In [None]:
# 5-fold grid search over the XGBoost grid.
xgb_search = GridSearchCV(estimator=XGBClassifier(), param_grid=xgb_param, cv=5)
xgb = xgb_search.fit(X, y)

In [None]:
# Best XGBoost hyper-parameters found by the grid search.
xgb.best_params_

In [None]:
# Mean cross-validated score of the best XGBoost configuration. This cell used
# to display best_params_ a second time, duplicating the previous cell.
xgb.best_score_

In [None]:
# Cross-validate the best XGBoost configuration.
xgb_res = cross_validation(model=xgb.best_estimator_, _X=X, _y=y)

In [None]:
# Per-fold accuracy of XGBoost.
plot_result(
    x_label='xgb_res',
    y_label='Accuracy',
    plot_title='Accuracy by Folds',
    train_data=xgb_res['Training Accuracy scores'],
    val_data=xgb_res['Validation Accuracy scores'],
)

In [None]:
# Per-fold recall of XGBoost.
plot_result(
    x_label='xgb_res',
    y_label='Recall',
    plot_title='Recall by Folds',
    train_data=xgb_res['Training Recall scores'],
    val_data=xgb_res['Validation Recall scores'],
)

In [None]:
# Per-fold precision of XGBoost.
plot_result(
    x_label='xgb_res',
    y_label='Precision',
    plot_title='Precision by Folds',
    train_data=xgb_res['Training Precision scores'],
    val_data=xgb_res['Validation Precision scores'],
)

In [None]:
# Per-fold F1 score of XGBoost.
plot_result(
    x_label='xgb_res',
    y_label='F1',
    plot_title='F1 by Folds',
    train_data=xgb_res['Training F1 scores'],
    val_data=xgb_res['Validation F1 scores'],
)

In [None]:
# Hyper-parameter grid for CatBoost.
cat_param = dict(
    depth=list(range(4, 9)),
    learning_rate=[0.02, 0.03, 0.04],
    iterations=range(100, 300, 50),
    l2_leaf_reg=range(1, 9, 2),
)

In [None]:
# 5-fold grid search over the CatBoost grid.
# verbose=0 silences CatBoost's per-iteration training log, which would
# otherwise be printed for every fit of the search and bury the notebook in
# output; random_seed makes the fits reproducible across runs.
cat = GridSearchCV(
    CatBoostClassifier(verbose=0, random_seed=42),
    param_grid=cat_param,
    cv=5,
).fit(X, y)

In [None]:
# Cross-validate the best CatBoost configuration.
cat_res = cross_validation(model=cat.best_estimator_, _X=X, _y=y)

In [None]:
# Per-fold accuracy of CatBoost.
plot_result(
    x_label='cat_res',
    y_label='Accuracy',
    plot_title='Accuracy by Folds',
    train_data=cat_res['Training Accuracy scores'],
    val_data=cat_res['Validation Accuracy scores'],
)

In [None]:
# Per-fold recall of CatBoost.
plot_result(
    x_label='cat_res',
    y_label='Recall',
    plot_title='Recall by Folds',
    train_data=cat_res['Training Recall scores'],
    val_data=cat_res['Validation Recall scores'],
)

In [None]:
# Per-fold precision of CatBoost.
plot_result(
    x_label='cat_res',
    y_label='Precision',
    plot_title='Precision by Folds',
    train_data=cat_res['Training Precision scores'],
    val_data=cat_res['Validation Precision scores'],
)

In [None]:
# Per-fold F1 score of CatBoost.
plot_result(
    x_label='cat_res',
    y_label='F1',
    plot_title='F1 by Folds',
    train_data=cat_res['Training F1 scores'],
    val_data=cat_res['Validation F1 scores'],
)

In [None]:
# Prepare the test set with exactly the same steps that were applied to the
# training set, so the model receives features on the same scale, with the
# same encoding and in the same column order as the matrix it was fitted on.
# NOTE: this cell overwrites `test`; re-load the CSV before running it again.
numeric_cols = list(train_numerical)

# Impute missing values with the training medians (the training frame was
# median-imputed as well; statistics must not be taken from the test set).
test_num = test[numeric_cols].fillna(train_numerical.median())

# Scale with a RobustScaler fitted on the training numerics - the same fit that
# produced the scaled training features - instead of fitting a new scaler on
# the test rows.
train_fitted_scaler = RobustScaler().fit(train_numerical)
test_num = pd.DataFrame(
    train_fitted_scaler.transform(test_num),
    columns=numeric_cols,
    index=test.index,
)

# Encode 'EJ' against the levels seen in the raw training column, so the same
# indicator column is produced even when a level is absent from the test rows.
# NOTE(review): relies on train['EJ'] still holding the raw category values.
ej_levels = sorted(train['EJ'].unique())
test_ej = pd.get_dummies(
    pd.Categorical(test['EJ'], categories=ej_levels), drop_first=True
)
test_cat = test[['Id']].copy()
test_cat['EJ'] = test_ej.iloc[:, 0].to_numpy()

# 'Id' first, then the features in the training order: numerical columns, then 'EJ'.
test = test_cat.join(test_num)[['Id'] + numeric_cols + ['EJ']]

In [None]:
# Display the prepared test frame.
test

In [None]:
# Load the sample submission, which serves as the template for the output file.
SUBMISSION_TEMPLATE = '/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv'
submission = pd.read_csv(SUBMISSION_TEMPLATE)

In [None]:
# Fill both probability columns with the best CatBoost model's predictions
# for the test features ('Id' is not a feature and is dropped first).
test_features = test.drop(columns=['Id'])
submission[['class_0', 'class_1']] = cat.best_estimator_.predict_proba(test_features)

In [None]:
# Write the submission file to the working directory, without the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)