# 1. Package Import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 2. Data Load

In [None]:
# Path of the churn dataset, relative to the notebook's working directory.
DATA_PATH = "../input/churn-modelling/Churn_Modelling.csv"
df = pd.read_csv(DATA_PATH)

# 3. Data Preprocessing

## 3.1. EDA

In [None]:
# Separate the predictors from the target column 'Exited'.
X = df.drop('Exited', axis=1)
y = df[['Exited']]  # double brackets keep y as a one-column DataFrame

# Print the same quick-look summary for the features and for the target.
data = [X, y]
for datum in data:
    print(datum.head())
    print()
    print(datum.isnull().sum())
    print()
    print(datum.describe())
    print()
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would only append a stray "None" line.
    datum.info()
    print()
    # Correlate numeric columns only: X also holds text columns (e.g. Gender),
    # which make DataFrame.corr() raise on pandas >= 2.0.
    print(datum.select_dtypes(include='number').corr())
    print()
    print(datum.columns)
    print('*  ' * 30)

In [None]:
# Columns named for deletion (not referenced again in the visible cells).
del_cols = [
    'RowNumber',
    'CustomerId',
    'Surname',
]
# Columns treated as continuous: checked for outliers and standardised later.
num_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'EstimatedSalary',
]
# Columns treated as categorical: one-hot encoded later.
cat_cols = [
    'Geography',
    'Gender',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
]
# Target column, kept as a list so selecting with it yields a DataFrame.
y_col = ['Exited']

## 3.2. Detect Outliers

In [None]:
# Put the target back next to the features so that dropping a row removes it
# from both at the same time.
train = pd.concat(objs=[X, y], axis='columns')

def Counter(x):
    """Count how many times each element occurs in the iterable ``x``.

    Returns a plain ``dict`` mapping every distinct element to its number of
    occurrences; keys appear in the order the elements were first seen.
    """
    tally = {}
    for item in x:
        # get() supplies 0 the first time an element is encountered.
        tally[item] = tally.get(item, 0) + 1
    return tally

def detect_outliers(df, n, features, factor=1.5):
    """Return index labels of rows that are IQR outliers in at least ``n`` features.

    For every column in ``features`` the quartiles Q1 and Q3 are computed and a
    row is flagged when its value lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        Minimum number of columns in which a row must be flagged for its
        index label to be returned.
    features : iterable of str
        Names of the numeric columns to check.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fence distance.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection.
    """
    flag_counts = {}
    for col in features:
        values = df[col]
        # nanpercentile ignores missing values; plain percentile would return
        # NaN quartiles and therefore silently flag nothing in that column.
        q1, q3 = np.nanpercentile(values, [25, 75])
        fence = (q3 - q1) * factor
        is_outlier = (values < q1 - fence) | (values > q3 + fence)
        for label in df.index[is_outlier.to_numpy()]:
            flag_counts[label] = flag_counts.get(label, 0) + 1
    return [label for label, hits in flag_counts.items() if hits >= n]

# Drop every row flagged as an outlier in at least one numeric column and
# renumber the remaining rows from 0.
train = train.drop(detect_outliers(train, 1, num_cols)).reset_index(drop=True)
print(X.shape, train.shape)

# Split back into features and target by column name instead of by position,
# so the result does not depend on the target being the last column. The
# explicit copy makes y_train independent of `train`; drop() already returns a
# new frame, so later column assignments on X_train do not write into a slice.
X_train = train.drop(columns=y_col)
y_train = train[y_col].copy()

## 3.3. Encoding

In [None]:
# Normalise the Gender labels (trim surrounding whitespace, lower-case) so that
# differently formatted spellings collapse into one category. assign() returns
# a new frame rather than writing into what may be a slice of `train`.
X_train = X_train.assign(Gender=X_train['Gender'].str.strip().str.lower())
for col in cat_cols:
    print(X_train[col].unique())

# Fit the encoder on the cleaned training features. Fitting on the raw `X`
# would learn the un-normalised Gender labels; with handle_unknown='ignore',
# every label that the normalisation above changed would then be encoded as an
# all-zero row, i.e. the Gender information would be lost without any error.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[cat_cols])
X_train_enc = pd.DataFrame(
    enc.transform(X_train[cat_cols]).toarray(),
    columns=enc.get_feature_names_out(),
    index=X_train.index,  # keep rows aligned with X_train for a column-wise concat
)

# 4. Data Set Split

In [None]:
# Combine the numeric columns with the one-hot columns into the model matrix.
# NOTE: this rebinds X and y, which earlier cells bound to the raw features
# and target.
X = pd.concat(objs=[X_train[num_cols], X_train_enc], axis='columns')
y = y_train[y_col]

# Stratified hold-out split: 33% of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

# 5. Data Scaling

In [None]:
# Work on explicit copies: depending on the library versions, the frames
# returned by train_test_split may be tracked by pandas as derived from X, and
# assigning columns into them can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn mean and standard deviation from the training rows only, then apply
# that same transform to both splits so no test-set statistics reach the scaler.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 6. Data Modeling

In [None]:
# Baseline 1: a shallow random forest (max_depth=2).
rf1 = RandomForestClassifier(
    max_depth=2,
    random_state=0,
    n_jobs=-1,
)
# The one-column target frame is flattened to a 1-D array before fitting.
rf1.fit(X_train, y_train.values.ravel())
rf1_pred = rf1.predict(X_test)

# Baseline 2: XGBoost with only the seed and the parallelism set explicitly.
xgb1 = XGBClassifier(
    random_state=0,
    n_jobs=-1,
)
xgb1.fit(X_train, y_train)
xgb1_pred = xgb1.predict(X_test)

# 7. Model Score

In [None]:
# Macro-averaged F1 of both baseline models on the held-out test split.
rf1_score, xgb1_score = (
    f1_score(y_test, predictions, average='macro')
    for predictions in (rf1_pred, xgb1_pred)
)
print(f'Before Tuning -> random forest score: {rf1_score}, xgboost score: {xgb1_score}')

# 8. Hyperparameter Tuning

In [None]:
# Exhaustive 5-fold grid search for the random forest
# (3 * 3 * 3 * 3 = 81 parameter combinations), scored by macro F1.
rf1_parameters = dict(
    max_depth=[2, 4, 8],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[2, 4, 6],
    n_estimators=[100, 500, 1000],
)
rf1_cv = GridSearchCV(
    estimator=rf1,
    param_grid=rf1_parameters,
    scoring='f1_macro',
    cv=5,
)
rf1_cv.fit(X_train, y_train.values.ravel())
print(rf1_cv.best_params_)

# Same procedure for XGBoost (2 * 3 * 3 * 3 = 54 parameter combinations).
xgb1_parameters = dict(
    colsample_bytree=[0.5, 1],
    learning_rate=[0.1, 0.01, 0.2],
    max_depth=[3, 6, 9],
    n_estimators=[100, 500, 1000],
)
xgb1_cv = GridSearchCV(
    estimator=xgb1,
    param_grid=xgb1_parameters,
    scoring='f1_macro',
    cv=5,
)
xgb1_cv.fit(X_train, y_train)
print(xgb1_cv.best_params_)

In [None]:
# Rebuild both models from the hyper-parameters the grid searches selected
# (rf1_cv / xgb1_cv), instead of hand-copied values that silently go stale
# whenever the data, the grids or the library versions change.
clf2 = RandomForestClassifier(**rf1_cv.best_params_, random_state=0, n_jobs=-1)
clf2.fit(X_train, y_train.values.ravel())
clf2_pred = clf2.predict(X_test)
clf2_score = f1_score(y_test, clf2_pred, average = 'macro')

xgb2 = XGBClassifier(**xgb1_cv.best_params_, random_state=0, n_jobs=-1)
xgb2.fit(X_train, y_train)
xgb2_pred = xgb2.predict(X_test)
xgb2_score = f1_score(y_test, xgb2_pred, average = 'macro')
print(f'After Tuning -> random forest score: {clf2_score}, xgboost score: {xgb2_score}')