In [None]:
# Import the necessary modules for data manipulation and visual representation
import os

import matplotlib as matplot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
%matplotlib inline

from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.preprocessing import LabelEncoder
try:
    # Legacy location, removed in scikit-learn 0.20. It is tried first so that
    # code written against the old constructor keeps working wherever the
    # module is still available.
    from sklearn.cross_validation import StratifiedKFold
except ImportError:
    from sklearn.model_selection import StratifiedKFold
# Show what is in the input directory; the CSV files read below are loaded from it.
print(os.listdir("../input"))

### Data import

In [None]:
# Load train.csv, test.csv and the sample submission from the input directory.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv',
                     '../input/test.csv',
                     '../input/sample_submission.csv')
)

In [None]:
# Target column used for training and evaluation.
y = train['is_promoted']

In [None]:
# Report the (rows, columns) of both raw frames.
shape_report = "Train shape: {}\n Test shape: {}\n".format(train.shape, test.shape)
print(shape_report)

In [None]:
# Class balance: count of each is_promoted value divided by the number of
# training rows (shows whether the dataset is imbalanced).
promo_rate = train['is_promoted'].value_counts().div(len(train))
promo_rate

In [None]:
# Stack the train and test features into one frame so every transformation
# below is applied to both sets consistently. The id column and the target are
# dropped first.
# ignore_index=True gives the combined frame a unique 0..n-1 index. Both CSVs
# are read with the default index, so without it the test rows would repeat the
# train labels and any later label-aligned operation would be ambiguous.
df = pd.concat([train.drop(['employee_id', 'is_promoted'], axis=1),
                test.drop('employee_id', axis=1)],
               axis=0, ignore_index=True)
# Row counts are kept so df can be split back positionally (train rows first).
ntrain = train.shape[0]
ntest = test.shape[0]

In [None]:
# Dimensions of the combined train + test feature frame.
df.shape

In [None]:
# Preview the first rows of the combined feature frame.
df.head()

In [None]:
# Per-column flag: True where the column contains at least one missing value.
df.isnull().any()

### Feature Engineering

In [None]:
def _ratio_to_group_mean(frame, group_col, value_col):
    """Return value_col divided by its mean within each group_col group."""
    group_mean = frame.groupby(group_col)[value_col].transform('mean')
    return frame[value_col] / group_mean


# Department/region interaction: the two string columns concatenated into one key.
df['dept_region'] = df['department'] + df['region']

# length_of_service expressed as a fraction of age. train gets the same column
# because the EDA cells further down summarise train directly.
df['work_fraction'] = df['length_of_service'] / df['age']
train['work_fraction'] = train['length_of_service'] / train['age']

# Rating and training score relative to the group average. Despite the "_mean"
# suffix, these columns hold value / group mean, not the mean itself.
# This runs before missing ratings are filled, so rows whose
# previous_year_rating is missing get NaN in the rating ratios.
df['department_rating_mean'] = _ratio_to_group_mean(df, 'department', 'previous_year_rating')
df['department_score_mean'] = _ratio_to_group_mean(df, 'department', 'avg_training_score')

df['dept_region_rating_mean'] = _ratio_to_group_mean(df, 'dept_region', 'previous_year_rating')
df['dept_region_score_mean'] = _ratio_to_group_mean(df, 'dept_region', 'avg_training_score')

### Replace NA values with `'NA'` for `education` and `0` for `previous_year_rating` 

In [None]:
# Assign the filled columns back instead of calling fillna(inplace=True) on
# df.<column>. That form is chained assignment: it operates on an intermediate
# Series, triggers a warning on recent pandas, and leaves df unchanged once
# Copy-on-Write is enabled.
df['education'] = df['education'].fillna('NA')
df['previous_year_rating'] = df['previous_year_rating'].fillna(0)

In [None]:
# Frequency of each education level, including the 'NA' placeholder filled in above.
df.education.value_counts()

In [None]:
# Inspect the column dtypes; the non-numeric columns are collected as
# categorical features in the next cell.
df.dtypes

In [None]:
# Every column whose dtype is not numeric is treated as categorical.
_is_numeric = pd.api.types.is_numeric_dtype
cats = [name for name, dtype in df.dtypes.items() if not _is_numeric(dtype)]

In [None]:
# Columns the dtype filter did not pick up that should still be treated as
# categorical. Only names not already listed are appended, so re-running this
# cell does not duplicate entries (duplicates would end up in cats_idx).
extra_cats = ['KPIs_met >80%', 'awards_won?', 'previous_year_rating']
cats = cats + [col for col in extra_cats if col not in cats]

In [None]:
# Show the categorical column list built so far.
cats

### Basic EDA

In [None]:
# Summary statistics of the numeric columns, one row per column, with the
# 'count' statistic left out.
df.describe().T.drop('count', axis=1)

In [None]:
# Overview of summary (promoted VS not promoted): mean of every numeric column
# per is_promoted value. numeric_only=True skips the string columns explicitly;
# recent pandas raises on them instead of silently dropping them.
promo_summary = train.drop('employee_id', axis=1).groupby('is_promoted')
promo_summary.mean(numeric_only=True).T

In [None]:
# Correlation matrix of the training columns (id excluded). Only numeric/bool
# columns are kept before calling corr(): recent pandas raises on string
# columns instead of silently ignoring them.
corr = (train.drop('employee_id', axis=1)
             .select_dtypes(include=['number', 'bool'])
             .corr())
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values).set_title('Correlation Matrix (train)');

In [None]:
# avg_training_score distribution per is_promoted value, overlaid on the same
# axes. Labels, legend and title are added so the colours can be read without
# looking at the code.
# NOTE(review): distplot is deprecated in recent seaborn releases; it is kept
# here to stay compatible with the environment this notebook targets.
sns.distplot(train.avg_training_score[train.is_promoted==1], color='blue',
             label='is_promoted = 1')
sns.distplot(train.avg_training_score[train.is_promoted==0], color='red',
             label='is_promoted = 0')
plt.legend()
plt.title('avg_training_score by is_promoted');

In [None]:
# Try additionally binning the avg_training_score.
# Bins are left-closed: [0, 45), [45, 55), ... [85, 95), [95, inf).
# The open-ended last bin is needed because pd.cut returns NaN for any value
# outside the edges; with 95 as the final edge every score >= 95 would become
# NaN. Codes 0-4 are unchanged; scores >= 95 now get code 5.
score_bin_edges = [0, 45, 55, 65, 75, 85, 95, np.inf]
avg_score = pd.cut(df['avg_training_score'],
                   score_bin_edges,
                   labels=False,  # return integer bin codes rather than interval labels
                   right=False)
df['avg_training_score_bin'] = avg_score

In [None]:
# Treat the binned score as categorical. The membership check keeps this cell
# idempotent: re-running it does not append the name a second time.
if 'avg_training_score_bin' not in cats:
    cats = cats + ['avg_training_score_bin']

In [None]:
# Bar chart of row counts per department over the combined train + test frame.
dept_ax = sns.countplot(x='department', data=df)
dept_ax.set_title('Employee Department Distribution')
# Tilt the department names so they do not overlap.
plt.xticks(rotation=-45);

In [None]:
# Horizontal bars: row count per department in train, split by is_promoted.
f, ax = plt.subplots(figsize=(15, 5))
promo_ax = sns.countplot(y="department", hue='is_promoted', data=train, ax=ax)
promo_ax.set_title('Employee Department Promotion');

### Prepare the categorical variables

In [None]:
# Replace every categorical column with integer codes, in place.
# One encoder object is re-fitted per column, so after the loop `le` only holds
# the classes of the last column processed.
le = LabelEncoder()
for cat_col in cats:
    df[cat_col] = le.fit_transform(df[cat_col])

In [None]:
# Positional index of each categorical column. The model is trained on a plain
# value matrix, so it is told about categorical columns by position.
cats_idx = list(map(df.columns.get_loc, cats))

In [None]:
# Show the positional indices passed to the model as cat_features.
cats_idx

### Model training

In [None]:
# CatBoost configuration shared by every fold and every repeated run.
catboost_params = dict(
    iterations=800,
    cat_features=cats_idx,         # column positions to treat as categorical
    use_best_model=True,           # needs an eval_set at fit time (cat_train supplies one)
    silent=True,
    random_state=23,
    eval_metric='F1',
    od_type='Iter',                # od_type / od_wait configure the overfitting detector
    od_wait=200,
    loss_function='CrossEntropy',
)
# Left disabled in the original configuration:
#   depth = 3, bagging_temperature = 1, l2_leaf_reg = 10
clf = CatBoostClassifier(**catboost_params)

In [None]:
# Split the combined frame back into train and test by position and convert
# each part to a plain value matrix.
# NOTE(review): .values on a frame with mixed numeric dtypes upcasts to one
# common dtype; confirm the installed CatBoost accepts the categorical columns
# in that form.
train_rows, test_rows = df.iloc[:ntrain], df.iloc[ntrain:]
st_train, st_test = train_rows.values, test_rows.values

In [None]:
# Number of stratified cross-validation folds used by cat_train.
n_folds = 10

In [None]:
def scoring(y_true, y_hat):
    """Find the probability cut-off that maximises the F1 score.

    Sweeps the cut-offs 0.0, 0.005, 0.010, ... (1.0 itself is excluded) and
    labels a sample positive when its score is >= the cut-off.

    Parameters
    ----------
    y_true : array-like
        True labels, passed to f1_score unchanged.
    y_hat : array supporting element-wise ``>=``
        Predicted scores.

    Returns
    -------
    tuple
        (best F1, cut-off that produced it). On ties the lowest cut-off wins.
    """
    cutoffs = np.arange(0.0, 1.0, 0.005)
    f1_per_cutoff = [f1_score(y_true, y_hat >= cutoff) for cutoff in cutoffs]
    return np.max(f1_per_cutoff), cutoffs[np.argmax(f1_per_cutoff)]

In [None]:
def cat_train(random_state):
    """Run one shuffled, stratified K-fold pass of the CatBoost model.

    Reads the notebook globals ``clf``, ``st_train``, ``st_test``, ``y``,
    ``n_folds``, ``ntrain`` and ``ntest``, and calls ``scoring``.

    For every fold the model is fitted on the training part with the held-out
    part as ``eval_set``, the F1-optimal cut-off is searched on that same
    held-out part, and the test set is labelled with that cut-off. The test
    labels of all folds are combined by majority vote.

    Note: the cut-off is tuned on the fold it is then scored on (and that fold
    is also the eval_set), so the printed F1 values are optimistic.

    Parameters
    ----------
    random_state : int
        Seed for the fold shuffling.

    Returns
    -------
    tuple
        ``(X_proba, X_preds, preds, score, f1)``: out-of-fold probabilities,
        out-of-fold 0/1 labels, majority-voted 0/1 test labels, overall
        out-of-fold ROC AUC, and overall out-of-fold F1.
    """
    # Imported here so the function does not depend on which StratifiedKFold
    # the top-of-notebook import resolved to. The legacy
    # sklearn.cross_validation class took (y, n_folds=...) and was removed in
    # scikit-learn 0.20.
    from sklearn.model_selection import StratifiedKFold

    # Plain array: the fold indices are positions, not index labels.
    y_arr = np.asarray(y)

    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    X_proba = np.zeros(ntrain)
    X_preds = np.zeros(ntrain)
    preds = np.zeros(ntest)
    for i, (tr, ts) in enumerate(cv.split(st_train, y_arr)):
        mod = clf.fit(st_train[tr], y_arr[tr],
                      eval_set=(st_train[ts], y_arr[ts]))
        X_proba[ts] = mod.predict_proba(st_train[ts])[:, 1]
        score = roc_auc_score(y_arr[ts], X_proba[ts])

        _, best_th = scoring(y_arr[ts], X_proba[ts])
        X_preds[ts] = (X_proba[ts] >= best_th) * 1
        f1 = f1_score(y_arr[ts], X_preds[ts])

        # Each fold's model casts one 0/1 vote per test row.
        pred = mod.predict_proba(st_test)[:, 1]
        preds += (pred >= best_th) * 1

        print("fold {}, ROC AUC: {:.3f}, F1 Score: {:.5f}".format(i, score, f1))

    score = roc_auc_score(y_arr, X_proba)
    f1 = f1_score(y_arr, X_preds)
    print("Full ROC AUC: {:.3f}, F1 Score: {:.5f}".format(score, f1))
    # Positive when at least half of the fold models voted positive
    # (equals the former hard-coded 5 when n_folds == 10).
    preds = (preds >= n_folds / 2.0) * 1
    return X_proba, X_preds, preds, score, f1

In [None]:
# Repeat the whole cross-validation with ten different shuffle seeds (40..49)
# and collect the outputs of every run, one list entry per run.
X_proba_folds, X_preds_folds, preds_folds = [], [], []
auc_folds, f1_folds = [], []

for seed in range(40, 50):
    X_proba, X_preds, preds, score, f1 = cat_train(seed)
    X_proba_folds.append(X_proba)
    X_preds_folds.append(X_preds)
    preds_folds.append(preds)
    auc_folds.append(score)
    f1_folds.append(f1)

### Model evaluation

In [None]:
# Arithmetic mean of the out-of-fold probabilities across the repeated runs.
X_proba = np.asarray(X_proba_folds).mean(axis=0)

In [None]:
# Evaluate the blended probabilities: AUC on the raw values, F1 after rounding
# each probability to the nearest of 0 / 1.
score = roc_auc_score(y, X_proba)
hard_labels = np.round(X_proba).astype(int)
f1 = f1_score(y, hard_labels)
print("Total ROC AUC: {:.3f}, F1 Score: {:.5f}".format(score, f1))

In [None]:
# Geometric mean of the out-of-fold probabilities over ALL repeated runs.
# Working on a stacked copy fixes three problems of the in-place loop:
#   * the loop stopped one run short, so the last run never entered the product;
#   * `*=` on an alias overwrote X_proba_folds[0], corrupting later cells;
#   * the exponent was written as 1/10, which is integer division on Python 2.
proba_stack = np.asarray(X_proba_folds, dtype=float)
X_proba = np.prod(proba_stack, axis=0) ** (1.0 / len(proba_stack))

In [None]:
# Evaluate the current X_proba blend: AUC on the raw values, F1 after rounding
# each probability to the nearest of 0 / 1.
score = roc_auc_score(y, X_proba)
rounded_labels = np.round(X_proba).astype(int)
f1 = f1_score(y, rounded_labels)
print("Total ROC AUC: {:.3f}, F1 Score: {:.5f}".format(score, f1))

In [None]:
# Majority vote across runs: each run's out-of-fold probabilities are cut at
# that run's own F1-optimal threshold (searched on the full training target),
# and a row is positive when at least half of the runs vote positive.
# The run count comes from the collected results instead of a hard-coded 10 / 5.
n_runs = len(X_proba_folds)
votes = np.zeros(len(y), dtype=int)

for run_proba in X_proba_folds:
    _, best_th = scoring(y, run_proba)
    votes += (run_proba >= best_th) * 1

X_preds = (votes >= n_runs / 2.0) * 1
f1 = f1_score(y, X_preds)
print("F1 Score: {:.5f}".format(f1))

In [None]:
# Majority vote over the 0/1 out-of-fold labels of every run (one DataFrame row
# per run). The column-wise sum replaces a Python-level sum applied row by row,
# and the threshold is half the number of runs rather than a fixed 5.
oof_votes = pd.DataFrame(X_preds_folds).sum(axis=0)
X_preds = (oof_votes >= len(X_preds_folds) / 2.0) * 1
f1 = f1_score(y, X_preds)
print("F1 Score: {:.5f}".format(f1))

In [None]:
# Final test labels: majority vote over the per-run test predictions (one
# DataFrame row per run). The column-wise sum replaces a Python-level sum
# applied row by row, and the threshold is half the number of runs rather than
# a fixed 5.
test_votes = pd.DataFrame(preds_folds).sum(axis=0)
preds = (test_votes >= len(preds_folds) / 2.0) * 1

In [None]:
# Store the voted labels in the submission template and write it out.
# Item assignment always creates or overwrites the column; attribute assignment
# only works when the column already exists. Values are assigned by position,
# so a row-count mismatch raises instead of being silently aligned.
# NOTE(review): assumes sample_submission.csv lists the same rows, in the same
# order, as test.csv - confirm.
submission['is_promoted'] = np.asarray(preds)
submission.to_csv('submission.csv', index=False)