In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import xgboost as xgb

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,accuracy_score

In [None]:
# Load the training and test tables from CSV files under input/
# (paths are relative to the notebook's working directory).
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [None]:
# Let pandas show up to 100 columns and 100 rows before eliding output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# Preview the first rows of the training data.
train.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame as loaded.
train.info()

In [None]:
# Transposed preview: one row per column, showing the first three records.
train.head(3).T

In [None]:
# Names of every training column whose name contains the '_cat' marker.
cat_col = list(filter(lambda name: '_cat' in name, train.columns))
print(cat_col)

In [None]:
# Value counts of the ps_ind_01 feature in the training set.
# The series is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional series is no longer interpreted as the x variable.
plt.figure(figsize=(10,6))
sns.countplot(x=train['ps_ind_01'], palette='rainbow')

In [None]:
# Side-by-side value counts of ps_ind_01 (left) and ps_ind_02_cat (right).
# Explicit axes replace the implicit plt.subplot state machine, and each series
# is passed as `x=` because recent seaborn releases accept only `data` positionally.
fig, axes = plt.subplots(1, 2, figsize=(10, 6))
sns.countplot(x=train['ps_ind_01'], palette='rainbow', ax=axes[0])
sns.countplot(x=train['ps_ind_02_cat'], palette='rainbow', ax=axes[1])

In [None]:
# Convert low-cardinality int/float variables into categorical data.
# A column qualifies when it has at most 30 distinct values in the training set;
# the label column 'target' is left numeric.
# Train and test are cast to ONE shared categorical dtype built from the values
# seen in both frames, so the two frames always agree on the category set
# (and therefore on the dummy columns produced from them later).
col = train.columns
for name in col:
    if name == 'target' or train[name].nunique() > 30:
        continue
    shared_dtype = (
        pd.concat([train[name], test[name]], ignore_index=True)
        .astype('category')
        .dtype
    )
    train[name] = train[name].astype(shared_dtype)
    test[name] = test[name].astype(shared_dtype)

In [None]:
# Re-inspect dtypes now that low-cardinality columns have been cast to category.
train.info()

In [None]:
# Number of rows per value of the target column.
train.target.value_counts()

In [None]:
# Same breakdown expressed as a fraction of all rows (shows the class balance).
train.target.value_counts(normalize=True)

In [None]:
# Bar chart of the target distribution.
# Passed as `x=` because recent seaborn releases accept only `data` positionally.
sns.countplot(x=train.target, palette="Set2")

In [None]:
sns.set(style="white")


# Compute the Spearman correlation matrix over the numeric columns only.
# The columns cast to 'category' earlier are excluded explicitly instead of
# relying on DataFrame.corr to drop non-numeric columns on its own, which
# depends on the installed pandas version.
corr = train.select_dtypes(include=['number', 'bool']).corr(method='spearman')

In [None]:
# Canvas for the correlation heatmap.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging palette so negative and positive correlations get distinct hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the full correlation matrix (no mask is applied) on the axes created
# above, with square cells and a colour scale centred on 0 and capped at 0.3.
sns.heatmap(
    corr,
    ax=ax,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

plt.show()

In [None]:
def OHE(df):
    """One-hot encode the multi-level categorical columns of ``df``.

    Every column of dtype ``category`` with more than two distinct observed
    values is replaced by dummy columns named ``ohe_<column>_<level>``. The
    first level of each encoded column is dropped and an extra indicator
    column for missing values is added. Categorical columns with at most two
    distinct values, and all non-categorical columns, are passed through.

    Prints the number of categorical columns found and the shape of the
    result, and returns the encoded frame; ``df`` itself is not modified.
    """
    categorical = df.select_dtypes(include=['category']).columns
    print('Categorical feature',len(categorical))

    # Only columns with three or more observed levels are expanded.
    multi_level = [name for name in categorical if df[name].nunique() > 2]
    prefixes = {name: 'ohe_' + name for name in multi_level}

    encoded = pd.get_dummies(
        df,
        prefix=prefixes,
        columns=multi_level,
        drop_first=True,
        dummy_na=True,
    )

    print(encoded.shape)
    return encoded

In [None]:
# One-hot encode both frames.
train2 = OHE(train)
test2 = OHE(test)

# The two frames are encoded independently, so nothing guarantees that they end
# up with the same dummy columns. Align the test frame to the training feature
# columns (same names, same order): columns missing from test are added as
# all-zero, columns unknown to train are dropped. Any mismatch is reported.
feature_cols = [name for name in train2.columns if name != 'target']
missing_in_test = [name for name in feature_cols if name not in test2.columns]
extra_in_test = [name for name in test2.columns if name not in feature_cols]
if missing_in_test or extra_in_test:
    print('Column mismatch after encoding; missing in test:', missing_in_test,
          'extra in test:', extra_in_test)
test2 = test2.reindex(columns=feature_cols, fill_value=0)

In [None]:
def missing_value(df):
    """Fill missing values of ``df`` in place with each column's most frequent value.

    Columns without missing values are left untouched. A column made up
    entirely of missing values has no mode and is left unchanged instead of
    raising. Returns ``None``; the frame passed in is modified.
    """
    for name in df.columns:
        column = df[name]
        if not column.isnull().any():
            continue
        modes = column.mode()
        if modes.empty:
            # Nothing but missing values: there is no value to impute with.
            continue
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on df[name] mutates a temporary object, which
        # does not update the frame when pandas copy-on-write is active.
        df[name] = column.fillna(modes.iloc[0])


            

In [None]:
# Impute the encoded frames in place. These are the frames the modelling cells
# below read (train2 / test2); no train3 / test3 is defined anywhere in the
# notebook, so the previous names raised NameError on a fresh kernel.
missing_value(train2)
missing_value(test2)

In [None]:
# Inputs for the logistic-regression cross-validation below:
# features are the encoded training frame without the label and the row id.
X = train2.drop(['target','id'],axis=1)
y = train2['target']
# Test features: the encoded test frame without the row id.
x_test = test2.drop('id',axis=1)


In [None]:
# Stratified k-fold cross-validation of a class-weighted logistic regression.
# Each fold's model is scored by ROC AUC on its held-out part and also predicts
# the test set; the test predictions are summed here and divided by k later.
k = 3
# n_splits comes from k so the fold count and the later averaging cannot diverge.
kf = StratifiedKFold(n_splits=k, random_state=42, shuffle=True)
pred_test_full = 0
cv_score = []
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # split() yields positional indices, so select rows by position (.iloc);
    # label-based .loc is only equivalent while the index is a default RangeIndex.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    lr = LogisticRegression(class_weight='balanced', verbose=1)
    lr.fit(xtr, ytr)
    # Probability of the positive class on the held-out fold.
    pred_test = lr.predict_proba(xvl)[:, 1]
    cv_score.append(roc_auc_score(yvl, pred_test))
    pred_test_full += lr.predict_proba(x_test)[:, 1]

In [None]:
# Per-fold ROC AUC scores collected in the CV loop, followed by their mean.
print('Cv',cv_score,'\nMean cv Score',np.mean(cv_score))

In [None]:
# Average the test probabilities summed over the k folds.
y_pred = pred_test_full/k
# Submission table: test ids paired with the averaged probability.
submit = pd.DataFrame({'id':test['id'],'target':y_pred})
submit.to_csv('lr_porto.csv',index=False)
submit.head()

In [None]:
# Splitter for the XGBoost cross-validation.
kfold = 5
# random_state only has an effect when shuffling is enabled, and current
# scikit-learn raises ValueError if it is set while shuffle is False.
# shuffle=True also matches the splitter used for the logistic regression.
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)

In [None]:
# Booster hyper-parameters handed to xgb.train.
# The number of boosting rounds is deliberately not listed: it is not a booster
# parameter but an argument of xgb.train itself (passed at the call site), so a
# 'num_boost_round' key here would have no effect on training.
params = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 7,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    }

In [None]:
# Inputs for the XGBoost cross-validation, as NumPy arrays.
# NOTE: this rebinds X and y, which earlier held a DataFrame / Series; cells
# that expect the pandas objects (the logistic-regression CV loop) must not be
# re-run after this one without re-creating them first.
X = train2.drop(['id', 'target'], axis=1).values
y = train2.target.values
# Test features stay a DataFrame here; .values is taken when the DMatrix is built.
X_test = test2.drop('id', axis=1)
# Submission frame: test ids plus a target column that accumulates fold predictions.
sub=test2['id'].to_frame()
sub['target']=0

In [None]:
# Stratified k-fold training of XGBoost; test predictions are averaged over folds.

# The round count is an argument of xgb.train (1600 below), not a booster
# parameter, so drop a stray 'num_boost_round' key if one is present.
xgb_params = {name: value for name, value in params.items() if name != 'num_boost_round'}
# Early stopping is run with maximize=True, so the monitored metric must be one
# where higher is better. Without an explicit metric the objective's default is
# used, and maximising a loss/error inverts the stopping rule.
xgb_params.setdefault('eval_metric', 'auc')

# The test matrix is identical for every fold, so build it once.
d_test = xgb.DMatrix(X_test.values)

# Start from a clean accumulator so that re-running this cell does not stack
# predictions on top of a previous run.
sub['target'] = 0.0

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    # Early stopping watches the last entry of this list (the validation fold).
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train with at most 1,600 rounds, stopping early once the validation
    # metric has not improved for 70 rounds (maximize=True: higher is better).
    mdl = xgb.train(xgb_params, d_train, 1600, watchlist, early_stopping_rounds=70, maximize=True, verbose_eval=100)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on the test data with the trees up to the best iteration.
    # Older xgboost exposes best_ntree_limit / ntree_limit; newer releases
    # removed both in favour of best_iteration / iteration_range.
    best_ntree_limit = getattr(mdl, 'best_ntree_limit', None)
    if best_ntree_limit is not None:
        p_test = mdl.predict(d_test, ntree_limit=best_ntree_limit)
    else:
        p_test = mdl.predict(d_test, iteration_range=(0, mdl.best_iteration + 1))
    sub['target'] += p_test

sub['target'] = sub['target'] / kfold


In [None]:
# Save the fold-averaged XGBoost test predictions as a CSV without the index column.
sub.to_csv('StratifiedKFold.csv', index=False)