In [1]:
import os
import gc
import time
import numpy as np
import pandas as pd
from contextlib import contextmanager
import multiprocessing as mp
from functools import partial
from scipy.stats import kurtosis, iqr, skew
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(name, time.time() - t0))

# -------------------------------------------XGBoost---------------------------------

In [7]:
def xgboost(data):
    
    df = data[data['TARGET'].notnull()]
    y = df["TARGET"].copy()
    #y = df["TARGET"].values
    test_df = data[data['TARGET'].isnull()]
    
    print("Train/valid shape: {}, test shape: {}".format(df.shape, test_df.shape))
    
    del_features = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index', 'level_0']
    predictors = list(filter(lambda v: v not in del_features, df.columns))
    
    train=df[predictors].values
    test=test_df[predictors].values

    my_model = XGBClassifier(objective = 'binary:logistic',
              booster = 'gbtree',
              eval_metric = 'auc',
              nthread = 4,
              eta = 0.1, # 0.05
              max_depth = 8,
              min_child_weight = 60,
              gamma = 0,
              subsample = 0.7, # 0.8715623
              colsample_bytree = 0.7,
              colsample_bylevel = 0.632,
              reg_alpha = 0.041545473,
              reg_lambda = 0.0735294,
              nrounds = 2000,
              seed = 1337)
    # Add silent=True to avoid printing out updates with each cycle
    my_model.fit(train, y, verbose=False)
    
    """
    # plot feature importance
    plot_importance(model)
    pyplot.show()"""
    
    # make predictions
    predictions = my_model.predict_proba(test)[:, 1]

    # Submission dataframe
    submit = test_df[['SK_ID_CURR']]
    submit['TARGET'] = predictions

    # Save the submission to a csv file
    submit.to_csv('XGBoostResult.csv', index = False)
   


# ------------------------------- MAIN-----------------------------------

In [4]:
def main():
    # choose input data
    data= pd.read_csv('../input/clean_data.csv', nrows= None)
    
    with timer("Run Random Forest"):
        feat_importance = xgboost(data)
      

# -----------------------run main------------------------

In [8]:
if __name__ == "__main__":
    pd.set_option('display.max_rows', 60)
    pd.set_option('display.max_columns', 100)
    with timer("Pipeline total time"):
        main()

Train/valid shape: (307506, 659), test shape: (48744, 659)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Run Random Forest - done in 506s
Pipeline total time - done in 591s
