In [1]:
import os
import gc
import time
import numpy as np
import pandas as pd
from contextlib import contextmanager
import multiprocessing as mp
from functools import partial
from scipy.stats import kurtosis, iqr, skew
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(name, time.time() - t0))

# -------------------------------------------Random Forest---------------------------------

In [8]:

def random_forest(data):
    
    df = data[data['TARGET'].notnull()]
    y = df["TARGET"].copy()
    test_df = data[data['TARGET'].isnull()]
    print("Train/valid shape: {}, test shape: {}".format(df.shape, test_df.shape))
    
    del_features = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index', 'level_0']
    predictors = list(filter(lambda v: v not in del_features, df.columns))
    
    train=df[predictors].values
    test=test_df[predictors].values
    
    """
    train = train.fillna(train.mean(), inplace=True)
    test= test.fillna(test.mean(), inplace=True)
    
    train = (train - train.min())/(train.max() - train.min())
    test = (test - test.min())/(test.max() - test.min())"""
   
    # Make the random forest classifier
    random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
    
    # Train on the training data
    random_forest.fit(train, y)
    
    # Extract feature importances
    feature_importance_values = random_forest.feature_importances_
    feature_importances = pd.DataFrame({'feature': predictors, 'importance': feature_importance_values})
    
    # Make predictions on the test data
    predictions = random_forest.predict_proba(test)[:, 1]
    
    # Make a submission dataframe
    submit = test_df[['SK_ID_CURR']]
    submit['TARGET'] = predictions

    # Save the submission dataframe
    submit.to_csv('RandomForestResult.csv', index = False)
    
    return feature_importances


# ------------------------------- MAIN-----------------------------------

In [4]:
def main():
    # choose input data
    data= pd.read_csv('../input/clean_data.csv', nrows= None)
    
    with timer("Run Random Forest"):
        feat_importance = random_forest(data)
        print(feat_importance)

# -----------------------run main------------------------

In [9]:
if __name__ == "__main__":
    pd.set_option('display.max_rows', 60)
    pd.set_option('display.max_columns', 100)
    with timer("Pipeline total time"):
        main()

Train/valid shape: (307506, 659), test shape: (48744, 659)


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').