In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline

In [2]:
def get_data_porto(perc_keep):
    # Import data and get rid of missings
    df = pd.read_csv('datasets/train.csv', na_values=[-1, ])
    # Get rid of columns with many missings > 10 %, ie mutiply N by 10/100
    criterion = len(df.index) / perc_keep
    mask = df.isnull().sum() < criterion
    df_clean = df.loc[:, mask]
    # Drop rows with missings (or use multiple imputation later)
    df_clean = df_clean.dropna()
    print('Final number of rows: {}'.format(len(df_clean.index)))
    print('Final number of columns: {}'.format(len(df_clean.columns)))
    return df_clean

In [3]:
df_clean = get_data_porto(10)

Final number of rows: 541860
Final number of columns: 56


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

def prepare_data(df, split_ratio):
    """Encodes, scales and splits current data."""
    
    # Extract data
    X = df_clean.drop(df_clean[['id', 'target']], axis=1)
    y = df_clean[['target']].values.ravel()
    
    # Split training/test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=split_ratio, random_state=42)
    
    # Get indices of particular columns
    categorical_features = [idx for idx, f in enumerate(X.columns) if 'cat' in f]
    binary_features = [idx for idx, f in enumerate(X.columns) if 'bin' in f]
    numerical_features = [idx for idx, f in enumerate(X.columns) if 'cat' not in f and 'bin' not in f]
    
    # Scale numeric data of training and test set
    sc = StandardScaler()
    X_train.iloc[:, numerical_features] = sc.fit_transform(X_train.iloc[:, numerical_features].values)
    X_test.iloc[:, numerical_features] = sc.transform(X_test.iloc[:, numerical_features].values)
    
    # One-hot encoding of categorical variables
    X_train = OneHotEncoder(categorical_features=categorical_features, sparse=False).fit_transform(X_train.values)
    X_test = OneHotEncoder(categorical_features=categorical_features, sparse=False).fit_transform(X_test.values)
    
    # Return in order of train_test_split
    return X_train, X_test, y_train, y_test
    
    

In [5]:
X_train, X_test, y_train, y_test = prepare_data(df_clean, 0.05)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
# Try balance cascade
from imblearn.ensemble import BalanceCascade
bc = BalanceCascade(random_state=42)
X_rs, y_rs = bc.fit_sample(X_train, y_train)

In [21]:
# Try informed undersampling
# Fails, since not enough memory for the algorithm
# from collections import Counter
# from imblearn.under_sampling import NearMiss 

# # nm = NearMiss(random_state=42, version=2, n_neighbors=5)
# # X_train_us, y_train_us = nm.fit_sample(X_train_s, y_train_s.toarray().ravel())

# # print('Resampled dataset shape {}'.format(Counter(y_train_us)))

In [22]:
# Try balanced bagging classifier
from imblearn.ensemble import BalancedBaggingClassifier 

In [52]:
classifiers = []
# Stack ten classifiers
for i in range(10):
    bbc = BalancedBaggingClassifier(verbose=True, n_jobs=-1, n_estimators=100, max_samples=0.5, max_features=int(np.sqrt(X_train.shape[1])), oob_score=True, bootstrap_features=True)
    bbc.fit(X_train, y_train) 
    classifiers.append(bbc)
    print(bbc.oob_score_)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   24.3s remaining:   24.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   27.0s finished


0.574422991373


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   21.7s remaining:   21.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   25.0s finished


0.573972302032


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   20.5s remaining:   20.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.2s finished


0.588180672032


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   20.9s remaining:   20.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.8s finished


0.577113529034


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   20.6s remaining:   20.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.3s finished


0.572198684065


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   20.6s remaining:   20.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.5s finished


0.575207812467


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   20.6s remaining:   20.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.3s finished


0.572082126477


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   20.6s remaining:   20.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.5s finished


0.587292891735


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   20.1s remaining:   20.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.1s finished


0.589429780852


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   21.1s remaining:   21.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.8s finished


0.576482175431


In [53]:
# get predictions
for classifier in classifiers:
    y_pred = classifier.predict(X_test)
    print('Confusion matrix :\n', confusion_matrix(y_test, y_pred))
    print('Precision ', precision_score(y_test, y_pred))
    print('Recall ', recall_score(y_test, y_pred))

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.8s remaining:    1.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.6s finished


Confusion matrix :
 [[15365 10771]
 [  429   528]]
Precision  0.0467297990973
Recall  0.551724137931


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.9s remaining:    1.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.7s finished


Confusion matrix :
 [[15205 10931]
 [  432   525]]
Precision  0.0458275139665
Recall  0.548589341693


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.8s remaining:    1.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.7s finished


Confusion matrix :
 [[15790 10346]
 [  426   531]]
Precision  0.0488186080721
Recall  0.554858934169


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.9s remaining:    1.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.8s finished


Confusion matrix :
 [[15253 10883]
 [  437   520]]
Precision  0.0456020345523
Recall  0.543364681296


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.0s remaining:    2.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.8s finished


Confusion matrix :
 [[15246 10890]
 [  423   534]]
Precision  0.046743697479
Recall  0.557993730408


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.9s remaining:    1.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.9s finished


Confusion matrix :
 [[15409 10727]
 [  443   514]]
Precision  0.0457254692643
Recall  0.537095088819


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.0s remaining:    2.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.9s finished


Confusion matrix :
 [[15171 10965]
 [  426   531]]
Precision  0.0461899791232
Recall  0.554858934169


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.9s remaining:    1.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.7s finished


Confusion matrix :
 [[15493 10643]
 [  433   524]]
Precision  0.0469239724187
Recall  0.547544409613


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.8s remaining:    1.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.7s finished


Confusion matrix :
 [[15700 10436]
 [  451   506]]
Precision  0.0462438311095
Recall  0.528735632184


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.9s remaining:    1.9s


Confusion matrix :
 [[15316 10820]
 [  438   519]]
Precision  0.045771232031
Recall  0.542319749216


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.7s finished


In [47]:
# test predictions
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

print('Confusion matrix :\n', confusion_matrix(y_test, y_pred))
print('Precision ', precision_score(y_test, y_pred))
print('Recall ', recall_score(y_test, y_pred))

Confusion matrix :
 [[15422 10714]
 [  421   536]]
Precision  0.0476444444444
Recall  0.560083594566


In [48]:
def gini(solution, submission):
    df = zip(solution, submission, range(len(solution)))
    df = sorted(df, key=lambda x: (x[1],-x[2]), reverse=True)
    rand = [float(i+1)/float(len(df)) for i in range(len(df))]
    totalPos = float(sum([x[0] for x in df]))
    cumPosFound = [df[0][0]]
    for i in range(1,len(df)):
        cumPosFound.append(cumPosFound[len(cumPosFound)-1] + df[i][0])
    Lorentz = [float(x)/totalPos for x in cumPosFound]
    Gini = [Lorentz[i]-rand[i] for i in range(len(df))]
    return sum(Gini)

def normalized_gini(solution, submission):
    normalized_gini = gini(solution, submission)/gini(solution, solution)
    return normalized_gini

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.4s finished


array([[304575, 191999],
       [  1367,  16826]], dtype=int64)