In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np
import time
import lightgbm as lgb

In [3]:
def verbalise_dataset(train, test):
    print('Train shape:' + str(train.shape))
    print('Test shape:' + str(test.shape))
    print()

In [4]:
def load_file(filepath):

    start_time = time.time()
    df = pd.read_csv(filepath, low_memory=False)
    elapsed_time = time.time() - start_time
    print("Dataset loaded, time elapsed: " + str(elapsed_time))

    return df

train = load_file('../data/train.csv')  # (76020, 371)
test = load_file('../data/test.csv')  # (75818, 370)
verbalise_dataset(train, test)

Dataset loaded, time elapsed: 27.967193603515625
Dataset loaded, time elapsed: 26.628715991973877
Train shape:(76020, 371)
Test shape:(75818, 370)



In [5]:
def remove_duplicate_col(train, test):

    print('Removing duplicated features')
    output = []
    columns = train.columns  # list of headers
    for i in range(len(columns)-1):
        for j in range(i+1,len(columns)):
            if np.array_equal(train[columns[i]].values, train[columns[j]].values) and columns[j] not in output:
                    output.append(columns[j])
    
    train = train.drop(output, axis=1)
    test = test.drop(output, axis=1)

    return train, test


clean_train, clean_test = remove_duplicate_col(train, test)
verbalise_dataset(clean_train, clean_test)

Removing duplicated features
Train shape:(76020, 309)
Test shape:(75818, 308)



In [6]:
def remove_constant_col(train, test):

    print('Removing constant features')
    columns = []
    for col in train.columns:
        if train[col].std() == 0:
            columns.append(col)

    train = train.drop(columns, axis=1)
    test = test.drop(columns, axis=1)

    return train, test

clean_train, clean_test = remove_constant_col(clean_train, clean_test)
verbalise_dataset(clean_train, clean_test)

Removing constant features
Train shape:(76020, 308)
Test shape:(75818, 307)



In [13]:
# split data into train and test
X = clean_train.drop(["TARGET","ID"],axis=1)
Y = clean_train['TARGET'].values

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

test_id = test.ID
test = test.drop(["ID"],axis=1)
target = np.zeros(test.shape[0])

total_score = 0.0
for train_index, test_index in skf.split(X, Y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    plt.pie(Y_test)
    
    d_train = lgb.Dataset(X_train, label=Y_train)   # (60816, 306) (15204, 306)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
    }
    clf = lgb.train(train_set=d_train, params=params)
    
    Y_pred=clf.predict(X_test)
    score = roc_auc_score(Y_test, Y_pred)
    total_score += score
    print("Score: " + str(score))
    
    probs = clf.predict(test)
    target += probs / n_splits

average_score = total_score / n_splits
print("Average training score: " + str(average_score))

submission = pd.DataFrame({"ID":test_id, "TARGET": target})
submission.to_csv("submission.csv", index=False)

AttributeError: 'DataFrame' object has no attribute 'ID'