In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np
import time
import lightgbm as lgb

In [2]:
def verbalise_dataset(train, test):
    """Print the shapes of the train and test datasets, then a blank line.

    Args:
        train: Object exposing a ``shape`` attribute (e.g. a DataFrame).
        test: Object exposing a ``shape`` attribute (e.g. a DataFrame).
    """
    for prefix, dataset in (('Train shape:', train), ('Test shape:', test)):
        print(prefix + str(dataset.shape))
    # Trailing empty line visually separates successive reports.
    print()

In [3]:
def load_file(filepath):
    """Load a CSV file into a DataFrame and print how long the read took.

    Args:
        filepath: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        The ``pandas.DataFrame`` parsed from the file.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or turn negative) when the wall clock is adjusted during the read.
    start_time = time.perf_counter()
    # low_memory=False makes pandas infer dtypes from the whole file instead
    # of chunk by chunk, avoiding mixed-type columns.
    df = pd.read_csv(filepath, low_memory=False)
    elapsed_time = time.perf_counter() - start_time
    print("Dataset loaded, time elapsed: " + str(elapsed_time))

    return df

# Load both competition CSVs in order (train first, then test).
# Expected shapes: train (76020, 371), test (75818, 370).
train, test = [
    load_file(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
]
verbalise_dataset(train=train, test=test)

Dataset loaded, time elapsed: 27.38778281211853
Dataset loaded, time elapsed: 27.817090034484863
Train shape:(76020, 371)
Test shape:(75818, 370)



In [4]:
def remove_duplicate_col(train, test):
    """Drop columns of ``train`` whose values duplicate an earlier column.

    Columns are compared pairwise with ``np.array_equal`` on their values; the
    first occurrence is kept and every later identical column is removed.
    The same column names are then removed from ``test`` so both frames keep
    matching features. Duplicates are detected on ``train`` only.

    Note: with ``np.array_equal``'s default settings NaN never equals NaN, so
    columns containing NaN are never reported as duplicates.

    Args:
        train: DataFrame used to detect duplicated columns.
        test: DataFrame from which the same columns are removed when present.

    Returns:
        Tuple ``(train, test)`` of new DataFrames without the duplicated
        columns; the input frames are not modified.
    """
    print('Removing duplicated features')
    columns = train.columns  # list of headers
    output = []      # duplicated column names, in discovery order
    flagged = set()  # same names, for O(1) membership checks

    for i in range(len(columns) - 1):
        # A column already known to be a duplicate cannot reveal anything new:
        # whatever equals it was matched against the column it duplicates.
        if columns[i] in flagged:
            continue
        # Hoisted out of the inner loop: the reference values do not change.
        reference = train[columns[i]].values
        for j in range(i + 1, len(columns)):
            # Cheap membership test first, expensive array comparison second.
            if columns[j] in flagged:
                continue
            if np.array_equal(reference, train[columns[j]].values):
                output.append(columns[j])
                flagged.add(columns[j])

    train = train.drop(output, axis=1)
    # A duplicated train column may be absent from test (e.g. a label-only
    # column); ignore such names instead of raising KeyError.
    test = test.drop(output, axis=1, errors='ignore')

    return train, test


# Strip duplicated feature columns from both frames, then report the new shapes.
clean_train, clean_test = remove_duplicate_col(train=train, test=test)
verbalise_dataset(train=clean_train, test=clean_test)

Removing duplicated features
Train shape:(76020, 309)
Test shape:(75818, 308)



In [5]:
def remove_constant_col(train, test):
    """Drop columns that hold a single constant value in ``train``.

    A column is treated as constant when it has at least two non-null
    observations and exactly one distinct non-null value. Counting distinct
    values avoids comparing a floating-point standard deviation with zero
    (rounding error can make the std of a constant float column slightly
    non-zero) and also works for non-numeric columns. Columns with fewer than
    two non-null values are kept.

    Constant columns are detected on ``train`` only; the same names are then
    removed from ``test``.

    Args:
        train: DataFrame used to detect constant columns.
        test: DataFrame from which the same columns are removed when present.

    Returns:
        Tuple ``(train, test)`` of new DataFrames without the constant
        columns; the input frames are not modified.
    """
    print('Removing constant features')
    columns = []
    for col in train.columns:
        series = train[col]
        # count() > 1: spread is undefined for fewer than two observations,
        # so such columns are left alone.
        if series.count() > 1 and series.nunique(dropna=True) == 1:
            columns.append(col)

    train = train.drop(columns, axis=1)
    # A constant train column may be absent from test; ignore such names
    # instead of raising KeyError.
    test = test.drop(columns, axis=1, errors='ignore')

    return train, test

# Strip constant feature columns from the de-duplicated frames, then report the new shapes.
clean_train, clean_test = remove_constant_col(train=clean_train, test=clean_test)
verbalise_dataset(train=clean_train, test=clean_test)

Removing constant features
Train shape:(76020, 308)
Test shape:(75818, 307)



In [6]:
# Separate features from the label, then hold out 20% of the labelled rows
# for local validation (fixed seed so the split is repeatable).
non_feature_cols = ["TARGET", "ID"]
X = clean_train.drop(non_feature_cols, axis=1)
Y = clean_train['TARGET'].values

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=1632,
)
print(X_train.shape, X_test.shape)

(60816, 306) (15204, 306)


In [7]:
# Fit a LightGBM binary classifier on the training split; AUC is the tracked metric.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
)
d_train = lgb.Dataset(data=X_train, label=Y_train)
clf = lgb.train(params=params, train_set=d_train)

In [8]:
# Score the held-out split and report ROC AUC of the predicted probabilities.
Y_pred = clf.predict(X_test)
validation_auc = roc_auc_score(Y_test, Y_pred)
print("Score: " + str(validation_auc))

Score: 0.8379656444267355
