In [None]:
from sklearn.feature_selection import SelectFromModel, RFE
from IPython.display import clear_output
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report,accuracy_score,f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,plot_importance
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler

import pandas as pd
import os
import numpy as np
from scipy import stats
import pandas_profiling as pp
import itertools
# One shared StandardScaler instance; the pre-processing loop re-fits it on every sample.
scaler = StandardScaler()

# Silence library warnings and allow up to 500 columns when a frame is displayed.
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

#%% ------path
# Root of the data set; '' keeps every path relative to the working directory.
data_path = ''

# Raw inputs: the label table and one .npy file per sample.
file = os.path.join(data_path, 'train_kaggle.csv')
data_path_train = os.path.join(data_path, 'train/train')
data_path_test = os.path.join(data_path, 'test/test')

# Files written by the pre-processing cells.
pro_file = os.path.join(data_path, 'train_kaggle_pro.csv')
test_pro_file = os.path.join(data_path, 'test_kaggle_pro.csv')

# Label table; its length decides how many training samples get loaded.
df = pd.read_csv(file)

# Load Data

In [None]:
# Read every training sample (<index>.npy) into memory, one per row of the label table.
data_raw = []

s = len(df)
for sample_idx in range(s):
    print("Loading in progess... %.1f%%" % (sample_idx/s * 100.0))
    sample_path = os.path.join(data_path_train, str(sample_idx) + '.npy')
    data_raw.append(np.load(sample_path))
    # wipe the progress line so the cell output does not grow with every sample
    clear_output()
print(len(data_raw), " data loaded.")

# Pre processing

In [None]:
# Pre processing of Training data
#
# Each sample becomes one CSV row: the 39 columns kept after dropping column 2,
# standardised and padded/truncated to f_n rows, flattened column by column,
# followed by the summary statistics gathered below and finally the label.

# padding each data to length of 336, because the longest data length for 1 and 0 are both 336
f_n = 336
# calculate min/max/var value for below features
mm = [1,3,5,7,8,9,11,12,13,14,15,17,18,19,20,21,22,23,24,26,27,28,29,30,31,32,35,36,37,38,39]

s = len(data_raw)
# `with` closes (and flushes) the output file even when a sample fails mid-loop
with open(pro_file, 'w', encoding='utf-8') as doc:
    # header: one numeric name per feature column, then the label column
    string = ",".join(map(str, range(1, 39*f_n+2+len(mm)*3))) + ',label\n'
    doc.write(string)

    for i in range(s):
        print("Processing in progess... %.1f%%" % (i/s * 100.0))
        label = df.iloc[i]['label']
        feature = data_raw[i]

        # NaN-aware min / max / variance of every selected raw column ...
        m = []
        for j in mm:
            column = feature.T[j]
            m.extend((np.nanmin(column), np.nanmax(column), np.nanvar(column)))
        # ... plus the max of column 2, which is removed from the sequence right after
        m.append(np.nanmax(feature.T[2]))
        m = np.asarray(m, dtype=np.float64)
        feature = np.delete(feature, 2, axis=1)

        # normalize data (the scaler is re-fit on this sample alone)
        feature = scaler.fit_transform(feature)
        # padding
        if len(feature) < f_n:
            feature = np.vstack([feature, np.full((f_n - len(feature), 39), np.nan)])
        f_d = pd.DataFrame(feature)
        if len(f_d) > f_n:
            f_d = f_d[:f_n]
        # linear interpolate
        f_d.interpolate(method='linear', limit_direction='forward', axis=0, inplace=True)

        # one list per column of the sample (rows of the transposed frame), then the statistics
        f_d = f_d.T
        l = [list(k) for _, k in f_d.iterrows()]
        l.append(list(m))
        merged = list(itertools.chain.from_iterable(l))
        string = ",".join(map(str, merged)) + "," + str(label) + '\n'
        doc.write(string)
        clear_output()

data = pd.read_csv(pro_file)
data.head(10)

In [None]:
# Pre processing of Test data
#
# Mirrors the training pre-processing (same statistics, same scaler, same
# padding and interpolation) but writes no label column.

f_n = 336
mm = [1,3,5,7,8,9,11,12,13,14,15,17,18,19,20,21,22,23,24,26,27,28,29,30,31,32,35,36,37,38,39]

# `with` closes (and flushes) the output file even when a sample fails mid-loop
with open(test_pro_file, 'w', encoding='utf-8') as doc:
    string = ",".join(map(str, range(1, 39*f_n+2+len(mm)*3))) + '\n'
    doc.write(string)

    for i in range(0,10000):
        print("Processing in progess... %.1f%%" % (i/10000 * 100.0))
        feature = np.load(os.path.join(data_path_test,str(i)+'.npy'))

        # NaN-aware min / max / variance of every selected raw column ...
        m = []
        for j in mm:
            column = feature.T[j]
            m.extend((np.nanmin(column), np.nanmax(column), np.nanvar(column)))
        # ... plus the max of column 2, which is removed from the sequence right after
        m.append(np.nanmax(feature.T[2]))
        m = np.asarray(m, dtype=np.float64)
        feature = np.delete(feature, 2, axis=1)

        # Use the same StandardScaler as the training cell: `min_max_scaler` was never
        # defined, and test rows must be scaled the same way as the rows the model saw.
        feature = scaler.fit_transform(feature)
        # padding
        if len(feature) < f_n:
            feature = np.vstack([feature, np.full((f_n - len(feature), 39), np.nan)])
        f_d = pd.DataFrame(feature)
        if len(f_d) > f_n:
            f_d = f_d[:f_n]
        # linear interpolate
        f_d.interpolate(method='linear', limit_direction='forward', axis=0, inplace=True)

        f_d = f_d.T
        l = [list(k) for _, k in f_d.iterrows()]
        l.append(list(m))
        merged = list(itertools.chain.from_iterable(l))
        # every row is written with a leading newline, so the file has no trailing newline
        string = '\n' + ",".join(map(str, merged))
        doc.write(string)
        clear_output()

test_data = pd.read_csv(test_pro_file)
test_data.head(10)

# Feature Selection 

In [None]:
# Select at most 3000 features for training
# (max_features is an upper bound; the selector's importance threshold still applies).

# Build the design matrix here so the cell works on a fresh kernel: X and y used to
# be defined only in a later cell. NaNs are filled with -1, as in the other cells.
y = data[['label']]
X = data.drop(columns=['label']).fillna(-1)

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=3000)
# ravel() hands the estimator a 1-D target instead of a one-column frame
embeded_rf_selector.fit(X, y.values.ravel())

# boolean mask over X's columns -> names of the retained columns
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()

# Split data into training set and testing set

In [None]:
# Target stays a one-column frame; predictors are narrowed to the selected
# features and their NaNs replaced by -1.
y = data[['label']]
X = data.drop(columns=['label'])[embeded_rf_feature].fillna(-1)

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Over/Under sample data functions

In [None]:
def oversample_data(ratio = 0.5):
    """Reload the processed training CSV, split it 70/30 and SMOTE-oversample the training part.

    Parameters
    ----------
    ratio : float, default 0.5
        Passed to ``SMOTE`` as ``sampling_strategy``.

    Returns
    -------
    tuple
        ``(X_train_oversample, x_test, y_train_oversample, y_test)``. Both
        resampled training objects are DataFrames; the hold-out pair is not
        resampled. The split is unseeded, so it differs between calls.
    """
    data = pd.read_csv("train_kaggle_pro.csv")

    train, test = train_test_split(data, test_size=0.3)
    y_test = test[['label']]
    x_test = test.drop(columns=['label']).fillna(-1)

    y_train = train[['label']]
    x_train = train.drop(columns=['label']).fillna(-1)

    X_train_oversample, y_train_oversample = SMOTE(sampling_strategy=ratio).fit_resample(x_train, y_train)

    # Flatten the labels so the report works whether the sampler returns an
    # ndarray or a pandas object (a DataFrame has a 'label' column, not column 0).
    labels = np.ravel(y_train_oversample)
    total = len(labels)
    print("Percentage of 0: ", np.count_nonzero(labels == 0)/total)
    print("Percentage of 1: ", np.count_nonzero(labels == 1)/total)
    print("Total number of resampled data: ", total)

    X_train_oversample = pd.DataFrame(X_train_oversample)
    # Name the columns after the frame that was resampled; the previous code read the
    # notebook-level X_test, which holds only the selected feature subset.
    X_train_oversample.columns = x_train.columns
    return X_train_oversample, x_test, pd.DataFrame(y_train_oversample), y_test

def oversample_resampled_data(X, y, ratio = 0.5):
    """SMOTE-oversample an already prepared feature/label pair and print the class balance.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Its NaNs are replaced by -1 *in place*, so the caller's
        frame is modified.
    y : array-like
        Labels aligned with ``X``.
    ratio : float, default 0.5
        Passed to ``SMOTE`` as ``sampling_strategy``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` exactly as returned by ``fit_resample``.
    """
    X.fillna(-1, inplace=True)
    X_resampled, y_resampled = SMOTE(sampling_strategy=ratio).fit_resample(X, y)

    # Flatten the labels so the report works whether the sampler returns an
    # ndarray or a pandas object (a DataFrame has a 'label' column, not column 0).
    labels = np.ravel(y_resampled)
    total = len(labels)
    print("Percentage of 0: ", np.count_nonzero(labels == 0)/total)
    print("Percentage of 1: ", np.count_nonzero(labels == 1)/total)
    print("Total number of resampled data: ", total)
    return X_resampled, y_resampled

def resampled_data():
    """Randomly under-sample the notebook-level ``X`` / ``y``.

    Uses ``RandomUnderSampler`` with ``sampling_strategy=0.2`` and a fixed
    seed of 42.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.
    """
    sampler = RandomUnderSampler(sampling_strategy=0.2, random_state=42)
    X_under, y_under = sampler.fit_resample(X, y)
    return X_under, y_under

def umdersample_data():
    """Under-sample the notebook-level data, print the class balance and split it 70/30.

    Returns
    -------
    tuple
        ``(X_train_undersample, X_test_undersample, y_train_undersample,
        y_test_undersample)`` from a split seeded with ``random_state=0``.
    """
    X_resampled, y_resampled = resampled_data()

    # Flatten the labels so the report works whether the sampler returns an
    # ndarray or a pandas object (a DataFrame has a 'label' column, not column 0).
    labels = np.ravel(y_resampled)
    total = len(labels)
    print("Percentage of 0: ", np.count_nonzero(labels == 0)/total)
    print("Percentage of 1: ", np.count_nonzero(labels == 1)/total)
    print("Total number of resampled data: ", total)

    X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
        X_resampled, y_resampled, test_size = 0.3, random_state = 0)
    return X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample

# Training Model - LightGBM

In [None]:
# Validation model: fit on the training split, early-stop on the hold-out split.
eval_set = [(X_test, y_test)]
# A regressor with objective='binary': its predict() output is used below as a
# score in [0, 1] and thresholded at 0.5 to get hard labels.
clf = lgb.LGBMRegressor(boosting_type="goss", 
                        num_iterations = 145, 
                        objective='binary',
                        metric='auc',
                        learning_rate=0.09,
                        num_threads=2,
                        is_unbalance=True)
clf.fit(X_train,y_train, eval_set=eval_set, eval_metric="auc", verbose=1, early_stopping_rounds=20)
y_pred_prob = clf.predict(X_test)
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Recall and accuracy need hard labels ...
recall_acc = recall_score(y_test,y_pred)
print("Recall: %.2f%%" % (recall_acc * 100.0))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# ... but ROC AUC is a ranking metric and must see the continuous scores;
# feeding it the thresholded 0/1 labels collapses the curve to a single point.
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(roc_auc)

In [None]:
# Grid over tree depth and number of estimators, scored by 5-fold ROC AUC.
# NOTE(review): clf was built with num_iterations, which LightGBM documents as an
# alias of n_estimators - confirm the n_estimators grid actually takes effect.
param_test1 = {
    'max_depth': list(range(4, 10)),
    'n_estimators': [200, 300, 400],
}
gsearch1 = GridSearchCV(
    estimator=clf,
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch1.fit(X_train, y_train)
gsearch1.best_params_

In [None]:
# Re-read the processed training file and rebuild the full (unsplit) matrices:
# selected features only, NaNs replaced by -1.
data = pd.read_csv("train_kaggle_pro.csv")

y = data[['label']]
X = data.drop(columns=['label'])[embeded_rf_feature].fillna(-1)
X.head()

# Prediction

In [None]:
#X_train_oversample, y_train_oversample = SMOTE(sampling_strategy=0.2).fit_resample(X, y) 
# Final model: fitted on every row of X / y, with 150 boosting iterations.
final_params = dict(
    boosting_type="goss",
    num_iterations=150,
    objective='binary',
    metric='auc',
    num_threads=2,
    learning_rate=0.09,
    is_unbalance=True,
)
clf_pre = lgb.LGBMRegressor(**final_params)
clf_pre.fit(X, y)

In [None]:
# Processed test rows with the same NaN fill (-1) and column subset as the training matrix.
test_data = pd.read_csv("test_kaggle_pro.csv").fillna(-1)[embeded_rf_feature]

In [None]:
# Score the test set with clf_pre, the model fitted on the full training data.
# The previous code predicted with clf, which saw only the 70% training split,
# so the full-data model was never used.
y_test_pred = clf_pre.predict(test_data)
df_y = pd.DataFrame(y_test_pred, columns=["label"])
# Ids are the 0-based row positions of the test file
idx = pd.Series(range(0, len(df_y)))
upload = pd.concat([idx, df_y], axis=1)

upload.columns=['Id','Predicted']
upload.to_csv('submiss.csv', index=False)