In [1]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
from scipy.stats import rankdata
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import KNNImputer
from sklearn.linear_model import HuberRegressor, LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Read csv
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [3]:
def preprocessing(df_train, df_test):
    data = pd.concat([df_train, df_test])
    
    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']
    # data['loading'] = np.log(data['loading'])

    # Select the variables used to generate null value of measurement_17 for each product code
    # e.g. for product 'A', NaN values in measurement_17 is filled by a linear combination of (measurement_5, measurement_6, measurement_8)
    full_fill_dict = {
        'A': ['measurement_5','measurement_6','measurement_8'],
        'B': ['measurement_4','measurement_5','measurement_7'],
        'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
        'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
        'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
        'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
        'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
        'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
        'I': ['measurement_3','measurement_7','measurement_8']
    }


    features = [f for f in test.columns if 'measurement' in f] + ['loading']
    for code in data['product_code'].unique(): # ('A', 'B', ..., 'I')
        tmp = data[data['product_code'] == code]
        column = full_fill_dict[code]
        
        # the data used to train the HuberRegressor, so all variables & `measurement_17` should exist <=> drop all NaN values
        tmp_train = tmp[column + ['measurement_17']].dropna(how='any')
        
        # tmp_test = tmp[(all variables in `full_fill_dict[code]` are not NaN) & (the `measurement_17` is NaN in the row of data)]
        tmp_test = tmp[(tmp[column].isnull().sum(axis=1) == 0) & (tmp['measurement_17'].isnull())]
        print(f"code {code} has {len(tmp_test)} samples to fill nan")
        
        
        model = HuberRegressor()
        model.fit(tmp_train[column], tmp_train['measurement_17'])
        # Fill the missing `measurement_17` values by the trained model
        data.loc[(data['product_code'] == code) & (data[column].isnull().sum(axis=1) == 0) & (data['measurement_17'].isnull()), 'measurement_17'] = model.predict(tmp_test[column])
        
        # Other missing values are filled by KNNImputer
        model2 = KNNImputer(n_neighbors=5)
        print(f"KNN imputing code {code}")
        data.loc[(data['product_code'] == code), features] = model2.fit_transform(data.loc[(data['product_code'] == code), features])
    
    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)
    
    # [:df_train.shape[0], :] = rows[0:df_train.shape[0]] = [0, df_train.shape[0])
    # [df_train.shape[0]:, :] = rows[df_train.shape[0]:len(rows)+1] = [df_train.shape[0], len(rows)]
    df_train = data.iloc[:df_train.shape[0], :]
    df_test = data.iloc[df_train.shape[0]:, :]
    
    # woe_encoder = WoEEncoder(variables=['attribute_0'])
    # woe_encoder.fit(df_train, df_train['failure'])
    # df_train = woe_encoder.transform(df_train)
    # df_test = woe_encoder.transform(df_test)
    encoder = LabelEncoder()
    df_train['attribute_0'] = encoder.fit_transform(df_train['attribute_0'])
    df_test['attribute_0'] = encoder.fit_transform(df_test['attribute_0'])
    
    return df_train, df_test

df_train, df_test = preprocessing(train, test)

code A has 386 samples to fill nan
KNN imputing code A
code B has 418 samples to fill nan
KNN imputing code B
code C has 391 samples to fill nan
KNN imputing code C
code D has 398 samples to fill nan
KNN imputing code D
code E has 429 samples to fill nan
KNN imputing code E
code F has 420 samples to fill nan
KNN imputing code F
code G has 373 samples to fill nan
KNN imputing code G
code H has 361 samples to fill nan
KNN imputing code H
code I has 377 samples to fill nan
KNN imputing code I


In [4]:
def test_scale(test_data, feats):
    scaler = StandardScaler()
    
    scaled_test = scaler.fit_transform(test_data[feats])
    
    #back to dataframe
    new_test = test_data.copy()
    
    new_test[feats] = scaled_test
    
    assert len(test_data) == len(new_test)
    
    return new_test

In [5]:
X = df_train.drop(['failure'], axis=1)
y = df_train.failure # y = df_train['failure']

In [6]:
# Not sure why `m5_missing` is not selected from the code above, but it has greater influence than other features
select_features = ['loading', 'measurement_17', 'm5_missing', 'attribute_0']

In [7]:
# kf = GroupKFold(n_splits=5)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
lr_oof_1 = np.zeros(len(train))
lr_oof_2 = np.zeros(len(train))
lr_test = np.zeros(len(test))
lr_auc = 0
lr_acc = 0
importance_list = []
features = select_features

filename = 'Model0.sav'
model = pickle.load(open(filename, 'rb'))
# for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y, groups=X['product_code'])):
# Cross-Validation for 4 Logistic Regression Model
for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f'########################## {fold_idx} ##########################')
    # iloc : indexed_location (by int index ('int'))
    # loc : location (by feature name ('object'))
    x_test = df_test.copy()
    
    # Use StandardScaler to standardize the data
    x_test = test_scale(x_test, features)
        
    lr_test += model.predict_proba(x_test[features])[:, -1] / 5

print(f'############# End of Classifier #############')
    
submission['lr0'] = lr_test
submission['rank0'] = rankdata(submission['lr0'])
submission['failure'] = submission['rank0']

########################## 0 ##########################
########################## 1 ##########################
########################## 2 ##########################
########################## 3 ##########################
########################## 4 ##########################
############# End of Classifier #############


In [8]:
submission[['id', 'failure']].to_csv('submission.csv', index=False)