In [8]:
# !pip install feature-engine

In [9]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
from colorama import Back, Fore, Style
from feature_engine.encoding import WoEEncoder
from scipy.stats import rankdata
from sklearn.impute import KNNImputer
from sklearn.linear_model import (HuberRegressor, LinearRegression,
                                  LogisticRegression)
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

warnings.filterwarnings('ignore')

In [10]:
# Read the three input CSVs from the working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [11]:
def preprocessing(df_train, df_test):
    """Engineer features and impute missing values on train and test together.

    The two frames are stacked so the imputation models see every row, then
    split back at the original train length.

    Steps, in order:
      1. Add the missing-value indicators ``m3_missing`` and ``m5_missing``.
      2. Add ``area`` (``attribute_2 * attribute_3``) and replace ``loading``
         with its natural log.
      3. For each ``product_code``: fill missing ``measurement_17`` with a
         ``HuberRegressor`` fitted on that code's predictor columns, for rows
         where all of those predictors are present; then KNN-impute whatever
         is still missing in the measurement columns and ``loading``.
      4. Add ``measurement_avg``, the row mean of ``measurement_3`` through
         ``measurement_16``.
      5. Weight-of-evidence encode ``attribute_0``, fitted on the train rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain the ``failure`` target column.
    df_test : pandas.DataFrame
        Test rows. Its ``measurement*`` columns define which columns are
        KNN-imputed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The processed train and test frames, in their original row order.

    Raises
    ------
    KeyError
        If a ``product_code`` value has no entry in the predictor table.
    """
    # Remember the split point before the frames are stacked.
    n_train = df_train.shape[0]
    data = pd.concat([df_train, df_test])

    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']
    data['loading'] = np.log(data['loading'])

    # Per product code: the columns used as predictors for measurement_17.
    full_fill_dict = {
        'A': ['measurement_5','measurement_6','measurement_8'],
        'B': ['measurement_4','measurement_5','measurement_7'],
        'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
        'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
        'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
        'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
        'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
        'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
        'I': ['measurement_3','measurement_7','measurement_8']
    }

    # Derived from the df_test argument (not a notebook-level frame) so the
    # function depends only on what it is given.
    features = [f for f in df_test.columns if 'measurement' in f] + ['loading']
    for code in data.product_code.unique():
        column = full_fill_dict[code]
        code_mask = data.product_code == code

        # Rows of this code where measurement_17 is missing but every
        # predictor is present: these can be filled by regression.
        fill_mask = (
            code_mask
            & data[column].notnull().all(axis=1)
            & data['measurement_17'].isnull()
        )
        print(f"code {code} has {int(fill_mask.sum())} samples to fill nan")

        # Skip the regression when there is nothing to fill, so predict() is
        # never called on zero rows.
        if fill_mask.any():
            tmp_train = data.loc[code_mask, column + ['measurement_17']].dropna(how='any')
            model = HuberRegressor()
            model.fit(tmp_train[column], tmp_train['measurement_17'])
            data.loc[fill_mask, 'measurement_17'] = model.predict(data.loc[fill_mask, column])

        # Whatever is still missing for this code is filled from its nearest
        # neighbours across all measurement columns and loading.
        model2 = KNNImputer(n_neighbors=5)
        print(f"KNN imputing code {code}")
        data.loc[code_mask, features] = model2.fit_transform(data.loc[code_mask, features])

    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)

    # Split the stacked frame back into its train and test parts.
    train_part = data.iloc[:n_train, :]
    test_part = data.iloc[n_train:, :]

    # The encoder is fitted on train rows only, then applied to both parts.
    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(train_part, train_part['failure'])
    train_part = woe_encoder.transform(train_part)
    test_part = woe_encoder.transform(test_part)

    return train_part, test_part

# Run the preprocessing on the raw frames; it prints one progress line per
# product code for the regression fill and one for the KNN imputation.
df_train, df_test = preprocessing(train, test)

code A has 386 samples to fill nan
KNN imputing code A
code B has 418 samples to fill nan
KNN imputing code B
code C has 391 samples to fill nan
KNN imputing code C
code D has 398 samples to fill nan
KNN imputing code D
code E has 429 samples to fill nan
KNN imputing code E
code F has 420 samples to fill nan
KNN imputing code F
code G has 373 samples to fill nan
KNN imputing code G
code H has 361 samples to fill nan
KNN imputing code H
code I has 377 samples to fill nan
KNN imputing code I


In [12]:
def test_scale(test_data, feats):
    """Return a copy of ``test_data`` with the ``feats`` columns standardised.

    A fresh ``StandardScaler`` is fitted on ``test_data[feats]`` itself and
    applied to those same columns; all other columns are carried over
    unchanged. The input frame is not modified.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Frame holding at least the columns named in ``feats``.
    feats : list of str
        Columns to standardise.

    Returns
    -------
    pandas.DataFrame
        Copy of ``test_data`` with the scaled values written into ``feats``.
    """
    standardised = StandardScaler().fit_transform(test_data[feats])

    # Write the scaled values into a copy so the caller's frame stays intact.
    result = test_data.copy()
    result[feats] = standardised

    assert len(test_data) == len(result)

    return result

In [13]:
# Feature columns passed to each saved classifier, keyed by classifier index
# (the k in 'Model{k}.sav' used by the inference loop).
select_features = {
    0: ['loading', 'measurement_17', 'area', 'm5_missing', 'm3_missing', 'attribute_0'],
    1: ['loading', 'measurement_17', 'area', 'attribute_0'],
    2: ['loading', 'measurement_17', 'area', 'm3_missing', 'm5_missing'],
    3: ['loading', 'measurement_17', 'measurement_4']
}

In [14]:
# Score the test set with each of the four pickled classifiers and store the
# predicted probabilities as columns lr0..lr3 of `submission`.
x_test = df_test.copy()
for clf_k in range(4):
    filename = f'Model{clf_k}.sav'
    # NOTE: unpickling runs arbitrary code from the file; only load model
    # files that come from a trusted source.
    # The context manager closes the file handle once the model is loaded.
    with open(filename, 'rb') as model_file:
        current_model = pickle.load(model_file)
    print(f'############## Classifier {clf_k} #################')
    features = select_features[clf_k]
    # x_test is overwritten on every pass, so columns shared between
    # classifiers are standardised again on already-scaled values.
    x_test = test_scale(x_test, features)
    # Keep the last predict_proba column (presumably the failure class;
    # confirm against the saved model's classes_).
    lr_test = current_model.predict_proba(x_test[features])[:, -1]
    submission[f'lr{clf_k}'] = lr_test

############## Classifier 0 #################
############## Classifier 1 #################
############## Classifier 2 #################
############## Classifier 3 #################


In [15]:
# Replace each classifier's probabilities with their ranks (rankdata with its
# default tie handling), stored as rank0..rank3.
for clf_k in range(4):
    submission[f'rank{clf_k}'] = rankdata(submission[f'lr{clf_k}'])

In [16]:
# Final score: weighted sum of the four rank columns (weights add up to 1.0).
submission['failure'] = (
    0.2 * submission['rank0']
    + 0.25 * submission['rank1']
    + 0.25 * submission['rank2']
    + 0.3 * submission['rank3']
)

In [17]:
# Preview the first rows: the blended 'failure' score next to the
# per-classifier lr*/rank* columns it was built from.
submission.head()

Unnamed: 0,id,failure,lr0,lr1,lr2,lr3,rank0,rank1,rank2,rank3
0,26570,9314.5,0.209299,0.209417,0.210856,0.211499,8699.0,8717.0,9631.0,9959.0
1,26571,6187.7,0.204278,0.204394,0.205806,0.204075,6053.0,6041.0,6833.0,5862.0
2,26572,7977.8,0.207035,0.20715,0.208579,0.208823,7512.0,7520.0,8316.0,8388.0
3,26573,7064.25,0.205507,0.205623,0.207042,0.206711,6712.0,6703.0,7518.0,7222.0
4,26574,20169.75,0.2409,0.241036,0.242621,0.239694,20128.0,20201.0,20264.0,20093.0


In [18]:
# Write only the id and the blended failure score; the intermediate
# lr*/rank* columns are left out of the file.
submission[['id', 'failure']].to_csv('submission.csv', index=False)