In [56]:
# pip install feature_engine
# pip install imblearn
# pip install gdown

import gdown
url = "https://drive.google.com/drive/u/1/folders/1RQuymJleFRULtzlSPOUOtA6ah1onITu-"
gdown.download_folder(url, quiet=True, use_cookies=False)

import numpy as np
import pandas as pd
from feature_engine.encoding import WoEEncoder
import random
from sklearn.linear_model import HuberRegressor, LogisticRegression
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score

from imblearn.over_sampling import SMOTE
import pickle

import warnings
warnings.filterwarnings("ignore")

#### Load data

In [57]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train_labels = train.pop('failure')
print(len(train_labels))
train_test = pd.concat([train, test])
display(train.head(2))
print(f'train.shape = {train.shape}')
print(f'test.shape = {test.shape}')
print(f'test.shape = {train_test.shape}')

26570


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057


train.shape = (26570, 25)
test.shape = (20775, 25)
test.shape = (47345, 25)


#### Compute missing columns for measurement 3 and 5

In [58]:
train_test['missing_3'] = train_test['measurement_3'].isna()
train_test['missing_5'] = train_test['measurement_5'].isna()
display(train_test.head(2))
print(f"missing 3 = {train_test['missing_3'].sum()}/{len(train_test['missing_3'])}")
print(f"missing 5 = {train_test['missing_5'].sum()}/{len(train_test['missing_5'])}")

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,missing_3,missing_5
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,False,False
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,False,False


missing 3 = 710/47345
missing 5 = 1184/47345


In [59]:
category_features = [ col for col in train.columns if col.startswith('attribute_')]
numerical_features = [ col for col in train.columns if col.startswith('measurement_') or col == 'loading']
print(f'category_features = {category_features}')
print(f'numerical_features = {numerical_features}')


category_features = ['attribute_0', 'attribute_1', 'attribute_2', 'attribute_3']
numerical_features = ['loading', 'measurement_0', 'measurement_1', 'measurement_2', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17']


#### Calculate the highest absolute correlations for each feature

In [60]:
fill_dict = {
    'A': ['measurement_5','measurement_6','measurement_8'],
    'B': ['measurement_4','measurement_5','measurement_7'],
    'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
    'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
    'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
    'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
    'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
    'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
    'I': ['measurement_3','measurement_7','measurement_8']
}

In [61]:
product_feature_corr = {}
for product_code in train_test['product_code'].unique():
    product_data = train_test[ train_test['product_code'] == product_code ]
    feature_corr = {}
    for n_feature in numerical_features:
        abs_corr = np.abs(product_data[numerical_features].corr()[n_feature])
        sort_abs_corr = abs_corr.sort_values(ascending=False)
        feature_corr[n_feature] = sort_abs_corr[1:5].index.to_list() #, sort_abs_corr[1:5].to_list())
    
    product_feature_corr[product_code] = feature_corr

for product_code in train_test['product_code'].unique():
    product_feature_corr[product_code]['measurement_17'] = fill_dict[product_code]


# print(product_feature_corr['A']['measurement_0'])
print(product_feature_corr['H']['measurement_17'])

['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9']


#### Fill in Nan by HuberRegressor or KNNImputer

In [62]:

for product_code in train_test['product_code'].unique():
    product_data = train_test[ train_test['product_code'] == product_code ]
    for n_feature in numerical_features:
        corr_features = product_feature_corr[product_code][n_feature]
        if product_data[n_feature].isna().sum() != 0:
            print("Number of nan: ", product_data[n_feature].isna().sum())

            # HuberRegressor
            # target: clean, corr: clean
            huber_fit_data = product_data[ [n_feature] + corr_features].dropna(axis=0, how='any')
            # target: NaN, corr: clean
            huber_predict_data = product_data[ product_data[n_feature].isna() & (product_data[corr_features].isna().sum(axis=1) == 0)]
            print("Huber fill: ", huber_predict_data.shape[0])
            huber = HuberRegressor(epsilon=1.9)
            huber.fit(X=huber_fit_data[corr_features], y=huber_fit_data[n_feature])
            # Bottleneck: might not hold
            product_data[ product_data[n_feature].isna() & (product_data[corr_features].isna().sum(axis=1) == 0)][n_feature] = huber.predict(huber_predict_data[corr_features])  

    
    # KNNImputer
    imputer = KNNImputer(n_neighbors=20)
    product_data[numerical_features] = imputer.fit_transform(product_data[numerical_features])
    # print(product_data[numerical_features].isna().sum().sum())
    # Saved updated data to train_test
    train_test[ train_test['product_code'] == product_code ] = product_data


Number of nan:  49
Huber fill:  41
Number of nan:  68
Huber fill:  56
Number of nan:  102
Huber fill:  72
Number of nan:  151
Huber fill:  113
Number of nan:  175
Huber fill:  146
Number of nan:  176
Huber fill:  134
Number of nan:  187
Huber fill:  167
Number of nan:  225
Huber fill:  190
Number of nan:  257
Huber fill:  189
Number of nan:  296
Huber fill:  225
Number of nan:  326
Huber fill:  234
Number of nan:  330
Huber fill:  266
Number of nan:  316
Huber fill:  237
Number of nan:  377
Huber fill:  273
Number of nan:  388
Huber fill:  293
Number of nan:  426
Huber fill:  386
Number of nan:  41
Huber fill:  35
Number of nan:  84
Huber fill:  72
Number of nan:  95
Huber fill:  80
Number of nan:  115
Huber fill:  89
Number of nan:  133
Huber fill:  113
Number of nan:  193
Huber fill:  167
Number of nan:  212
Huber fill:  168
Number of nan:  251
Huber fill:  207
Number of nan:  267
Huber fill:  197
Number of nan:  287
Huber fill:  220
Number of nan:  298
Huber fill:  250
Number of nan

In [63]:
print(train_test.head())
print(train_test.shape)

   id product_code  loading attribute_0 attribute_1  attribute_2  attribute_3  \
0   0            A    80.10  material_7  material_8            9            5   
1   1            A    84.89  material_7  material_8            9            5   
2   2            A    82.43  material_7  material_8            9            5   
3   3            A   101.07  material_7  material_8            9            5   
4   4            A   188.06  material_7  material_8            9            5   

   measurement_0  measurement_1  measurement_2  ...  measurement_10  \
0              7              8              4  ...          15.859   
1             14              3              3  ...          17.947   
2             12              1              5  ...          15.607   
3             13              2              6  ...          16.346   
4              9              2              8  ...          17.082   

   measurement_11  measurement_12  measurement_13  measurement_14  \
0          17.594

#### Normalization

In [64]:
def normalized(data_df: pd.DataFrame, numerical_features: list):
    scaler = StandardScaler()
    normalized_data_df = data_df.copy()
    normalized_data_df[numerical_features] = scaler.fit_transform(data_df[numerical_features])
    # print(normalized_data_df.head(1))
    return normalized_data_df

normalized_train_test = pd.DataFrame(normalized(train_test, numerical_features), columns=train_test.columns)
normalized_train_test.head(1)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,missing_3,missing_5
0,0,A,-1.224768,material_7,material_8,9,5,-0.103287,-0.129328,-0.619559,...,-0.180774,-0.944901,2.428971,-0.543637,-0.358955,-1.34843,-1.143312,0.516183,False,False


#### Split train, test back to original

In [65]:
normalized_train_test = normalized_train_test.astype({'attribute_2': object, 'attribute_3': object})
train_filled = normalized_train_test.iloc[: train.shape[0], :]
test_filled = normalized_train_test.iloc[train.shape[0] :, :]


#### Encode "object" type columns

In [66]:
# Encode only category_features
woe_encoder = WoEEncoder(variables=category_features)
train_filled = woe_encoder.fit_transform(X=train_filled, y=train_labels)
test_filled = woe_encoder.transform(X=test_filled)
train_filled.head(2)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,missing_3,missing_5
0,0,A,-1.224768,0.017894,0.037537,0.085398,0.085398,-0.103287,-0.129328,-0.619559,...,-0.180774,-0.944901,2.428971,-0.543637,-0.358955,-1.34843,-1.143312,0.516183,False,False
1,1,A,-1.101624,0.017894,0.037537,0.085398,0.085398,1.568709,-1.299094,-0.901217,...,1.265554,-0.733789,-0.028075,-0.78898,-0.465462,-0.438603,-0.558464,-0.160361,False,False


#### Logistic regression & K fold

In [67]:
from operator import add

def weighted_answer(kfold_val_acc_lst):
    weighted_prob_lst = []
    for i in range(len(kfold_val_acc_lst[0])):
        # weighted_prob_lst.append(kfold_val_acc_lst[0][i]*0.3325 + kfold_val_acc_lst[1][i]*0.6675)
        weighted_prob_lst.append(kfold_val_acc_lst[0][i]*0.2 + kfold_val_acc_lst[1][i]*0.25 + kfold_val_acc_lst[2][i]*0.25 + kfold_val_acc_lst[3][i]*0.3)
    return weighted_prob_lst

In [68]:
num_split = 2
skf = StratifiedKFold(n_splits=num_split, shuffle=True, random_state=0)
sm = SMOTE(random_state = 42, n_jobs = -1)


# features = numerical_features
features1 = ['missing_3', 'missing_5', 'measurement_1', 'measurement_2', 'loading', 'measurement_17']
features2 = ['missing_3', 'missing_5', 'loading', 'measurement_17']
features3 = ['missing_3', 'missing_5', 'measurement_2', 'loading', 'measurement_17']
features4 = ['measurement_2', 'loading', 'measurement_17']
ensemble_features = [features1, features2, features3, features4]

feature_importance = []
kfold_val_prob_lst = []
for n, features in enumerate(ensemble_features):
    answer = np.zeros(test_filled.shape[0])
    kfold_val_acc = 0
    for i, (train_index, val_index) in enumerate(skf.split(train_filled, train_labels)):
        # Get index
        train_data = train_filled.iloc[train_index][features].reset_index(drop=True)
        train_label_data = train_labels[train_index]
        val_data = train_filled.iloc[val_index][features].reset_index(drop=True)
        val_label_data = train_labels[val_index]
        # SMOTE oversampling
        train_data, train_label_data = sm.fit_resample(train_data, train_label_data)
        # Logistic regerssion
        clf = LogisticRegression(max_iter=1000, C=0.0001, penalty='l2',solver='newton-cg')
        clf = pickle.load(open(f'./saved_model/clf{n}_{i}.pickle', "rb"))
        print(f"Successfully load clf{n}_{i}.pickle")
        # clf.fit(train_data, train_label_data)
        # pickle.dump(clf, open(f"./saved_model/clf{n}_{i}.pickle", "wb"))

        val_pred = clf.predict(val_data[features])
        val_acc = accuracy_score(val_label_data, val_pred)
        kfold_val_acc += val_acc / num_split
        print("val acc = ", round(val_acc,5))
        answer += clf.predict_proba(test_filled[features])[:, 1] / num_split

    kfold_val_prob_lst.append(answer)
    print(f"Avg accuracy = {round(kfold_val_acc, 5)}")
weighted_val_answer = weighted_answer(kfold_val_prob_lst)
weighted_val_binary_answer = [int(prob > 0.5) for prob in weighted_val_answer]
# val_acc = accuracy_score(val_label_data, weighted_val_pred)   answer or val
# print(f'weighted_ave_acc = {weighted_val_binary_answer}')

Successfully load clf0_0.pickle
val acc =  0.59541
Successfully load clf0_1.pickle
val acc =  0.59262
Avg accuracy = 0.59402
Successfully load clf1_0.pickle
val acc =  0.59691
Successfully load clf1_1.pickle
val acc =  0.59654
Avg accuracy = 0.59673
Successfully load clf2_0.pickle
val acc =  0.59578
Successfully load clf2_1.pickle
val acc =  0.59699
Avg accuracy = 0.59639
Successfully load clf3_0.pickle
val acc =  0.59518
Successfully load clf3_1.pickle
val acc =  0.59729
Avg accuracy = 0.59624


In [69]:
sub_log = pd.read_csv('sample_submission.csv')
sub_log['failure'] = weighted_val_answer

sub_log.to_csv("0816036.csv", index=False)


#### Load model

In [70]:
# loaded_model = pickle.load(open('saved_model\clf1_0.pickle', "rb"))
# loaded_model

In [71]:
# STOP

#### Combine sm and w/o sm predictions

In [72]:
# sub_59002 = pd.read_csv('submission_0.59017.csv')
# sub_59017 = pd.read_csv('submission_0.59017_0.33_002_0.67_023.csv')
# new_sub = sub_59002
# new_sub['failure'] = [value59002*0.5 + value59017*0.5 for value59002, value59017 in zip(sub_59002['failure'], sub_59017['failure'])]
# new_sub.to_csv("submission.csv", index=False)