In [1]:
!pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pickle
import pandas as pd
from dataclasses import dataclass
from sklearn.impute import KNNImputer
from feature_engine.encoding import WoEEncoder
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import roc_auc_score

In [3]:
# Mount Google Drive at /content/drive so the files under `root` can be read.
# This cell only works inside a Colab runtime, which provides `google.colab`.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Drive folder that holds every input file read by this notebook.
root = '/content/drive/MyDrive/ML_final'
# File names below are joined to `root` with '/' when they are opened.
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'
SUB_FILE = 'sample_submission.csv'
# File from which the fitted models are unpickled for inference.
MODEL_FILE = 'models.pckl'

In [5]:
# Read the training set, the test set and the submission template from `root`,
# in that order.
train_df, test_df, submission = (
    pd.read_csv(f'{root}/{file_name}')
    for file_name in (TRAIN_FILE, TEST_FILE, SUB_FILE)
)

In [6]:
# Reference: https://www.kaggle.com/code/medali1992/tps-aug-logistic-regression

def preprocessing(df_train, df_test):
    """Impute missing measurements and build the engineered feature columns.

    The train and test frames are stacked, missing ``measurement_*`` values
    are filled per ``product_code`` (first with a ``HuberRegressor`` on the
    most correlated measurement columns, then with a 3-neighbour
    ``KNNImputer`` for whatever is still missing in the measurement/loading
    columns), row-wise summary statistics of ``measurement_3`` ..
    ``measurement_16`` are added, and ``attribute_0`` is weight-of-evidence
    encoded with an encoder fitted on the training rows only.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Must contain the ``failure`` column, which is used as
        the target when fitting the WoE encoder.
    df_test : pandas.DataFrame
        Test rows. Its column names decide which columns are excluded from
        the correlation analysis.

    Returns
    -------
    tuple
        ``(df_train, df_test, features)``: the transformed training rows, the
        transformed test rows, and a list of feature column names.
        ``measurement_max`` is computed but is not part of that list.

    Notes
    -----
    Progress is reported through ``print`` and the ranking table is shown
    with the notebook's ``display``.

    NOTE(review): ``HuberRegressor`` runs with its default iteration limit
    and the recorded run shows lbfgs "iterations reached limit" warnings.
    Raising the limit would change the imputed values, so confirm against
    the pipeline that trained the pickled models before touching it.
    """
    # pd.concat keeps the original row labels, so labels may repeat between
    # the train and test parts. Every row selection below therefore relies on
    # boolean masks or positions, never on labels.
    data = pd.concat([df_train, df_test])
    n_train = df_train.shape[0]

    # Create three new attributes: m3_missing, m5_missing and area
    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    product_codes = data.product_code.unique()

    # full_fill_dict maps target column -> product code -> predictor columns
    # used by the linear imputation model. Only 'measurement_17' gets a
    # manual predictor selection; the other targets are picked by correlation.
    full_fill_dict = {}
    full_fill_dict['measurement_17'] = {
        'A': ['measurement_5', 'measurement_6',
              'measurement_8'],
        'B': ['measurement_4', 'measurement_5',
              'measurement_7'],
        'C': ['measurement_5', 'measurement_7',
              'measurement_8', 'measurement_9'],
        'D': ['measurement_5', 'measurement_6',
              'measurement_7', 'measurement_8'],
        'E': ['measurement_4', 'measurement_5',
              'measurement_6', 'measurement_8'],
        'F': ['measurement_4', 'measurement_5',
              'measurement_6', 'measurement_7'],
        'G': ['measurement_4', 'measurement_6',
              'measurement_8', 'measurement_9'],
        'H': ['measurement_4', 'measurement_5',
              'measurement_7', 'measurement_8',
              'measurement_9'],
        'I': ['measurement_3', 'measurement_7',
              'measurement_8']
    }

    # Columns left out of the correlation analysis: every test column whose
    # name does not contain 'measurement', plus the two missing indicators.
    # NOTE(review): 'area' and any column that exists only in df_train (such
    # as the target) are not in this list and so stay in the correlation
    # frame -- confirm that this is intended.
    excluded_cols = [name for name in df_test.columns if 'measurement' not in name] + \
                    ['loading', 'm3_missing', 'm5_missing']
    corr_frame = data.drop(excluded_cols, axis=1)

    # The data is not modified until the imputation loop, so the global
    # correlation matrix is computed once instead of once per candidate.
    abs_corr_all = corr_frame.corr().abs()

    # Rank measurement_3 .. measurement_16 by the sum of their three largest
    # absolute correlations (position 0 after sorting is the column itself).
    candidate_cols = [f'measurement_{x}' for x in range(3, 17)]
    correlation_totals = [
        np.round(abs_corr_all[name].sort_values(ascending=False).iloc[1:4].sum(), 3)
        for name in candidate_cols
    ]
    ranking = pd.DataFrame({
        'Selected columns': candidate_cols,
        'correlation total': correlation_totals,
    })
    ranking = ranking.sort_values(by='correlation total', ascending=False).reset_index(drop=True)
    print('Columns selected by correlation sum of the 3 first rows: ')
    display(ranking.head(10))

    # One absolute correlation matrix per product code, shared by all targets.
    abs_corr_by_code = {
        code: corr_frame[data.product_code == code].corr().abs()
        for code in product_codes
    }

    # For each of the 10 best-ranked targets, use the 4 most correlated
    # columns within each product code as predictors.
    for target_col in ranking['Selected columns'].head(10):
        full_fill_dict[target_col] = {
            code: abs_corr_by_code[code][target_col]
            .sort_values(ascending=False).iloc[1:5].index.tolist()
            for code in product_codes
        }

    # Columns handed to the KNN imputer. Collected before the aggregate
    # 'measurement_*' statistics are added further down.
    feature = [f for f in data.columns if f.startswith('measurement') or f == 'loading']
    print('failure' in feature)
    nullValue_cols = [name for name in df_train.columns if df_train[name].isnull().sum() != 0]

    for code in product_codes:
        code_mask = data.product_code == code
        total_na_filled_by_linear_model = 0
        print(f'\n-------- Product code {code} ----------\n')
        print('filled by linear model :')
        for target_col, predictors_by_code in full_fill_dict.items():
            predictors = predictors_by_code[code]

            # Rows of this product whose predictors are all present but whose
            # target is missing. Recomputed for every target because values
            # imputed earlier in this loop can serve as predictors later on.
            fill_mask = (
                code_mask
                & (data[predictors].isnull().sum(axis=1) == 0)
                & data[target_col].isnull()
            )
            n_to_fill = int(fill_mask.sum())

            # Predicting on zero rows makes scikit-learn raise, so fit and
            # predict only when there is something to impute.
            if n_to_fill > 0:
                train_rows = data.loc[code_mask, predictors + [target_col]].dropna(how='any')
                model = HuberRegressor(epsilon=1.9)
                model.fit(train_rows[predictors], train_rows[target_col])
                data.loc[fill_mask, target_col] = model.predict(data.loc[fill_mask, predictors])

            print(f'{target_col} : {n_to_fill}')
            total_na_filled_by_linear_model += n_to_fill

        # Remaining missing values, counted over the columns that have
        # missing values in the training frame; the KNN imputer then fills
        # the measurement/loading columns for this product code.
        NA = data.loc[code_mask, nullValue_cols].isnull().sum().sum()
        knn_imputer = KNNImputer(n_neighbors=3)
        data.loc[code_mask, feature] = knn_imputer.fit_transform(data.loc[code_mask, feature])
        print(f'\n{total_na_filled_by_linear_model} filled by linear model ')
        print(f'{NA} filled by KNN ')

    # Row-wise summary statistics over measurement_3 .. measurement_16.
    stat_block = data[[f'measurement_{i}' for i in range(3, 17)]]
    data['measurement_avg'] = stat_block.mean(axis=1)
    data['measurement_std'] = stat_block.std(axis=1)
    data['measurement_median'] = stat_block.median(axis=1)
    data['measurement_max'] = stat_block.max(axis=1)
    data['measurement_min'] = stat_block.min(axis=1)
    data['measurement_skew'] = stat_block.skew(axis=1)

    # Split back by position: the first n_train rows came from df_train.
    df_train = data.iloc[:n_train, :]
    df_test = data.iloc[n_train:, :]

    # The encoder is fitted on the training rows only and applied to both.
    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(df_train, df_train['failure'])
    df_train = woe_encoder.transform(df_train)
    df_test = woe_encoder.transform(df_test)

    features = ['loading', 'attribute_0', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2', 'area', 'm3_missing', 'm5_missing', 'measurement_avg', 'measurement_std', 'measurement_median', 'measurement_min', 'measurement_skew']

    return df_train, df_test, features

# Run the imputation / feature pipeline on the raw frames. `features` is the
# list returned by preprocessing; the prediction cell below selects columns
# through `fit_features` instead.
df_train, df_test, features = preprocessing(train_df, test_df)

Columns selected by correlation sum of the 3 first rows: 


Unnamed: 0,Selected columns,correlation total
0,measurement_8,0.454
1,measurement_11,0.395
2,measurement_5,0.386
3,measurement_6,0.365
4,measurement_7,0.336
5,measurement_4,0.331
6,measurement_15,0.301
7,measurement_10,0.3
8,measurement_16,0.252
9,measurement_14,0.225


False

-------- Product code A ----------

filled by linear model :
measurement_17 : 386
measurement_8 : 167
measurement_11 : 225


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_5 : 113
measurement_6 : 146
measurement_7 : 153
measurement_4 : 79
measurement_15 : 273
measurement_10 : 209
measurement_16 : 293
measurement_14 : 237

2281 filled by linear model 
1568 filled by KNN 

-------- Product code B ----------

filled by linear model :
measurement_17 : 418
measurement_8 : 165
measurement_11 : 220
measurement_5 : 83
measurement_6 : 106
measurement_7 : 176
measurement_4 : 80
measurement_15 : 294
measurement_10 : 197
measurement_16 : 358
measurement_14 : 330

2427 filled by linear model 
1548 filled by KNN 

-------- Product code C ----------

filled by linear model :
measurement_17 : 391
measurement_8 : 211
measurement_11 : 231
measurement_5 : 141
measurement_6 : 150
measurement_7 : 140


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_4 : 110
measurement_15 : 319
measurement_10 : 262
measurement_16 : 343
measurement_14 : 340

2638 filled by linear model 
1706 filled by KNN 

-------- Product code D ----------

filled by linear model :
measurement_17 : 398
measurement_8 : 146
measurement_11 : 265
measurement_5 : 87
measurement_6 : 118
measurement_7 : 146
measurement_4 : 88
measurement_15 : 313
measurement_10 : 174
measurement_16 : 322
measurement_14 : 316

2373 filled by linear model 
1600 filled by KNN 

-------- Product code E ----------

filled by linear model :
measurement_17 : 429


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_8 : 171
measurement_11 : 244


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_5 : 116
measurement_6 : 127
measurement_7 : 185
measurement_4 : 105
measurement_15 : 315
measurement_10 : 193
measurement_16 : 316
measurement_14 : 297

2498 filled by linear model 
1634 filled by KNN 

-------- Product code F ----------

filled by linear model :
measurement_17 : 420
measurement_8 : 194
measurement_11 : 226


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_5 : 90
measurement_6 : 137
measurement_7 : 147
measurement_4 : 91
measurement_15 : 333
measurement_10 : 186
measurement_16 : 356
measurement_14 : 348

2528 filled by linear model 
1545 filled by KNN 

-------- Product code G ----------

filled by linear model :
measurement_17 : 373


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_8 : 188
measurement_11 : 221
measurement_5 : 104
measurement_6 : 146
measurement_7 : 145


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_4 : 93
measurement_15 : 299
measurement_10 : 226
measurement_16 : 343
measurement_14 : 268

2406 filled by linear model 
1518 filled by KNN 

-------- Product code H ----------

filled by linear model :
measurement_17 : 361


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_8 : 147
measurement_11 : 205


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_5 : 112


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_6 : 121
measurement_7 : 158
measurement_4 : 75
measurement_15 : 299
measurement_10 : 217
measurement_16 : 340
measurement_14 : 283

2318 filled by linear model 
1565 filled by KNN 

-------- Product code I ----------

filled by linear model :
measurement_17 : 377
measurement_8 : 192
measurement_11 : 209


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_5 : 119
measurement_6 : 132
measurement_7 : 136


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_4 : 89
measurement_15 : 350
measurement_10 : 246
measurement_16 : 294
measurement_14 : 283

2427 filled by linear model 
1402 filled by KNN 


In [7]:
# Columns passed to the loaded models at prediction time, in this order.
fit_features = [
    'loading',
    'attribute_0',
    'measurement_17',
    'measurement_0',
    'measurement_1',
    'measurement_2',
    'area',
    'm3_missing',
    'm5_missing',
    'measurement_avg',
]

In [8]:
# The model file is read as a sequence of consecutive pickles: objects are
# unpickled one after another until the stream reports end-of-file.
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
models = []
with open(f'{root}/{MODEL_FILE}', "rb") as model_stream:
    try:
        while True:
            models.append(pickle.load(model_stream))
    except EOFError:
        # End of the stream reached: every stored object has been read.
        pass

In [9]:
# Ensemble by averaging: one row of positive-class probabilities per model
# (the first 10 entries of `models`), then the column-wise mean over them.
test_inputs = df_test[fit_features]
model_probs = np.zeros(shape=(10, len(df_test)))
for model_idx in range(10):
    model_probs[model_idx] = models[model_idx].predict_proba(test_inputs)[:, 1]

preds = model_probs.sum(axis=0) / 10

In [11]:
# Store the percentile rank of each averaged prediction (not the raw
# probability) in the 'failure' column, then write the submission file.
submission['failure'] = pd.Series(preds).rank(pct=True).values
submission.to_csv('109705004_3.csv', index=False)