In [1]:
import numpy as np
import pandas as pd
from itertools import cycle, islice
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import GroupKFold
import math

from sklearn.preprocessing import StandardScaler

# Plot styling: switch matplotlib to the "fivethirtyeight" style sheet and keep
# its colour list around, both as a plain list and as an endless iterator.
plt.style.use("fivethirtyeight")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(list(color_pal))

# Read the train and test tables, using the `id` column as the index of each.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        "../input/tabular-playground-series-aug-2022/train.csv",
        '../input/tabular-playground-series-aug-2022/test.csv',
    )
)


def data_process(train, test):
    """Build model-ready feature frames from the raw train and test tables.

    Both tables are stacked and processed together, so the fill values, the
    imputers and the scaler are all fitted on train and test rows combined.

    Steps, in order:
      1. Flag rows where ``measurement_3`` / ``measurement_5`` are missing.
      2. One-hot encode ``attribute_0`` and ``attribute_1`` and drop the
         ``attribute_0_material_5`` indicator.
      3. Replace ``attribute_2`` and ``attribute_3`` by their product (``area``).
      4. Fill missing ``loading`` with the column mean and add ``round_loading``
         (``loading`` rounded down to a multiple of 10).
      5. Impute every ``measurement*`` column with a ``KNNImputer`` fitted
         separately for each ``product_code``.
      6. Add ``m3*m5`` and ``measurement_avg`` (mean of measurements 3..16),
         then drop measurements 3..16.
      7. Standardise every column except ``product_code``, ``isTrain``,
         ``failure``, ``isM3`` and ``isM5``.

    Args:
        train: Raw training frame; must contain ``failure``.
        test: Raw test frame. Its ``measurement*`` column names define which
            columns are imputed.

    Returns:
        ``(tr_df, ts_df)``: the processed train rows (``failure`` kept) and the
        processed test rows (``failure`` dropped). Rows keep the order they had
        in the inputs, and neither input frame is modified.

    Raises:
        KeyError: if an expected column (for example ``attribute_0_material_5``
            after encoding) is absent.
    """
    # `assign` returns new frames, so the caller's inputs do not gain an
    # `isTrain` column as a side effect.
    data = pd.concat([train.assign(isTrain=True), test.assign(isTrain=False)])

    # Missingness indicators must be captured before anything is imputed.
    data['isM3'] = data.measurement_3.isna()
    data['isM5'] = data.measurement_5.isna()
    data['isM3andM5'] = data['isM3'] & data['isM5']

    # Encode in place on the frame: no index join is involved, so a repeated
    # index value cannot multiply rows.
    materials = ['attribute_0', 'attribute_1']
    data = pd.get_dummies(data, columns=materials)
    data = data.drop('attribute_0_material_5', axis=1)

    data['area'] = data['attribute_2'] * data['attribute_3']
    data = data.drop(['attribute_2', 'attribute_3'], axis=1)

    data['loading'] = data['loading'].fillna(data['loading'].mean())
    data['round_loading'] = np.floor(data['loading'] / 10) * 10

    features = [a for a in test.columns if a.startswith('measurement')]
    # Imputed values are floats; cast first so the in-place writes below never
    # have to change a column's dtype.
    data[features] = data[features].astype(float)

    # Write the imputed block back through a boolean mask instead of rebuilding
    # the frame from per-product pieces: that keeps the original row order,
    # which matters when predictions are later matched to rows by position.
    for code in data.product_code.unique():
        rows = data.product_code == code
        imputer = KNNImputer(n_neighbors=7)
        data.loc[rows, features] = imputer.fit_transform(data.loc[rows, features])

    data['m3*m5'] = data['measurement_3'] * data['measurement_5']

    averaged = [f'measurement_{i}' for i in range(3, 17)]
    data['measurement_avg'] = data[averaged].mean(axis=1)
    data = data.drop(averaged, axis=1)

    unscaled = ['product_code', 'isTrain', 'failure', 'isM3', 'isM5']
    columns = [a for a in data.columns if a not in unscaled]
    data[columns] = StandardScaler().fit_transform(data[columns])

    is_train = data['isTrain'].astype(bool)
    tr_df = data[is_train].drop('isTrain', axis=1)
    ts_df = data[~is_train].drop(['isTrain', 'failure'], axis=1)

    return tr_df, ts_df


# Run the preprocessing; both names are rebound to the processed frames.
train_df, test_df = data_process(train_df, test_df)

# Stratification label: product code and failure value joined into one string.
# It is passed to StratifiedKFold as the target, so folds are stratified on the
# (product, outcome) combination rather than on the outcome alone.
train_df['cat_failure'] = train_df['product_code'].astype(str) + train_df['failure'].astype(str)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=11)
X = train_df.drop(columns='cat_failure')
y = train_df['cat_failure']

# One (frame, target) pair per fold. The frames still carry `product_code` and
# `failure`; the modelling cells drop both before fitting.
train_list, test_list = [], []
for fit_rows, holdout_rows in kfold.split(X, y):
    fit_part = X.iloc[fit_rows]
    holdout_part = X.iloc[holdout_rows]

    train_list.append((fit_part, fit_part.failure))
    test_list.append((holdout_part, holdout_part.failure))


In [2]:
scores = []
test_predictions = []

# Feature frame for the competition test set; it is the same for every fold.
test_features = test_df.drop(columns='product_code')

for i, ((fit_frame, y_train), (val_frame, y_test)) in enumerate(zip(train_list, test_list)):
    X_train = fit_frame.drop(columns=['product_code', 'failure'])
    X_test = val_frame.drop(columns=['product_code', 'failure'])

    # Linear booster updated by greedy coordinate descent.
    model = XGBRegressor(
        n_estimators=100,
        booster='gblinear',
        learning_rate=0.3,
        reg_lambda=0.2,
        updater='coord_descent',
        feature_selector='greedy',
    )
    model.fit(X_train, y_train, verbose=0)

    # The regressor's raw outputs are used as ranking scores for the AUC.
    score = roc_auc_score(y_test, model.predict(X_test))
    scores.append(score)

    # Keep this fold's test-set predictions for the final average.
    test_predictions.append(model.predict(test_features))

    print(f'FOLD {i}: {score}')

print('')
print(f' Total Average: {np.mean(scores)}')

FOLD 0: 0.5968181101039763
FOLD 1: 0.5901184461545239
FOLD 2: 0.589373392564882
FOLD 3: 0.6022896314293169
FOLD 4: 0.5987876275402291
FOLD 5: 0.5836151232017414
FOLD 6: 0.6078304630479272
FOLD 7: 0.5560060237905886
FOLD 8: 0.5765496752868173
FOLD 9: 0.6110683004564406

 Total Average: 0.5912456793576444


In [3]:
scores = []

# Test-set features are identical for every fold, so build the frame once.
test_features = test_df.drop('product_code', axis=1)

# `test_predictions` is deliberately not reset here: this cell appends its
# probabilities to whatever the list already holds. Re-running this cell on its
# own therefore adds another set of entries to the list.
for i, ((fold_train, y_train), (fold_valid, y_test)) in enumerate(zip(train_list, test_list)):
    X_train = fold_train.drop(['product_code', 'failure'], axis=1)
    X_test = fold_valid.drop(['product_code', 'failure'], axis=1)

    # 'saga' is a stochastic solver; a fixed random_state makes the fold scores
    # and the test probabilities reproducible from run to run.
    # NOTE(review): newer scikit-learn releases spell the unpenalised option as
    # penalty=None; confirm against the installed version before changing it.
    model = LogisticRegression(max_iter=500, C=1, penalty='none', solver='saga', random_state=11)

    # Fit model
    model.fit(X_train, y_train)

    # Score the held-out fold on the predicted probability of class 1
    score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    scores.append(score)

    test_predictions.append(model.predict_proba(test_features)[:, 1])

    print(f'FOLD {i}: {score}')

print('')
print(f' Total Average: {np.mean(scores)}')

FOLD 0: 0.594577079738948
FOLD 1: 0.5949754839671584
FOLD 2: 0.5895783987828391
FOLD 3: 0.6039000315133525
FOLD 4: 0.5992461801384118
FOLD 5: 0.5867346680118225
FOLD 6: 0.6013345094997407
FOLD 7: 0.5544721568892874
FOLD 8: 0.5781255016873594
FOLD 9: 0.6150023743014399

 Total Average: 0.5917946384530359


In [4]:
# Average the collected test-set prediction arrays row by row in one vectorised
# call (axis 0 runs over the stored arrays). Every array must have the same
# length; a mismatch raises here instead of being silently truncated.
predictions = np.mean(test_predictions, axis=0)

In [5]:
submission = pd.read_csv('../input/tabular-playground-series-aug-2022/sample_submission.csv', index_col='id')
# Attach the predictions to the ids of the rows they were computed for and let
# pandas align on `id`, so the result does not depend on `test_df` and the
# sample submission listing their rows in the same order.
submission['failure'] = pd.Series(predictions, index=test_df.index)
assert submission['failure'].notna().all(), "some submission ids received no prediction"

In [6]:
# Write the submission table, with its `id` index, as CSV text.
# NOTE(review): the output name has no ".csv" extension; confirm that is intended.
submission.to_csv(path_or_buf='submission_1')