In [126]:
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [127]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [128]:
# Read file to pd dataframe
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
# Get rid of "id" and "product_code"
train = train.iloc[:train.shape[0], 2:train.shape[1]]
test = test.iloc[:test.shape[0], 2:test.shape[1]]

In [129]:
# Map the value in attribute_0 and attribute_1
attribute_0 = {"material_5": "0", "material_7": "1"}
attribute_1 = {"material_5": "0", "material_6": "1",
               "material_7": "2", "material_8": "3"}
# Replace the value in attribute_0 and attribute_1
train = train.replace({"attribute_0": attribute_0})
train = train.replace({"attribute_1": attribute_1})
test = test.replace({"attribute_0": attribute_0})
test = test.replace({"attribute_1": attribute_1})
# Change the column type from obejct to int
train["attribute_0"] = train["attribute_0"].astype(int)
train["attribute_1"] = train["attribute_1"].astype(int)
test["attribute_0"] = test["attribute_0"].astype(int)
test["attribute_1"] = test["attribute_1"].astype(int)

In [130]:
# Fill the Nan value with the average
for i in train.columns:
    null_cols=train[train[i].isna()].index
    for n in null_cols:
        train.loc[n,i]=train[i].mean()
for i in test.columns:
    null_cols = test[test[i].isna()].index
    for n in null_cols:
        test.loc[n,i]=test[i].mean()

In [131]:
# Apply feature engineering from reference in discussion
# Multiply length dimensions to get area. attributes 2 and 3 look like they are width and length dimensions or similar.
# Aggregate measurement_3 to 16 into average and stdev. They look like they belong to the same group.
def mean_3_to_16(row):
    return row.iloc[9:20].mean()

def std_3_to_16(row):
    return row.iloc[9:20].std()

def attri_multi(row):
    return row.iloc[3] * row.iloc[4]

train["measurement_std"] = train.apply(lambda row : std_3_to_16(row), axis = 1)
train['measurement_mean'] = train.apply(lambda row : mean_3_to_16(row), axis = 1)
train['attri_2_attri_3'] = train.apply(lambda row : attri_multi(row), axis = 1)
remain_col = ["loading", "attribute_0", "attribute_1", "attri_2_attri_3", "measurement_0",
              "measurement_1", "measurement_2", "measurement_17", "measurement_mean", "measurement_std", "failure"]
train = train[remain_col]

test["measurement_std"] = test.apply(lambda row: std_3_to_16(row), axis=1)
test['measurement_mean'] = test.apply(lambda row: mean_3_to_16(row), axis=1)
test['attri_2_attri_3'] = test.apply(lambda row: attri_multi(row), axis=1)
remain_col = ["loading", "attribute_0", "attribute_1", "attri_2_attri_3", "measurement_0",
              "measurement_1", "measurement_2", "measurement_17", "measurement_mean", "measurement_std"]
test =test[remain_col]

In [141]:
# Split the feature and target
train_x = train.iloc[:train.shape[0] , :train.shape[1] - 1].values
train_y = train.iloc[:train.shape[0], train.shape[1] - 1:].values

In [142]:
train_x.shape

(26570, 10)

In [137]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [189]:
def build_model():
    model = keras.models.Sequential()
    model.add(layers.Dense(50, activation='relu', input_shape = (10,)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                metrics=['accuracy'])
    return model
estimator = []
estimator.append(('standardize', StandardScaler()))
estimator.append(('mlp', KerasClassifier(
    model=build_model, epochs=100, batch_size=16, verbose=0)))
pipeline = Pipeline(estimator)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(pipeline, train_x, train_y, cv=kfold)
print("Acc: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


Acc: 78.65% (0.13%)


In [190]:
pipeline.fit(train_x, train_y)

In [162]:
print(test.values.shape)

(20775, 10)


In [175]:
y_pred = pipeline.predict(test.values)
y_pred = np.array(y_pred)
y_pred = pd.DataFrame(y_pred, columns= ["failure"])
submission = pd.read_csv("data/sample_submission.csv")
sub = pd.DataFrame({'id': submission.id, 'failure': y_pred.failure})
print(sub)
sub.to_csv("submission.csv", index=False)


          id  failure
0      26570        0
1      26571        0
2      26572        0
3      26573        0
4      26574        0
...      ...      ...
20770  47340        0
20771  47341        0
20772  47342        0
20773  47343        0
20774  47344        0

[20775 rows x 2 columns]
