In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [42]:
df_train = pd.read_csv("train.csv")
df_train.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [43]:
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,136429,L50896,L,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,136430,L53866,L,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,136431,L50498,L,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,136432,M21232,M,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,136433,M19751,M,303.4,312.3,1515,41.3,114,0,0,0,0,0


In [44]:
df_train.columns = [col.replace('[', '').replace(']', '').replace('<', '') if col not in ['Type'] else col for col in df_train.columns]
df_test.columns = [col.replace('[', '').replace(']', '').replace('<', '') if col not in ['Type'] else col for col in df_test.columns]
df_train.columns

Index(['id', 'Product ID', 'Type', 'Air temperature K',
       'Process temperature K', 'Rotational speed rpm', 'Torque Nm',
       'Tool wear min', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [45]:
df_train.isnull().sum()

id                       0
Product ID               0
Type                     0
Air temperature K        0
Process temperature K    0
Rotational speed rpm     0
Torque Nm                0
Tool wear min            0
Machine failure          0
TWF                      0
HDF                      0
PWF                      0
OSF                      0
RNF                      0
dtype: int64

In [46]:
df_test.isnull().sum()

id                       0
Product ID               0
Type                     0
Air temperature K        0
Process temperature K    0
Rotational speed rpm     0
Torque Nm                0
Tool wear min            0
TWF                      0
HDF                      0
PWF                      0
OSF                      0
RNF                      0
dtype: int64

In [47]:
df_train.describe()

Unnamed: 0,id,Air temperature K,Process temperature K,Rotational speed rpm,Torque Nm,Tool wear min,Machine failure,TWF,HDF,PWF,OSF,RNF
count,136429.0,136429.0,136429.0,136429.0,136429.0,136429.0,136429.0,136429.0,136429.0,136429.0,136429.0,136429.0
mean,68214.0,299.862776,309.94107,1520.33111,40.348643,104.408901,0.015744,0.001554,0.00516,0.002397,0.003958,0.002258
std,39383.804275,1.862247,1.385173,138.736632,8.502229,63.96504,0.124486,0.039389,0.071649,0.048899,0.062789,0.047461
min,0.0,295.3,305.8,1181.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,34107.0,298.3,308.7,1432.0,34.6,48.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,68214.0,300.0,310.0,1493.0,40.4,106.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,102321.0,301.2,310.9,1580.0,46.1,159.0,0.0,0.0,0.0,0.0,0.0,0.0
max,136428.0,304.4,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


In [48]:
df_test.describe()

Unnamed: 0,id,Air temperature K,Process temperature K,Rotational speed rpm,Torque Nm,Tool wear min,TWF,HDF,PWF,OSF,RNF
count,90954.0,90954.0,90954.0,90954.0,90954.0,90954.0,90954.0,90954.0,90954.0,90954.0,90954.0
mean,181905.5,299.859493,309.939375,1520.528179,40.335191,104.293962,0.001473,0.005343,0.002353,0.00387,0.002309
std,26256.302529,1.857562,1.385296,139.970419,8.504683,63.871092,0.038355,0.072903,0.048449,0.06209,0.047995
min,136429.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0
25%,159167.25,298.3,308.7,1432.0,34.6,48.0,0.0,0.0,0.0,0.0,0.0
50%,181905.5,300.0,310.0,1493.0,40.5,106.0,0.0,0.0,0.0,0.0,0.0
75%,204643.75,301.2,310.9,1579.0,46.2,158.0,0.0,0.0,0.0,0.0,0.0
max,227382.0,304.4,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0


In [49]:
df_train['TorqueMean'] = df_train.groupby('Product ID')['Torque Nm'].transform('mean')
df_train['ToolWearMax'] = df_train.groupby('Product ID')['Tool wear min'].transform('max')

df_train['TorqueLag1'] = df_train.groupby('Product ID')['Torque Nm'].shift(1)
df_train['ToolWearLag1'] = df_train.groupby('Product ID')['Tool wear min'].shift(1)

df_train['TorqueToSpeedRatio'] = df_train['Torque Nm']/df_train['Rotational speed rpm']
df_train['TempRatio'] = df_train['Air temperature K']/df_train['Process temperature K']

df_train['TorqueRollingMean'] = df_train.groupby('Product ID')['Torque Nm'].rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)

In [50]:
df_test['TorqueMean'] = df_test.groupby('Product ID')['Torque Nm'].transform('mean')
df_test['ToolWearMax'] = df_test.groupby('Product ID')['Tool wear min'].transform('max')

df_test['TorqueLag1'] = df_test.groupby('Product ID')['Torque Nm'].shift(1)
df_test['ToolWearLag1'] = df_test.groupby('Product ID')['Tool wear min'].shift(1)

df_test['TorqueToSpeedRatio'] = df_test['Torque Nm']/df_test['Rotational speed rpm']
df_test['TempRatio'] = df_test['Air temperature K']/df_test['Process temperature K']

df_test['TorqueRollingMean'] = df_test.groupby('Product ID')['Torque Nm'].rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)

In [51]:
df_train.head(10)

Unnamed: 0,id,Product ID,Type,Air temperature K,Process temperature K,Rotational speed rpm,Torque Nm,Tool wear min,Machine failure,TWF,...,PWF,OSF,RNF,TorqueMean,ToolWearMax,TorqueLag1,ToolWearLag1,TorqueToSpeedRatio,TempRatio,TorqueRollingMean
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,...,0,0,0,39.084,221,,,0.022619,0.97093,36.1
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,...,0,0,0,35.56,200,,,0.016543,0.969561,29.1
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,...,0,0,0,40.4,173,,,0.014681,0.970178,26.5
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,...,0,0,0,41.024138,197,,,0.029068,0.968157,44.3
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,...,0,0,0,39.8875,98,,,0.021572,0.964401,35.4
5,5,M24300,M,298.4,308.9,1429,42.1,65,0,0,...,0,0,0,46.7,198,,,0.029461,0.966008,42.1
6,6,L56736,L,299.6,311.0,1413,42.9,156,0,0,...,0,0,0,41.577273,179,,,0.030361,0.963344,42.9
7,7,L55488,L,298.7,310.1,1609,38.1,67,0,0,...,0,0,0,40.842857,192,,,0.023679,0.963238,38.1
8,8,L56397,L,297.7,308.8,1578,35.2,13,0,0,...,0,0,0,39.844444,171,,,0.022307,0.964054,35.2
9,9,L55067,L,300.5,312.3,1447,53.3,98,0,0,...,0,0,0,40.685714,179,,,0.036835,0.962216,53.3


In [52]:
df_s = pd.get_dummies(df_train['Type'])
df_train = df_train.drop('Type',axis=1)
df_train = pd.concat([df_train, df_s], axis=1)
y = df_train['Machine failure']
X = df_train.drop(['id', 'Product ID', 'Machine failure'],axis=1)

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2, random_state=42)

In [53]:
import xgboost as xgb
param_grid = {
    'n_estimators':[100,200,300],
    'learning_rate':[0.05, 0.1, 0.2],
    'max_depth':[3,4,5]
}
model = xgb.XGBRegressor(objective='reg:logistic')

grid_search = GridSearchCV(model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print('Best Hyperparameter: ',grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_val)

Best Hyperparameter:  {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}


In [54]:
df_s_t = pd.get_dummies(df_test['Type'])
df_test = df_test.drop('Type', axis=1)
df_test = pd.concat([df_test, df_s_t],axis=1)
new_X = df_test.drop(['id','Product ID'],axis=1)
prediction = best_model.predict(new_X)
print(prediction)

[0.00096382 0.00532002 0.00036067 ... 0.00020597 0.0072515  0.00232619]


In [55]:
sample = pd.read_csv("sample_submission.csv")
sample.head()

Unnamed: 0,id,Machine failure
0,136429,0.5
1,136430,0.5
2,136431,0.5
3,136432,0.5
4,136433,0.5


In [56]:
sample['Machine failure'] = prediction
sample.head(10)

Unnamed: 0,id,Machine failure
0,136429,0.000964
1,136430,0.00532
2,136431,0.000361
3,136432,0.000334
4,136433,0.000641
5,136434,0.000342
6,136435,0.002286
7,136436,0.001036
8,136437,0.00064
9,136438,0.000785


In [None]:
sample.to_csv('submission.csv', index=False)