# Import modules

In [1]:
import pandas as pd
from itertools import combinations

# Load data

In [2]:
data = pd.read_csv('../../data/ai4i2020.csv', index_col='UDI')

# Analysis

In [3]:
data.describe()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0
max,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
data['Machine failure'].value_counts()

0    9661
1     339
Name: Machine failure, dtype: int64

In [4]:
# multiple failures
fails = data['TWF'] + data['HDF'] + data['PWF'] + data['OSF'] + data['RNF']
fails.value_counts()

0    9652
1     324
2      23
3       1
dtype: int64

In [5]:
# data.iloc[:, -5:].sum()

In [6]:
# data.iloc[:, -5:].sum().sum()

In [7]:
# possible failure modes
failures = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
combis2 = combinations(failures, 2)
combis3 = combinations(failures, 3)

In [8]:
total = 0
for fm in failures:
    cur = [i for i in failures if (i != fm)]
    idx = data[data[fm] == 1]
    total += idx[idx[cur] == 0].shape[0]
    print(f'{fm} - {idx[idx[cur] == 0].shape[0]}')
print(f'total - {total}')

TWF - 46
HDF - 115
PWF - 95
OSF - 98
RNF - 19
total - 373


In [9]:
print('combination - # of combined failures - sum of MF column')
total = 0
for c2 in combis2:
    out = data.loc[(data[c2[0]] == 1) & (data[c2[1]] == 1)].copy()
    out_s = out.shape[0]
    total += out_s
    print(f'{c2} - {out_s} - {out["Machine failure"].sum()}')
print(f'total - {total}')

combination - # of combined failures - sum of MF column
('TWF', 'HDF') - 0 - 0
('TWF', 'PWF') - 1 - 1
('TWF', 'OSF') - 3 - 3
('TWF', 'RNF') - 1 - 1
('HDF', 'PWF') - 3 - 3
('HDF', 'OSF') - 6 - 6
('HDF', 'RNF') - 0 - 0
('PWF', 'OSF') - 12 - 12
('PWF', 'RNF') - 0 - 0
('OSF', 'RNF') - 0 - 0
total - 26


In [10]:
total = 0
for c3 in combis3:
    out = data.loc[(data[c3[0]] == 1) & (data[c3[1]] == 1) & (data[c3[2]] == 1)].copy()
    out_s = out.shape[0]
    total += out_s
    print(f'{c3} - {out_s} - {out["Machine failure"].sum()}')
print(f'total - {total}')

('TWF', 'HDF', 'PWF') - 0 - 0
('TWF', 'HDF', 'OSF') - 0 - 0
('TWF', 'HDF', 'RNF') - 0 - 0
('TWF', 'PWF', 'OSF') - 1 - 1
('TWF', 'PWF', 'RNF') - 0 - 0
('TWF', 'OSF', 'RNF') - 0 - 0
('HDF', 'PWF', 'OSF') - 0 - 0
('HDF', 'PWF', 'RNF') - 0 - 0
('HDF', 'OSF', 'RNF') - 0 - 0
('PWF', 'OSF', 'RNF') - 0 - 0
total - 1


# Cleaning

In [11]:
def clean_data(data, drop_type=True, encode_mf=False):
    ''' clean the data '''

    if drop_type:
        data.drop(columns='Type', inplace=True)

    if encode_mf:
        # needs improvement because of multiple failure modes
        # from itertools import combinations, permutations
        # failures = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
        # list(permutations(failures, 2))
        data['Machine failure'] = 0
        data['Machine failure'][data['TWF'] == 1] = 1
        data['Machine failure'][data['HDF'] == 1] = 2
        data['Machine failure'][data['PWF'] == 1] = 3
        data['Machine failure'][data['OSF'] == 1] = 4
        data['Machine failure'][data['RNF'] == 1] = 5

    data.drop(columns=['Product ID', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'], inplace=True)

    return data