In [4]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'microsoft-azure-predictive-maintenance:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F922230%2F1561528%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240623%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240623T062545Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6efa68851b6af962b91ceb33cb912208832bd38e19145757c332eac8b6a49dc4b95b4fc230b1f585fa13f00976a74b5bd75afbd0b7f617a35b148335fc3101b98bf5881b28d20cc529b5b5d7a57345f6cbe0caa7833dbd9de65e96c6f469cdf37fdd6da5aa0bc12223da3ab2bbf421db01e8b929ec9fded606d4e0ed1e32ff731c6f9c0d2c7aa90e016e8006c4f64bc0e36a16351f96676bb1f0172c1f2a91fe93b945ecff1d3dc8c635c2068d98f932ebe0d312bdad8412264ae03cd584667f39c66cc7335aabe3a63472a7fcf2cc60c773b3a9da1cffbf43ab5324a4e902ff5ae7b67aeec234dce28e51cdd3aec4c6571f48f5ca999cda80d04c164ee8d721'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')

Downloading microsoft-azure-predictive-maintenance, 32497141 bytes compressed
Downloaded and uncompressed: microsoft-azure-predictive-maintenance
Data source import complete.


In [5]:
import pandas as pd
import numpy as np

In [6]:
telemetry = pd.read_csv('../input/microsoft-azure-predictive-maintenance/PdM_telemetry.csv')
errors = pd.read_csv('../input/microsoft-azure-predictive-maintenance/PdM_errors.csv')
maint = pd.read_csv('../input/microsoft-azure-predictive-maintenance/PdM_maint.csv')
failures = pd.read_csv('../input/microsoft-azure-predictive-maintenance/PdM_failures.csv')
machines = pd.read_csv('../input/microsoft-azure-predictive-maintenance/PdM_machines.csv')

In [7]:
telemetry['datetime'] = pd.to_datetime(
    telemetry['datetime'], format="%Y-%m-%d %H:%M:%S")
telemetry.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511


# New Section

In [8]:
machines.head()

Unnamed: 0,machineID,model,age
0,1,model3,18
1,2,model4,7
2,3,model3,8
3,4,model3,7
4,5,model3,2


In [9]:

errors['datetime'] = pd.to_datetime(
    errors['datetime'], format="%Y-%m-%d %H:%M:%S")
errors.head()

Unnamed: 0,datetime,machineID,errorID
0,2015-01-03 07:00:00,1,error1
1,2015-01-03 20:00:00,1,error3
2,2015-01-04 06:00:00,1,error5
3,2015-01-10 15:00:00,1,error4
4,2015-01-22 10:00:00,1,error4


In [11]:
maint['datetime'] = pd.to_datetime(
    maint['datetime'], format="%Y-%m-%d %H:%M:%S")
maint['comp'] = maint['comp'].astype('object')
maint.head()

Unnamed: 0,datetime,machineID,comp
0,2014-06-01 06:00:00,1,comp2
1,2014-07-16 06:00:00,1,comp4
2,2014-07-31 06:00:00,1,comp3
3,2014-12-13 06:00:00,1,comp1
4,2015-01-05 06:00:00,1,comp4


In [12]:
failures['datetime'] = pd.to_datetime(
    failures['datetime'], format="%Y-%m-%d %H:%M:%S")
failures.head()

Unnamed: 0,datetime,machineID,failure
0,2015-01-05 06:00:00,1,comp4
1,2015-03-06 06:00:00,1,comp1
2,2015-04-20 06:00:00,1,comp2
3,2015-06-19 06:00:00,1,comp4
4,2015-09-02 06:00:00,1,comp4


In [13]:
temp = []
fields = ['volt', 'pressure', 'vibration']
for col in fields:
    temp.append(pd.pivot_table(telemetry,
                               index='datetime',
                               columns='machineID',
                               values=col).resample('3H', closed='left', label='right').mean().unstack())
telemetry_mean_3h = pd.concat(temp, axis=1)
telemetry_mean_3h.columns = [i + 'mean_3h' for i in fields]
telemetry_mean_3h.reset_index(inplace=True)
temp = []
for col in fields:
    temp.append(pd.pivot_table(telemetry,
                               index='datetime',
                               columns='machineID',
                               values=col).resample('3H', closed='left', label='right').std().unstack())
telemetry_sd_3h = pd.concat(temp, axis=1)
telemetry_sd_3h.columns = [i + 'sd_3h' for i in fields]
telemetry_sd_3h.reset_index(inplace=True)

telemetry_mean_3h.head()

Unnamed: 0,machineID,datetime,voltmean_3h,pressuremean_3h,vibrationmean_3h
0,1,2015-01-01 09:00:00,170.028993,94.592122,40.893502
1,1,2015-01-01 12:00:00,164.192565,105.687417,34.255891
2,1,2015-01-01 15:00:00,168.134445,107.793709,41.239405
3,1,2015-01-01 18:00:00,165.514453,101.703289,40.373739
4,1,2015-01-01 21:00:00,168.809347,90.91106,41.738542


In [14]:
telemetry_feat = pd.concat([telemetry_mean_3h,
                            telemetry_sd_3h.iloc[:, 2:6]
                            ], axis=1).dropna()
telemetry_feat.head()

Unnamed: 0,machineID,datetime,voltmean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,pressuresd_3h,vibrationsd_3h
0,1,2015-01-01 09:00:00,170.028993,94.592122,40.893502,6.721032,18.934956,5.87497
1,1,2015-01-01 12:00:00,164.192565,105.687417,34.255891,7.59657,8.555032,7.662229
2,1,2015-01-01 15:00:00,168.134445,107.793709,41.239405,10.124584,5.909721,5.169304
3,1,2015-01-01 18:00:00,165.514453,101.703289,40.373739,4.673269,4.554047,2.106108
4,1,2015-01-01 21:00:00,168.809347,90.91106,41.738542,14.752132,4.244158,2.207884


In [15]:
error_count = pd.get_dummies(errors.set_index('datetime')).reset_index()
error_count.columns = ['datetime', 'machineID',
                       'error1', 'error2', 'error3', 'error4', 'error5']
error_count.head(10)

Unnamed: 0,datetime,machineID,error1,error2,error3,error4,error5
0,2015-01-03 07:00:00,1,True,False,False,False,False
1,2015-01-03 20:00:00,1,False,False,True,False,False
2,2015-01-04 06:00:00,1,False,False,False,False,True
3,2015-01-10 15:00:00,1,False,False,False,True,False
4,2015-01-22 10:00:00,1,False,False,False,True,False
5,2015-01-25 15:00:00,1,False,False,False,True,False
6,2015-01-27 04:00:00,1,True,False,False,False,False
7,2015-03-03 22:00:00,1,False,True,False,False,False
8,2015-03-05 06:00:00,1,True,False,False,False,False
9,2015-03-20 18:00:00,1,True,False,False,False,False


In [16]:
error_count = telemetry[['datetime', 'machineID']].merge(
    error_count, on=['machineID', 'datetime'], how='left').fillna(0.0)
error_count.head()

Unnamed: 0,datetime,machineID,error1,error2,error3,error4,error5
0,2015-01-01 06:00:00,1,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 07:00:00,1,0.0,0.0,0.0,0.0,0.0
2,2015-01-01 08:00:00,1,0.0,0.0,0.0,0.0,0.0
3,2015-01-01 09:00:00,1,0.0,0.0,0.0,0.0,0.0
4,2015-01-01 10:00:00,1,0.0,0.0,0.0,0.0,0.0


In [18]:
temp = []
fields = ['error%d' % i for i in range(1, 6)]
for col in fields:
    temp.append(pd.pivot_table(error_count,index='datetime',columns='machineID',values=col).resample('3H',
                                                                               closed='left',
                                                                               label='right',
                                                                               ).first().unstack().rolling(window=24, center=False).sum())
error_count = pd.concat(temp, axis=1)
error_count.columns = [i + 'count' for i in fields]
error_count.reset_index(inplace=True)
error_count = error_count.dropna()
error_count.describe()

Unnamed: 0,machineID,datetime,error1count,error2count,error3count,error4count,error5count
count,292077.0,292077,292077.0,292077.0,292077.0,292077.0,292077.0
mean,50.503898,2015-07-02 21:20:32.317505280,0.037483,0.029266,0.025117,0.025541,0.018666
min,1.0,2015-01-01 09:00:00,0.0,0.0,0.0,0.0,0.0
25%,26.0,2015-04-02 15:00:00,0.0,0.0,0.0,0.0,0.0
50%,51.0,2015-07-02 21:00:00,0.0,0.0,0.0,0.0,0.0
75%,76.0,2015-10-02 03:00:00,0.0,0.0,0.0,0.0,0.0
max,100.0,2016-01-01 09:00:00,2.0,2.0,1.5,3.0,2.0
std,28.863914,,0.192195,0.154875,0.140627,0.158549,0.134294


In [19]:
error_count.head()

Unnamed: 0,machineID,datetime,error1count,error2count,error3count,error4count,error5count
23,1,2015-01-04 06:00:00,0.0,0.0,0.0,0.0,0.0
24,1,2015-01-04 09:00:00,0.0,0.0,0.0,0.0,1.0
25,1,2015-01-04 12:00:00,0.0,0.0,0.0,0.0,1.0
26,1,2015-01-04 15:00:00,0.0,0.0,0.0,0.0,1.0
27,1,2015-01-04 18:00:00,0.0,0.0,0.0,0.0,1.0


In [22]:
comp_rep = pd.get_dummies(maint.set_index('datetime')).reset_index()
comp_rep.columns = ['datetime', 'machineID',
                    'comp1', 'comp2', 'comp3', 'comp4']
comp_rep = telemetry[['datetime', 'machineID']].merge(comp_rep,on=['datetime','machineID'],
                                                      how='outer').fillna(0).sort_values(by=['machineID', 'datetime'])

In [23]:
components = ['comp1', 'comp2', 'comp3', 'comp4']
for comp in components:
    comp_rep.loc[comp_rep[comp] < 1, comp] = None
    comp_rep.loc[-comp_rep[comp].isnull(),
                 comp] = comp_rep.loc[-comp_rep[comp].isnull(), 'datetime']
    comp_rep[comp] = comp_rep[comp].fillna(method='ffill')

comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')]

In [24]:
for comp in components:
    comp_rep[comp] = (comp_rep["datetime"] - pd.to_datetime(comp_rep[comp])) / np.timedelta64(1, "D")
comp_rep.head()

Unnamed: 0,datetime,machineID,comp1,comp2,comp3,comp4
0,2015-01-01 06:00:00,1,19.0,214.0,154.0,169.0
1,2015-01-01 07:00:00,1,19.041667,214.041667,154.041667,169.041667
2,2015-01-01 08:00:00,1,19.083333,214.083333,154.083333,169.083333
3,2015-01-01 09:00:00,1,19.125,214.125,154.125,169.125
4,2015-01-01 10:00:00,1,19.166667,214.166667,154.166667,169.166667


In [29]:
final_feat = telemetry_feat.merge(
    error_count, on=['datetime', 'machineID'],how='left')
final_feat = final_feat.merge(
    comp_rep, on=['datetime', 'machineID'],how='left')
final_feat = final_feat.merge(machines, on=['machineID'],how='left')
final_feat=final_feat.dropna()
final_feat.head()

Unnamed: 0,machineID,datetime,voltmean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,pressuresd_3h,vibrationsd_3h,error1count,error2count,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age
23,1,2015-01-04 06:00:00,186.092896,107.989359,55.308074,13.48909,5.118176,4.904365,0.0,0.0,0.0,0.0,0.0,22.0,217.0,157.0,172.0,model3,18
24,1,2015-01-04 09:00:00,166.281848,106.187582,51.99008,24.276228,11.176731,3.394073,0.0,0.0,0.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18
25,1,2015-01-04 12:00:00,175.412103,100.887363,54.251534,34.918687,10.580336,2.921501,0.0,0.0,0.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18
26,1,2015-01-04 15:00:00,157.347716,101.28938,48.602686,24.617739,9.966729,2.356486,0.0,0.0,0.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18
27,1,2015-01-04 18:00:00,176.45055,84.521555,47.638836,8.0714,2.636879,4.108621,0.0,0.0,0.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18


In [30]:
labeled_features = final_feat.merge(
    failures, on=['datetime', 'machineID'],how='left')
labeled_features = labeled_features.fillna(
    method='bfill', limit=7)
labeled_features = labeled_features.fillna('none')
labeled_features.head()

Unnamed: 0,machineID,datetime,voltmean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,pressuresd_3h,vibrationsd_3h,error1count,error2count,error3count,error4count,error5count,comp1,comp2,comp3,comp4,model,age,failure
0,1,2015-01-04 06:00:00,186.092896,107.989359,55.308074,13.48909,5.118176,4.904365,0.0,0.0,0.0,0.0,0.0,22.0,217.0,157.0,172.0,model3,18,none
1,1,2015-01-04 09:00:00,166.281848,106.187582,51.99008,24.276228,11.176731,3.394073,0.0,0.0,0.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18,comp4
2,1,2015-01-04 12:00:00,175.412103,100.887363,54.251534,34.918687,10.580336,2.921501,0.0,0.0,0.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18,comp4
3,1,2015-01-04 15:00:00,157.347716,101.28938,48.602686,24.617739,9.966729,2.356486,0.0,0.0,0.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18,comp4
4,1,2015-01-04 18:00:00,176.45055,84.521555,47.638836,8.0714,2.636879,4.108621,0.0,0.0,0.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18,comp4


In [31]:
labeled_features.loc[labeled_features['failure'] == 'comp4'][:16]
labeled_features['failure'].value_counts()

failure
none     286633
comp2      2136
comp1      1635
comp4      1340
comp3      1037
Name: count, dtype: int64

In [32]:
threshold_dates = [[pd.to_datetime('2015-07-31 01:00:00'), pd.to_datetime('2015-08-01 01:00:00')],
                   [pd.to_datetime('2015-08-31 01:00:00'), pd.to_datetime('2015-09-01 01:00:00')],
                   [pd.to_datetime('2015-09-30 01:00:00'), pd.to_datetime('2015-10-01 01:00:00')]]

last_train_date, first_test_date = threshold_dates[0]

y_train = labeled_features.loc[labeled_features['datetime'] < last_train_date, 'failure']

In [33]:
X_train = labeled_features.loc[labeled_features['datetime'] < last_train_date].drop(['datetime','failure'], axis=1)

In [34]:
X_train=pd.get_dummies(X_train)
X_train.head()

Unnamed: 0,machineID,voltmean_3h,pressuremean_3h,vibrationmean_3h,voltsd_3h,pressuresd_3h,vibrationsd_3h,error1count,error2count,error3count,...,error5count,comp1,comp2,comp3,comp4,age,model_model1,model_model2,model_model3,model_model4
0,1,186.092896,107.989359,55.308074,13.48909,5.118176,4.904365,0.0,0.0,0.0,...,0.0,22.0,217.0,157.0,172.0,18,False,False,True,False
1,1,166.281848,106.187582,51.99008,24.276228,11.176731,3.394073,0.0,0.0,0.0,...,1.0,22.125,217.125,157.125,172.125,18,False,False,True,False
2,1,175.412103,100.887363,54.251534,34.918687,10.580336,2.921501,0.0,0.0,0.0,...,1.0,22.25,217.25,157.25,172.25,18,False,False,True,False
3,1,157.347716,101.28938,48.602686,24.617739,9.966729,2.356486,0.0,0.0,0.0,...,1.0,22.375,217.375,157.375,172.375,18,False,False,True,False
4,1,176.45055,84.521555,47.638836,8.0714,2.636879,4.108621,0.0,0.0,0.0,...,1.0,22.5,217.5,157.5,172.5,18,False,False,True,False


In [36]:
y_train.unique()

array(['none', 'comp4', 'comp1', 'comp2', 'comp3'], dtype=object)

In [38]:
dict_map = {'none': 0,
            'comp1': 1,
            'comp2': 2,
            'comp3': 3,
            'comp4': 4,
           }

y_train = y_train.replace(dict_map)
y_train.value_counts()

failure
0    165353
2      1199
1      1063
4       796
3       627
Name: count, dtype: int64

In [40]:
y_test = labeled_features.loc[labeled_features['datetime'] >= first_test_date, 'failure']
X_test = pd.get_dummies(labeled_features.loc[labeled_features['datetime'] >= first_test_date].drop(['datetime','failure'], axis=1))

In [41]:
y_test = y_test.replace(dict_map)
y_test.value_counts()

failure
0    120485
2       931
1       572
4       541
3       410
Name: count, dtype: int64