In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Input, \
                                        BatchNormalization, Embedding, Masking,\
                                        Bidirectional, Conv1D, MaxPooling1D, Flatten, concatenate, MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import Model

from sklearn.utils import class_weight, shuffle
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.inspection import permutation_importance

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC
from xgboost import XGBRegressor, XGBClassifier
import lightgbm

import scipy.stats as stats
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings("ignore")
BATCH_SIZE = 32
EPOCHS = 300

from datetime import datetime, timedelta, date
import pandas as pd
import numpy as np
import cx_Oracle
import pandas as pd
from sqlalchemy import create_engine
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib
import shap

In [2]:
def generate_time(start_date:str, end_date:str, hour:int):
    """Build a table of evenly spaced timestamps between two dates.

    Parameters
    ----------
    start_date, end_date : str
        Bounds in ``%d/%m/%Y`` format; both are parsed as midnight of that day.
    hour : int
        Step between consecutive timestamps, in hours. Must be positive.

    Returns
    -------
    pandas.DataFrame
        One ``TIMESTAMP`` column starting at ``start_date`` and advancing by
        ``hour`` hours while the value is <= ``end_date``. Empty when
        ``start_date`` is after ``end_date``.

    Raises
    ------
    ValueError
        If ``hour`` is not positive (the stepping loop would never terminate).
    """
    if hour <= 0:
        raise ValueError(f"hour must be a positive number of hours, got {hour!r}")

    current = datetime.strptime(start_date, '%d/%m/%Y')
    end = datetime.strptime(end_date, '%d/%m/%Y')
    step = timedelta(hours=hour)

    dates = []
    while current <= end:
        dates.append([current])
        current += step

    return pd.DataFrame(dates, columns=['TIMESTAMP'])
    
def query_status(eq_id):
    """Fetch the equipment state history of one machine from the OME database.

    Parameters
    ----------
    eq_id : str
        Equipment identifier matched against ``eq_id`` in the query.

    Returns
    -------
    pandas.DataFrame
        Columns EQ_ID, TIMESTAMP_START, TIMESTAMP_END, DURATION, STATE_NAME,
        LEVEL3_NAME, LEVEL3 (column names upper-cased), ordered by
        TIMESTAMP_START and restricted to ``timestamp_start > sysdate - 1050``.

    Raises
    ------
    Any exception raised while creating the engine or running the query is
    propagated (previously an engine failure was printed and then surfaced as
    an unrelated NameError).
    """
    import os
    from sqlalchemy import text

    # NOTE(review): credentials are hard-coded in the notebook. They are kept as
    # fallbacks so existing runs still work, but should be supplied through the
    # environment variables below and removed from source.
    oracle_string = "oracle+cx_oracle://{username}:{password}@{hostname}:{port}/{database}"
    engine = create_engine(
        oracle_string.format(
            username = os.environ.get('OME_DB_USERNAME', 'TFM4CEBERUS'),
            password = os.environ.get('OME_DB_PASSWORD', 'TFM4CEBERUS'),
            hostname = os.environ.get('OME_DB_HOSTNAME', 'ome-db.bth.infineon.com'),
            port = os.environ.get('OME_DB_PORT', '1538'),
            database = os.environ.get('OME_DB_DATABASE', 'ome')
            )
        )

    # eq_id is passed as a bind parameter instead of being formatted into the SQL text.
    query = """select EQ_ID, TIMESTAMP_START, TIMESTAMP_END, DURATION, STATE_NAME, LEVEL3_NAME, LEVEL3 
            from (SELECT
              eq.eq_id, eq.name, eq.eq_type_ident
            , data.timestamp_start,data.timestamp_end
            , ROUND((data.timestamp_end - data.timestamp_start)*24*60*60,0) AS Duration
            , data.tr25_3_status,data.tr25_4_status,data.tr25_5_status,data.eq_status
            , level5s.state_name
            , level5.state_name Level5_Name, level5.state_sign Level5
            , level4.state_name Level4_Name, level4.state_sign Level4
            , level3.state_name Level3_Name, level3.state_sign Level3
            ,mh.device
            ,mh.package,
            mh.lotid as lot,
            mh.product,
            mh.operation

            FROM OMEDATA.EQUIPMENT_STATE_HISTORY data
            , OMEADMIN.EQUIPMENT_INSTANCES eq
            , V_EQ_STATES level5s
            , OMEADMIN.DEF_STANDARD_STATEMODEL level5
            , OMEADMIN.DEF_STANDARD_STATEMODEL level4
            , OMEADMIN.DEF_STANDARD_STATEMODEL level3
            , OMEDATA.METAKEY_HISTORY mh

            WHERE data.eq_ident  = eq.eq_ident
            AND  data.eq_status = level5s.state_ident(+)
            AND level5.state_ident = data.tr25_5_status
            AND level4.state_ident = data.tr25_4_status
            AND level3.state_ident = data.tr25_3_status
            AND  data.metakey_ident =mh.ident(+)
            and data.timestamp_start > sysdate - 1050)
            where eq_id = :eq_id
            ORDER BY TIMESTAMP_START"""

    try:
        status = pd.read_sql(text(query), engine, params={"eq_id": eq_id})
    finally:
        # Release pooled connections; a new engine is created on every call.
        engine.dispose()

    # Downstream cells address columns by upper-case name.
    status.columns = [str(col).upper() for col in status.columns]

    return status

def aggregate(timeframe_table, lookback_window, status_table):
    """Sum DURATION per STATE_NAME inside a look-back window before each timestamp.

    Parameters
    ----------
    timeframe_table : pandas.DataFrame
        Must contain a ``TIMESTAMP`` column; one output row is produced per row.
    lookback_window : int or float
        Window length in hours; the window is ``[TIMESTAMP - lookback_window, TIMESTAMP]``
        (both ends inclusive), matched against ``TIMESTAMP_START``.
    status_table : pandas.DataFrame
        Must contain ``TIMESTAMP_START``, ``STATE_NAME`` and ``DURATION``.

    Returns
    -------
    pandas.DataFrame
        Integer frame with one column per unique ``STATE_NAME`` of ``status_table``
        (in order of first appearance); states absent from a window are 0.
    """
    state_names = list(status_table["STATE_NAME"].unique())
    window = timedelta(hours=lookback_window)

    # Collect plain dict records and build the frame once: DataFrame.append in a
    # loop copies the whole frame on every iteration and no longer exists in pandas 2.
    records = []
    for end in timeframe_table["TIMESTAMP"]:
        start = end - window

        ## total duration of each statename, include everything since feature engineering would be performed
        in_window = status_table.loc[(status_table["TIMESTAMP_START"] >= start) &
                                     (status_table["TIMESTAMP_START"] <= end)]

        record = dict.fromkeys(state_names, 0)
        for state in in_window["STATE_NAME"].unique():
            record[state] = int(in_window.loc[in_window["STATE_NAME"] == state, "DURATION"].sum())
        records.append(record)

    return pd.DataFrame(records, columns=state_names).astype('int')


def status_sequence(input_table, status_table, hour, scaled=False):
    """Collect, per timestamp, the state names and durations seen in the preceding window.

    For every ``TIMESTAMP`` in ``input_table`` the rows of ``status_table`` whose
    ``TIMESTAMP_START`` lies in ``[TIMESTAMP - hour hours, TIMESTAMP]`` are selected.

    Returns a pair of lists (one entry per input row): the ``STATE_NAME`` values
    and the ``DURATION`` values (``SCALED_DURATION`` when ``scaled`` is truthy).

    Raises ``Exception`` when the first status row starts after the first
    timestamp, or the last status row starts at or before the last timestamp.
    """
    subset_error = "Timeframe table must be a subset of the status table"

    # validation check: the status history has to bracket the requested timestamps
    starts_too_late = status_table.iloc[0]["TIMESTAMP_START"] > input_table.iloc[0]["TIMESTAMP"]
    if starts_too_late:
        raise Exception(subset_error)
    ends_too_early = status_table.iloc[-1]["TIMESTAMP_START"] <= input_table.iloc[-1]["TIMESTAMP"]
    if ends_too_early:
        raise Exception(subset_error)

    value_column = "SCALED_DURATION" if scaled else "DURATION"
    lookback = timedelta(hours=hour)

    status_seq, duration_seq = [], []
    for _, frame_row in input_table.iterrows():
        window_end = frame_row["TIMESTAMP"]
        window_start = window_end - lookback

        in_window = status_table[status_table["TIMESTAMP_START"].between(window_start, window_end)]
        status_seq.append(in_window["STATE_NAME"].values)
        duration_seq.append(in_window[value_column].values)

    return status_seq, duration_seq


def major_down(input_df, status_table, hour, threshold): 
        hour = pd.Timedelta(hours=hour)
        major_down = []

        for idx, row in input_df.iterrows():
            start = row['TIMESTAMP']
            end = start+hour
            frame = status_table[(status_table['TIMESTAMP_START']>start) & (status_table['TIMESTAMP_START']<end)]
            UD = frame.loc[frame['LEVEL3']=='UDT']
            
            # disregard "waiting" in statename

            if len(UD) == 0: #no record within this 6 hours:
                major_down.append(0)
            else:
                time_diff = (UD['TIMESTAMP_END']-UD['TIMESTAMP_START']).dt.seconds
                if any(time_diff>=threshold): #threshold = 3600s
                    major_down.append(1)
                else:
                    major_down.append(0)
        return np.array(major_down)

def query_CAMSTAR(eq_id):
    """Fetch the track-in / track-out history of one machine from the ODS database.

    Parameters
    ----------
    eq_id : str
        Equipment name matched against ``A_WIPEQUIPMENTHISTORY.equipmentname``.

    Returns
    -------
    pandas.DataFrame
        EQUIPMENTNAME (aliased EQ_ID), TRACKINTIMESTAMP and TRACKOUTTIMESTAMP,
        ordered by TRACKINTIMESTAMP. Column names are returned as delivered by
        the driver (no upper-casing is applied here).

    Raises
    ------
    Any exception raised while creating the engine or running the query is
    propagated (previously an engine failure was printed and then surfaced as
    an unrelated NameError).
    """
    import os
    from sqlalchemy import text

    # NOTE(review): credentials are hard-coded in the notebook. They are kept as
    # fallbacks so existing runs still work, but should be supplied through the
    # environment variables below and removed from source.
    oracle_string = "oracle+cx_oracle://{username}:{password}@{hostname}:{port}/{database}"
    engine = create_engine(
        oracle_string.format(
            username = os.environ.get('ODS_DB_USERNAME', 'bth_odsprod'),
            password = os.environ.get('ODS_DB_PASSWORD', 'bth_odsprodbth'),
            hostname = os.environ.get('ODS_DB_HOSTNAME', 'odsprod-db.bth.infineon.com'),
            port = os.environ.get('ODS_DB_PORT', '1523'),
            database = os.environ.get('ODS_DB_DATABASE', 'odsprod')
            )
        )

    # eq_id is passed as a bind parameter instead of being formatted into the SQL text.
    query = """select EQUIPMENTNAME AS EQ_ID, TRACKINTIMESTAMP, TRACKOUTTIMESTAMP from A_WIPEQUIPMENTHISTORY t
                where t.equipmentname = :eq_id
                ORDER BY TRACKINTIMESTAMP"""

    try:
        status = pd.read_sql(text(query), engine, params={"eq_id": eq_id})
    finally:
        # Release pooled connections; a new engine is created on every call.
        engine.dispose()

    return status

def label_encode(statename_seq): # do this the manual way as we are not certain if sklearn LabelEncoder can handle 3D array
    """Map every state name in a list of sequences to a positive integer code.

    Parameters
    ----------
    statename_seq : iterable of iterables
        One sequence of (hashable) state names per timestamp.

    Returns
    -------
    enc_array : numpy.ndarray
        The encoded sequences. A regular 2-D array when all sequences have the
        same length, otherwise a 1-D object array holding one array per sequence.
    vocab_size : int
        Number of distinct state names plus one (code 0 is left free for padding).
    mapping_dict : dict
        State name -> integer code, starting at 1.
    """
    unique_statenames = set()
    for sequence in statename_seq:
        unique_statenames.update(sequence)

    # Codes are assigned in sorted order so the mapping is the same on every run;
    # plain set iteration order changes between interpreter sessions for strings.
    # None (missing state name) is sorted last so it can coexist with strings.
    ordered_names = sorted(unique_statenames, key=lambda name: (name is None, str(name)))

    # start encoding from 1 as we have to pad the sequence with 0
    mapping_dict = {name: code for code, name in enumerate(ordered_names, start=1)}

    encoded = [np.array([mapping_dict[name] for name in sequence]) for sequence in statename_seq]

    if len({len(seq) for seq in encoded}) <= 1:
        enc_array = np.array(encoded)
    else:
        # Ragged input: np.array(list_of_arrays) raises on current NumPy, so the
        # object array is filled explicitly.
        enc_array = np.empty(len(encoded), dtype=object)
        for position, seq in enumerate(encoded):
            enc_array[position] = seq

    return enc_array, len(unique_statenames)+1, mapping_dict

In [None]:
# EDA for WBA124: load the status history of this machine via query_status()
wba124_status = query_status("WBA124")

In [None]:
##### Baseline model on uncleaned raw data #####
# Label every raw status row: 1 when a UDT record lasting at least `threshold`
# seconds starts within the next `hour` hours, otherwise 0.

wba124_initial = wba124_status.copy()
hour = 24
threshold = 3600  # seconds; defined here so the cell also runs on a fresh kernel
beginning_major_down = []

for idx, row in wba124_initial.iterrows(): 
    start = row['TIMESTAMP_START']
    end = start+timedelta(hours=hour)
    frame = wba124_initial[(wba124_initial['TIMESTAMP_START']>start) & \
                                 (wba124_initial['TIMESTAMP_START']<end)]
    UD = frame.loc[frame['LEVEL3']=='UDT']

    if len(UD) == 0: # no UDT record inside the look-ahead window
        beginning_major_down.append(0)
    else:
        # total_seconds() keeps the day component; `.dt.seconds` would wrap at 24 h
        time_diff = (UD['TIMESTAMP_END']-UD['TIMESTAMP_START']).dt.total_seconds()
        if any(time_diff>=threshold): #threshold = 3600s
            beginning_major_down.append(1)
        else:
            beginning_major_down.append(0)

In [None]:
# Attach the per-row look-ahead labels computed above as the target column.
wba124_initial["24 HOUR DOWN"] = beginning_major_down

In [None]:
##### Directly train on the raw data to test performance #####
# Columns dropped before modelling (timestamps, identifiers and the text state columns).
not_feature = ['TIMESTAMP_START', 'TIMESTAMP_END', 'EQ_ID', 'LEVEL3_NAME', 'LEVEL3', 'STATE_NAME']

# Integer-encode STATE_NAME so it can be fed to the model as a numeric column.
lb = LabelEncoder()
wba124_initial['ENC STATE NAME'] = lb.fit_transform(wba124_initial['STATE_NAME'])

tmp_table = wba124_initial.drop(not_feature, axis=1)

# Features: DURATION and the encoded state name; target: the "24 HOUR DOWN" label.
df = tmp_table[['DURATION', 'ENC STATE NAME']]
target = tmp_table['24 HOUR DOWN'].values

In [None]:
# Ordered (unshuffled) split by row position: first 70% train, next 10% validation, last 20% test.
train_idx = int(0.7*len(df))
val_idx = int(0.8*(len(df)))

X_train, y_train = df[:train_idx], target[:train_idx]
X_val, y_val = df[train_idx:val_idx], target[train_idx:val_idx]
X_test, y_test = df[val_idx:], target[val_idx:]

# Gradient-boosted binary classifier with balanced class weights.
clf = lightgbm.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    random_state=42,
    class_weight='balanced',
    reg_lambda=0.01
            )

# NOTE(review): 'f1' does not look like one of LightGBM's built-in metric names
# (the later cells use 'auc') - confirm that this metric is actually evaluated.
clf.fit(X_train, y_train,
       eval_set=(X_val, y_val),
       eval_metric='f1',
        verbose=True)

pred = clf.predict(X_test)

# Cell output: confusion matrix and accuracy on the held-out test slice.
confusion_matrix(y_test, pred), accuracy_score(y_test, pred)

In [None]:
##### Model after Feature Engineering #####

In [None]:
# Ten status records with the largest DURATION, and their state names.
wba124_duration_sort = wba124_status.sort_values('DURATION', ascending=False).head(10)
wba124_sorted_duration_statename_val = wba124_duration_sort["STATE_NAME"].values
wba124_duration_sort

In [None]:
# For each of the longest records, count how many other records start while it is still open.
# The rows are read from wba124_duration_sort itself: feeding its index labels to
# wba124_status.iloc[...] is only correct while the index is an untouched 0..n-1 range.
for idx, row in wba124_duration_sort.iterrows():
    start = row["TIMESTAMP_START"]
    end = row["TIMESTAMP_END"]
    filtered = wba124_status.loc[(wba124_status.TIMESTAMP_START>=start) & 
                                (wba124_status.TIMESTAMP_START<end)]
    # -1 because the record itself satisfies TIMESTAMP_START >= start
    print(f'In between timestamp of row {idx}, there are {len(filtered)-1} transactions')
    
# Open question: should a status that has other transactions inside its time span be treated as invalid?

In [None]:
# Observation: records ending on 2021-04-22 have the longest durations for WBA127 and WBA124
# (and likely for other machines as well) - it looks as if all open statuses were closed on that day.
# These data points are candidates for removal.

# Show the ten longest records whose TIMESTAMP_END falls on 2021-04-22 (up to and including the next midnight).
apr = datetime(2021,4,22)
end = apr + timedelta(days=1)
wba124_status[(wba124_status["TIMESTAMP_END"]>=apr)&
               (wba124_status["TIMESTAMP_END"]<=end)].sort_values('DURATION', ascending=False).head(10)

In [None]:
# Check for records with zero DURATION and show how they are distributed over LEVEL3.
wba124_duration_sort_asc = wba124_status.sort_values('DURATION')
zero_seconds = wba124_status.loc[wba124_status.DURATION==0.0]
print(f'There are {len(zero_seconds)} rows with 0.0 seconds as DURATION')
print(f'Distribution of LEVEL3:\n{zero_seconds["LEVEL3"].value_counts()}')
# Cell output: the ten shortest records.
wba124_duration_sort_asc.head(10)

In [None]:
# Remove rows with negative DURATION (TIMESTAMP_END earlier than TIMESTAMP_START).
# Rows are dropped by the computed mask instead of a hard-coded index label: the
# query window moves with sysdate, so a fixed label points at a different row
# (or at nothing) on the next refresh. Re-running the cell is also harmless.
check_duration = wba124_status.loc[wba124_status.DURATION<0.0]
print(len(check_duration))
wba124_status = wba124_status.drop(index=check_duration.index)
check_duration1 = wba124_status.loc[wba124_status.DURATION<0.0]
# Cell output: True when no negative durations remain.
len(check_duration1) == 0

In [None]:
# The following cells address rows by position (iloc[idx+1]), so the index must be a gap-free 0..n-1 range.
wba124_status = wba124_status.reset_index(drop=True) # always reset index after you drop

In [None]:
# Status records are apparently not always contiguous: other transactions can start
# inside the time span of a row.
# Find rows whose TIMESTAMP_END is later than the TIMESTAMP_START of the next row,
# to check whether the records with statuses in the middle were recorded wrongly.

# Also find rows where one state was split into two consecutive records
# (same STATE_NAME, end of this row == start of the next row).

idx_row = []          # rows that overlap the next row
same_state_idx = []   # rows that continue seamlessly into the next row with the same state
for idx, row in wba124_status.iterrows():
    # the last row has no successor to compare with
    if idx == len(wba124_status)-1:
        break
    end = row["TIMESTAMP_END"]
    state1 = row["STATE_NAME"]
    # idx is used as a position here, which relies on the index having been reset
    next_row = wba124_status.iloc[idx+1]
    next_start = next_row["TIMESTAMP_START"]
    state2 = next_row["STATE_NAME"]
    
    # timestamp end of this row later than timestamp_start of next row
    if end > next_start:
        idx_row.append(idx)
        
    # same state name but broken into 2 records
    if end == next_start and state1 == state2:
        same_state_idx.append(idx)
    
print(f'Rows with overlapping transaction {idx_row}')
print(f'There are {len(same_state_idx)} rows with the same state name as the next row with the same timestamp')

In [None]:
# Spot check: display one row and its successor to confirm the overlap indices reported above.
# NOTE(review): 6058 is a hard-coded position taken from one particular data pull - re-pick after refreshing the data.
idx = 6058
wba124_status.iloc[idx:idx+2]

In [None]:
# clean up status data 
# (do not remove special level3 as sometimes engineers would run dummy lot and CAMSTAR will not have record but TFM has)

# 1. Merge each row with its successor when timestamp_end == next row's timestamp_start and the
#    state name is the same. Iterate from the back so that chains of several consecutive
#    split rows collapse correctly and positions below idx+1 are never shifted by a drop.
wba124_clean_status = wba124_status.copy()
for idx in reversed(same_state_idx):
    # combined duration of the row and its successor
    duration = sum(wba124_clean_status.iloc[idx:idx+2]["DURATION"])
    wba124_clean_status.at[idx, 'DURATION'] = duration
    wba124_clean_status.at[idx, 'TIMESTAMP_END'] = wba124_clean_status.iloc[idx+1]["TIMESTAMP_END"]
    wba124_clean_status.drop(index=idx+1, axis=0, inplace=True)

# Validate that all rows with timestamp_end == next timestamp_start and the same state name
# have been combined, and recompute the overlapping rows on the merged table.
idx_rows = []
same_state = []
wba124_clean_status = wba124_clean_status.reset_index(drop=True)
for idx, row in wba124_clean_status.iterrows():
    # the last row has no successor to compare with
    if idx == len(wba124_clean_status)-1:
        break
    end = row["TIMESTAMP_END"]
    state1 = row["STATE_NAME"]
    next_row = wba124_clean_status.iloc[idx+1]
    next_start = next_row["TIMESTAMP_START"]
    state2 = next_row["STATE_NAME"]
    
    if end > next_start:
        idx_rows.append(idx)
    if end == next_start and state1 == state2:
        same_state.append(idx)
    
print(f'Rows with overlapping transaction {idx_rows}')
print(f'There are {len(same_state)} rows with the same state name as the next row with the same timestamp')

In [None]:
# 2. Drop every record whose DURATION is exactly zero and report how many were removed
len1 = len(wba124_clean_status)
wba124_clean_status = wba124_clean_status.loc[wba124_clean_status["DURATION"] != 0.0]
len2 = len(wba124_clean_status)
print(f'Removed {len1-len2} rows of data with duration as 0.0')

In [None]:
# 3. Drop the rows collected in idx_rows (time span overlaps the next record),
#    caused either by an overly long duration or by a wrong record
len3 = len(wba124_clean_status)
wba124_clean_status = wba124_clean_status.drop(labels=idx_rows, axis=0)
len4 = len(wba124_clean_status)
print(f'Removed {len3-len4} rows of data with long duration (overlapping timestamp start and end)')

In [None]:
# 4. Check again and make sure no overlapping timestamp (all timestamp_start must be >= last timestamp_end)
idx_rows_check = []     # rows still overlapping their successor after cleaning
same_state_check = []   # rows still split into two same-state records after cleaning
# positions are used as labels below, so the index is reset first
wba124_clean_status = wba124_clean_status.reset_index(drop=True)
for idx, row in wba124_clean_status.iterrows():
    
    # the last row has no successor to compare with
    if idx == len(wba124_clean_status)-1:
        break
    end = row["TIMESTAMP_END"]
    state1 = row["STATE_NAME"]
    next_row = wba124_clean_status.iloc[idx+1]
    next_start = next_row["TIMESTAMP_START"]
    state2 = next_row["STATE_NAME"]
    
    if end > next_start:
        idx_rows_check.append(idx)
    if end == next_start and state1 == state2:
        same_state_check.append(idx)
    
print(f'Rows with overlapping transaction {idx_rows_check}')
print(f'There are {len(same_state_check)} rows with the same state name as the next row with the same timestamp')

In [None]:
# Summary of the cleaning steps: rows removed relative to the status table before merging.
print(f'Removed a total of {len(wba124_status)-len(wba124_clean_status)} rows from {len(wba124_status)}.')

wba124_clean_status = wba124_clean_status.sort_values('TIMESTAMP_START', ascending=True) # rearrange just in case

In [None]:
# Compute the major-down label on a 24-hour grid of timestamps; the relationship to the
# state names is examined with histograms further below (split into class 0 and class 1).
wba124_timeframe_table = generate_time('5/12/2018', '31/8/2021', 24)
# 24-hour look-ahead, 3600-second minimum down length
wba124_major_down_arr = major_down(wba124_timeframe_table, wba124_clean_status, 24, 3600)
wba124_timeframe_table["MAJOR DOWN"] = wba124_major_down_arr

In [None]:
# Major-down validation check: pick one timestamp labelled 1 and list the UDT records of at
# least 3600 s that start within the following 24 hours - at least one row should show here.
# NOTE(review): position 250 is arbitrary and raises IndexError when fewer than 251 timestamps are labelled 1.
timestamp_check = wba124_timeframe_table[wba124_timeframe_table["MAJOR DOWN"]==1].iloc[250]["TIMESTAMP"]
print(timestamp_check)
timestampend_check = timestamp_check + timedelta(hours=24)
filtered = wba124_clean_status[(wba124_clean_status["TIMESTAMP_START"]>=timestamp_check) &
             (wba124_clean_status["TIMESTAMP_START"]<=timestampend_check)]
filtered.loc[(filtered.LEVEL3 == "UDT") & (filtered.DURATION >= 3600)]

In [None]:
# Prepare the data for the histograms of state name vs. major down:
# collect the state names / durations of the 24 hours before each timestamp.
statename_seq, duration_seq = status_sequence(wba124_timeframe_table, wba124_clean_status, 24)

wba124_timeframe_table["STATENAME SEQ"] = statename_seq
wba124_timeframe_table["DURATION SEQ"] = duration_seq

# Split the timestamps by label.
positive = wba124_timeframe_table.loc[wba124_timeframe_table["MAJOR DOWN"]==1]
negative = wba124_timeframe_table.loc[wba124_timeframe_table["MAJOR DOWN"]==0]

statename_pos = positive["STATENAME SEQ"].values
statename_neg = negative["STATENAME SEQ"].values
duration_pos = positive["DURATION SEQ"].values
duration_neg = negative["DURATION SEQ"].values

# Flatten the per-timestamp sequences into one long list per class.
pos = []
dur_pos = []
neg = []
dur_neg = []
for state_p, dur_p in zip(statename_pos, duration_pos):
    pos.extend(state_p)
    dur_pos.extend(dur_p)

for state_n, dur_n in zip(statename_neg, duration_neg):
    neg.extend(state_n)
    dur_neg.extend(dur_n)

In [None]:
# Data validation check: the flattened class lists together should cover the clean status rows.
print("negative length = ", len(neg))
print("positive length = ", len(pos))
print(f'Total number of data collected = {len(neg) + len(pos)}') 

# The collected total exceeds the table length by a few rows because status rows that start exactly
# on a grid timestamp (midnight) fall into two adjacent windows (both window bounds are inclusive).
# aaa / bbb: start of the first look-back window and the last grid timestamp; ccc: status rows in that span.
aaa = wba124_timeframe_table.iloc[0]["TIMESTAMP"] - timedelta(hours=24)
bbb = wba124_timeframe_table.iloc[len(wba124_timeframe_table)-1]["TIMESTAMP"]
ccc = wba124_clean_status[(wba124_clean_status["TIMESTAMP_START"]>=aaa) & (wba124_clean_status["TIMESTAMP_START"]<=bbb)]
timestamps = wba124_timeframe_table["TIMESTAMP"].values
print(f'Total length of clean status table = {len(ccc)}')
# Cell output: the status rows that start exactly on a grid timestamp.
wba124_clean_status[wba124_clean_status["TIMESTAMP_START"].isin(timestamps)]

In [None]:
# notice that the top few state name frequencies are very similar for both machines
def _plot_state_counts(sorted_counts, title):
    """Draw one bar chart of state-name frequencies (keys already sorted by count)."""
    plt.figure(figsize=(16,9))
    plt.bar(list(sorted_counts.keys()), list(sorted_counts.values()), width=0.4)
    plt.title(title)
    plt.xlabel("STATE_NAME")
    plt.ylabel("Number of records")
    plt.xticks(rotation=90)
    plt.show()

count_helper = Counter(pos)
# 'Normal Production' is removed so the remaining bars stay readable; the default of 0
# keeps the cell from raising KeyError when the state is absent from this class.
NP_pos_count = count_helper.pop('Normal Production', 0)

# Sort by frequency, descending; a None key (missing state name) cannot be plotted
# by matplotlib and is skipped.
pos_sort_count = {key: val
                  for key, val in sorted(count_helper.items(), key=lambda x: x[1], reverse=True)
                  if key is not None}

_plot_state_counts(pos_sort_count, "Positive class 24 hours state name distribution")

neg_count_helper = Counter(neg)

NP_neg_count = neg_count_helper.pop('Normal Production', 0)
# Number of negative-class records without a state name; 0 when there are none.
Level3_Exception_count = neg_count_helper.pop(None, 0)

neg_sort_count = {key: val
                  for key, val in sorted(neg_count_helper.items(), key=lambda x: x[1], reverse=True)
                  if key is not None}

_plot_state_counts(neg_sort_count, "Negative class 24 hours state name distribution")

In [None]:
# List the state names that occur in only one of the two classes, with their counts
# (rare items are candidates for removal as they are probably not helpful).
print('State name that only appears in positive class = \n',
      [[ele, val] for ele, val in pos_sort_count.items() if ele not in neg_sort_count.keys()])
print('\nOnly in negative class = \n', [[ele, val] for ele, val in neg_sort_count.items() if ele not in pos_sort_count.keys()])

# Decision: check on other equipment as well; keep the rare STATE_NAME values for now.

In [None]:
# Check for DURATION outliers: scatter of DURATION against TIMESTAMP_START.
# About 5 points still stand out after cleaning - investigated in the next cells.
wba124_clean_status.plot(x='TIMESTAMP_START', y='DURATION', marker='o', linestyle='none')

In [None]:
# The five cleaned records with the largest DURATION.
outlier_dur = wba124_clean_status.sort_values('DURATION', ascending=False).head(5)
outlier_dur

In [None]:
# Check whether a major down (UDT record of at least 3600 s) starts within the 24 hours
# after the end of the longest outlier record.
check_date = outlier_dur.iloc[0]['TIMESTAMP_END']
end = check_date + timedelta(hours=24)
wba124_clean_status[(wba124_clean_status['TIMESTAMP_START']>=check_date) &
                   (wba124_clean_status['TIMESTAMP_START']<=end) &
                   (wba124_clean_status['LEVEL3'] == 'UDT') &
                   (wba124_clean_status['DURATION']>=3600)]

# Finding: most of the time a long duration is followed by a major down within 24 hours,
# so the outliers are kept.

In [None]:
##### collect major down at each row #####
hour = 24
threshold = 3600
lookback_window = 30

# Leave out the last 2 days so that every remaining row has a full look-ahead window
# of status history after it.
last_date = wba124_clean_status.iloc[len(wba124_clean_status)-1]["TIMESTAMP_START"]
two_days = last_date - timedelta(days=2)
walk_forward_dataset = wba124_clean_status[wba124_clean_status["TIMESTAMP_START"]<two_days].reset_index(drop=True)

# The label list gets its own name: binding it to `major_down` would replace the
# major_down() helper defined at the top and break any cell that calls it afterwards.
walk_forward_major_down = []
for idx, row in walk_forward_dataset.iterrows(): 
    start = row['TIMESTAMP_START']
    end = start+timedelta(hours=hour)
    frame = wba124_clean_status[(wba124_clean_status['TIMESTAMP_START']>start) & \
                                 (wba124_clean_status['TIMESTAMP_START']<end)]
    UD = frame.loc[frame['LEVEL3']=='UDT']

    if len(UD) == 0: # no UDT record inside the look-ahead window
        walk_forward_major_down.append(0)
    else:
        # total_seconds() keeps the day component; `.dt.seconds` would wrap at 24 h
        time_diff = (UD['TIMESTAMP_END']-UD['TIMESTAMP_START']).dt.total_seconds()
        if any(time_diff>=threshold): #threshold = 3600s
            walk_forward_major_down.append(1)
        else:
            walk_forward_major_down.append(0)
    
walk_forward_dataset["24 HOUR DOWN"] = walk_forward_major_down

In [None]:
# Display the labelled walk-forward dataset.
walk_forward_dataset

In [None]:
##### Baseline model 2 on cleaned removed data #####

# Columns dropped before modelling (timestamps, identifiers and the text state columns).
not_feature = ['TIMESTAMP_START', 'TIMESTAMP_END', 'EQ_ID', 'LEVEL3_NAME', 'LEVEL3', 'STATE_NAME']

# Integer-encode STATE_NAME so it can be fed to the model as a numeric column.
lb = LabelEncoder()
walk_forward_dataset['ENC STATE NAME'] = lb.fit_transform(walk_forward_dataset['STATE_NAME'])

tmp_table = walk_forward_dataset.drop(not_feature, axis=1)

# Same two features as the raw baseline, now taken from the cleaned data.
df = tmp_table[['DURATION', 'ENC STATE NAME']]
target = tmp_table['24 HOUR DOWN'].values

In [None]:
# Ordered (unshuffled) split by row position: first 70% train, next 10% validation, last 20% test.
train_idx = int(0.7*len(df))
val_idx = int(0.8*(len(df)))

X_train, y_train = df[:train_idx], target[:train_idx]
X_val, y_val = df[train_idx:val_idx], target[train_idx:val_idx]
X_test, y_test = df[val_idx:], target[val_idx:]

# Same classifier configuration as the raw-data baseline, so the two runs are comparable.
clf = lightgbm.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    random_state=42,
    class_weight='balanced',
    reg_lambda=0.01
            )

# Validation AUC is reported during training.
clf.fit(X_train, y_train,
       eval_set=(X_val, y_val),
       eval_metric='auc',
        verbose=True)

pred = clf.predict(X_test)

# Cell output: confusion matrix and accuracy on the held-out test slice.
confusion_matrix(y_test, pred), accuracy_score(y_test, pred)

In [None]:
##### TIME SINCE FAILURE #####
# Index labels of the major-down rows (UDT lasting at least 3600 s).
major_down_idx = walk_forward_dataset[(walk_forward_dataset["LEVEL3"]=='UDT') &
                                     (walk_forward_dataset["DURATION"] >= 3600)].index.to_list()
if not major_down_idx:
    raise ValueError("No major down (UDT with DURATION >= 3600) found; TIME SINCE FAILURE is undefined")

# Keep the rows from the first to the last major down. The explicit copy makes
# tsf_df an independent frame, so the column assignments below do not write into
# a slice of walk_forward_dataset.
tsf = []
tsf_df = walk_forward_dataset.iloc[major_down_idx[0]:major_down_idx[-1]+1].copy()

major_down_labels = set(major_down_idx)   # O(1) membership test inside the loop
last_down_time = None                     # TIMESTAMP_END of the most recent major down

for idx, row in tsf_df.iterrows():
    if idx in major_down_labels:
        # Remember when this down ended. Looking it up as tsf_df.iloc[k] with k = its
        # rank among the major downs would return the k-th row of the table instead.
        last_down_time = row["TIMESTAMP_END"]
        tsf.append(0)
        continue

    curr_time = row["TIMESTAMP_START"]
    # total_seconds() keeps the day component; `.seconds` would wrap at 24 h
    diff = int((curr_time - last_down_time).total_seconds())
    tsf.append(diff)
tsf_df["TIME SINCE FAILURE"] = tsf

In [None]:
# Line plot of TIME SINCE FAILURE for the first 900 rows.
tsf_df['TIME SINCE FAILURE'][:900].plot()

In [None]:
# Integer-encode STATE_NAME of the time-since-failure table (encoder re-fitted on this subset).
lb = LabelEncoder()
enc_state = lb.fit_transform(tsf_df['STATE_NAME'])
tsf_df['ENC STATE NAME'] = enc_state

In [None]:
##### Lag DURATION and STATE NAME feature #####
# For each lag k, add the DURATION and encoded state name of the row k positions earlier.
# The first k rows of each lagged column are NaN.
lags = [1,2,3,4,5]
for lag in lags:
    tsf_df[f'DURATION LAGGED {lag}'] = tsf_df['DURATION'].shift(lag)
    tsf_df[f'ENC STATE NAME LAGGED {lag}'] = tsf_df['ENC STATE NAME'].shift(lag)

In [None]:
##### Rolling aggregation of DURATION and TIME SINCE FAILURE #####
# Rolling mean over windows of 5, 10, 15 and 20 rows; windows are counted in rows, not in time.
rolling = [5, 10, 15, 20]
for roll in rolling:
    tsf_df[f'DURATION ROLL {roll}'] = tsf_df['DURATION'].rolling(roll).mean()
    tsf_df[f'TSF ROLL {roll}'] = tsf_df['TIME SINCE FAILURE'].rolling(roll).mean()

In [None]:
##### calculate number of occurrence of each state name within this day / week / month #####
# Drop the rows made incomplete by the lag and rolling features; copy so the new
# columns are written to an independent frame.
tsf_df = tsf_df.dropna(axis=0).copy()

iso_calendar = tsf_df['TIMESTAMP_START'].dt.isocalendar()
tsf_df['DATE'] = tsf_df['TIMESTAMP_START'].dt.date
tsf_df['DAY'] = tsf_df['TIMESTAMP_START'].dt.day
tsf_df['WEEK'] = iso_calendar['week']
tsf_df['MONTH'] = tsf_df['TIMESTAMP_START'].dt.month
tsf_df['YEAR'] = iso_calendar['year']   # ISO year: pairs with the ISO week

# The ISO year differs from the calendar year around New Year (e.g. 2021-01-01 is in
# ISO year 2020), so months are grouped by the calendar year instead; otherwise the
# first days of January would be counted together with January of the previous year.
calendar_year = tsf_df['TIMESTAMP_START'].dt.year.rename('CALENDAR_YEAR')

day_keys = ['DATE', 'STATE_NAME']
week_keys = ['YEAR', 'WEEK', 'STATE_NAME']
month_keys = [calendar_year, 'MONTH', 'STATE_NAME']

# Per-group record counts (kept for inspection).
day_gb = tsf_df.groupby(day_keys).count()['EQ_ID']
week_gb = tsf_df.groupby(week_keys).count()['EQ_ID']
month_gb = tsf_df.groupby(month_keys).count()['EQ_ID']

# transform('count') broadcasts each group's count back onto its rows in one pass.
# It replaces a per-row boolean-mask lookup over the whole group index, which was
# quadratic and whose loop variable `date` shadowed the imported datetime.date.
tsf_df['FREQ DAY'] = tsf_df.groupby(day_keys)['EQ_ID'].transform('count')
tsf_df['FREQ WEEK'] = tsf_df.groupby(week_keys)['EQ_ID'].transform('count')
tsf_df['FREQ MONTH'] = tsf_df.groupby(month_keys)['EQ_ID'].transform('count')

In [None]:
# Column overview (dtypes and non-null counts) of the engineered table.
tsf_df.info()

In [None]:
# Columns excluded from the model input: identifiers, timestamps, text state columns, the
# calendar helper columns, the target itself and the FREQ * counts computed above.
# NOTE(review): 'DATE' is listed three times and the FREQ features are dropped right after being built - confirm both are intended.
not_features = ['DATE','TIMESTAMP_START', 'TIMESTAMP_END', 'EQ_ID',
               'LEVEL3_NAME', 'LEVEL3', 'STATE_NAME', 'DATE', '24 HOUR DOWN',
               'DATE', 'YEAR', 'MONTH', 'WEEK', 'DAY',
               'FREQ DAY', 'FREQ WEEK', 'FREQ MONTH']

# Everything that remains is used as a feature.
df = tsf_df.drop(not_features, axis=1)
target = tsf_df['24 HOUR DOWN']

In [None]:
# Ordered (unshuffled) split by row position: first 70% train, next 10% validation, last 20% test.
train_idx = int(0.7*len(df))
val_idx = int(0.8*(len(df)))

X_train, y_train = df[:train_idx], target[:train_idx]
X_val, y_val = df[train_idx:val_idx], target[train_idx:val_idx]
X_test, y_test = df[val_idx:], target[val_idx:]

# Same configuration as the baselines apart from the explicit learning_rate of 0.05.
clf = lightgbm.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    random_state=42,
    learning_rate=0.05,
    class_weight='balanced',
    reg_lambda=0.01
            )

# NOTE(review): the metric is spelled 'AUC' here but 'auc' in the baseline cell - confirm the
# upper-case spelling is recognised by LightGBM.
clf.fit(X_train, y_train,
       eval_set=(X_val, y_val),
       eval_metric='AUC',
        verbose=True)

In [None]:
# Feature names followed by the fitted model's importances (same order as the columns).
print(df.columns)
clf.feature_importances_

In [None]:
# Evaluate the feature-engineered model on the held-out test slice.
pred = clf.predict(X_test)

confusion_matrix(y_test, pred), accuracy_score(y_test, pred)
# Recorded results: accuracy 57 without class weight;
# 56 with class weight, but recall improves.

In [None]:
# Display the training feature matrix.
X_train

In [None]:
# Random forest on the same split, for comparison with the LightGBM model.
# Seeded like the LightGBM models (random_state=42) so the reported numbers are
# reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
# Cell output: confusion matrix and accuracy on the held-out test slice.
confusion_matrix(y_test, pred), accuracy_score(y_test, pred)

In [None]:
# Idea: use sample weights based on important state names,
# with the weight being the inverse of the state's frequency.
# Cell output: the distinct state names in the raw data.
wba124_initial['STATE_NAME'].unique()


In [None]:
# State names that describe waiting.
waitings = ['Waiting For Response', 'Waiting For Operator', 'Waiting For Repair', 'Waiting For Technician',
           'Waiting For Spares', 'Waiting for Setup', 'Undefined Waiting']

# Waiting records in the raw data that lasted at least 3600 s, longest first.
aaaaa = wba124_initial.loc[(wba124_initial.STATE_NAME.isin(waitings)) &
              (wba124_initial.DURATION>=3600)].sort_values('DURATION',ascending=False)

# Cell output: number of such long waiting records per state name.
aaaaa.groupby('STATE_NAME').size()

In [None]:
# Number of waiting records of any duration per state name, for comparison with the long ones above.
bbbbb = wba124_initial.loc[(wba124_initial.STATE_NAME.isin(waitings))]
bbbbb.groupby('STATE_NAME').size()

In [None]:
# First 50 long 'Waiting For Repair' records (waitings[2]) in chronological order.
aaaaa.loc[aaaaa.STATE_NAME==waitings[2]].sort_values('TIMESTAMP_START').head(50)