# Random Forest Models (Naive and Sliding Window Training)

In [8]:
#export
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from ipynb.fs.full.PPP_BasicModel import PPModel

In [9]:
#export
class RandomForest_PPM(PPModel):
    """Sliding-window random-forest predictor built on ``PPModel``.

    Four ``RandomForestClassifier`` instances are fitted on pairs of
    consecutive events taken from each trace:

    * ``clf1`` -- next activity ("nsp")
    * ``clf2`` -- next resource ("nrp")
    * ``clf3`` -- last activity of the trace, used as outcome ("op")
    * ``clf4`` -- last resource of the trace ("lrp")

    ``train`` and ``predict`` read the columns ``trace_id``, ``activity`` and
    ``resource`` of the event log.
    NOTE(review): the column values are passed to scikit-learn unchanged, so
    they are presumably already numerically encoded -- confirm with the caller.
    """

    def __init__(self):
        self.n_estimators = 10
        self.clf1 = RandomForestClassifier(n_estimators=self.n_estimators)
        self.clf2 = RandomForestClassifier(n_estimators=self.n_estimators)
        self.clf3 = RandomForestClassifier(n_estimators=self.n_estimators)
        self.clf4 = RandomForestClassifier(n_estimators=self.n_estimators)

    def create_traces(self, event_df, trace_id='trace_id'):
        """Collapse an event log into one row per trace.

        Every column except *trace_id* becomes a list with that column's
        values for the trace. The identifier column of the result is always
        named ``"trace_id"`` (the name ``build_windows`` reads), regardless of
        what *trace_id* is called in *event_df*.

        Returns a DataFrame with one row per group of ``groupby(trace_id)``.
        """
        cols = list(event_df)
        cols.remove(trace_id)

        rows = []
        trace_ids = []
        for name, group in event_df.groupby(trace_id):
            rows.append([list(group[c]) for c in cols])
            trace_ids.append(name)

        df = pd.DataFrame(rows, columns=cols)
        df["trace_id"] = trace_ids
        return df

    def drop_short_cases(self, traces, col, index=2):
        """Keep only traces whose list in *col* has more than ``index + 1`` items.

        The result is re-indexed from 0.
        """
        long_enough = traces[col].map(len) > (index + 1)
        return traces[long_enough].reset_index(drop=True)

    def build_windows(self, traces, ws, col, index=2):
        """Build one training/prediction row per sliding window of *col*.

        For every trace and every start position ``j`` with
        ``ws - 1 <= j`` and ``j + ws < len(sequence)`` a row is emitted with:

        * ``trace_id`` -- id of the trace
        * ``windows``  -- ``sequence[j:j + ws]``
        * ``a1``/``a2`` -- ``sequence[j]`` and ``sequence[j + 1]``
        * ``outcome``  -- last element of the sequence
        * ``preds``    -- ``sequence[j + ws]`` (the value to predict)

        *index* is accepted for signature compatibility and is not used.
        """
        # Positional access: do not rely on ``traces`` having a 0..n-1 index.
        ids = traces["trace_id"].values
        sequences = traces[col].values

        rows = []
        for trace, seq in zip(ids, sequences):
            # Same positions the original skip-condition let through.
            for j in range(max(ws - 1, 0), len(seq) - ws):
                rows.append([
                    trace,            # trace_id
                    seq[j:ws + j],    # windows
                    seq[j],           # a1
                    seq[j + 1],       # a2
                    seq[-1],          # last activity/resource = outcome
                    seq[j + ws],      # desired prediction = preds
                ])

        return pd.DataFrame(
            rows,
            columns=["trace_id", "windows", "a1", "a2", "outcome", "preds"],
        )

    def train(self, data):
        """Fit the four classifiers on windows of size 2 and return them.

        Traces are filtered with ``drop_short_cases`` (default *index*) before
        the windows are built.
        """
        train_traces = self.create_traces(data)
        short_train = self.drop_short_cases(train_traces, "activity")
        window_df1 = self.build_windows(short_train, 2, "activity")
        window_df2 = self.build_windows(short_train, 2, "resource")

        train1 = window_df1[["a1", "a2"]]   # sliding window next step training
        target1 = window_df1["preds"]
        self.clf1.fit(train1, target1)

        train2 = window_df2[["a1", "a2"]]   # sliding window next resource training
        target2 = window_df2["preds"]
        self.clf2.fit(train2, target2)

        target3 = window_df1["outcome"]     # sliding window outcome training
        self.clf3.fit(train1, target3)

        target4 = window_df2["outcome"]     # sliding window last resource training
        self.clf4.fit(train2, target4)

        return self.clf1, self.clf2, self.clf3, self.clf4

    def predict(self, test, index=2):
        """Predict per trace and store the results in ``self.cases``.

        ``self.cases`` is a DataFrame indexed by ``trace_id`` with the columns
        ``nsp``, ``nrp``, ``op`` and ``lrp``; each cell holds the array of
        predictions for all windows of that trace. The return value is
        whatever ``PPModel.predict(test, index)`` returns.
        """
        test_traces = self.create_traces(test)
        short_test = self.drop_short_cases(test_traces, "activity")
        activity_windows = self.build_windows(short_test, 2, "activity")
        resource_windows = self.build_windows(short_test, 2, "resource")

        # Collect the group keys while iterating instead of running a second
        # groupby/apply pass just to recover the trace ids.
        trace_ids, nsp, op = [], [], []
        for trace, group in activity_windows.groupby("trace_id"):
            features = group[["a1", "a2"]]
            trace_ids.append(trace)
            nsp.append(self.clf1.predict(features))  # nsp
            op.append(self.clf3.predict(features))   # op

        # Both window frames come from the same traces, so the groups are
        # visited in the same order as above.
        nrp, lrp = [], []
        for _, group in resource_windows.groupby("trace_id"):
            features = group[["a1", "a2"]]
            nrp.append(self.clf2.predict(features))  # nrp
            lrp.append(self.clf4.predict(features))  # lrp

        cases = pd.DataFrame({"trace_id": trace_ids})
        # dtype=object keeps one prediction array per cell (also when empty).
        cases["nsp"] = pd.Series(nsp, dtype=object)
        cases["nrp"] = pd.Series(nrp, dtype=object)
        cases["op"] = pd.Series(op, dtype=object)
        cases["lrp"] = pd.Series(lrp, dtype=object)
        self.cases = cases.set_index("trace_id")
        return super().predict(test, index)

    def name(self):
        """Return the class name."""
        return self.__class__.__name__

    def next_step_prediction(self, test, index=2):
        """Return the stored next-activity predictions (``predict`` must run first)."""
        return self.cases["nsp"]

    def next_resource_prediction(self, test, index=2):
        """Return the stored next-resource predictions (``predict`` must run first)."""
        return self.cases["nrp"]

    def last_resource_prediction(self, test, index=2):
        """Return the stored last-resource predictions (``predict`` must run first)."""
        return self.cases["lrp"]

    def outcome_prediction(self, test, index=2):
        """Return the stored outcome predictions (``predict`` must run first)."""
        return self.cases["op"]

    # The tasks below are not implemented by this model; each returns an
    # empty float Series.
    def duration_to_next_event_prediction(self, test, index=2):
        return pd.Series(dtype=float)

    def duration_to_end_prediction(self, test, index=2):
        return pd.Series(dtype=float)

    def activity_suffix_prediction(self, test, index=2):
        return pd.Series(dtype=float)

    def resource_suffix_prediction(self, test, index=2):
        return pd.Series(dtype=float)

In [10]:
#export
class SimpleRandomForest_PPM(PPModel):
    """Naive random-forest predictor: one model per task, fitted per event.

    Each model is fitted directly on columns of the event log (no windowing):

    * ``clf1`` next activity, ``clf4`` outcome, ``clf7`` activity suffix
      -- features ``["activity"]``
    * ``clf2`` next resource, ``clf3`` last resource, ``clf8`` resource suffix
      -- features ``["resource", "activity"]``
    * ``clf5`` duration to next event, ``clf6`` duration to end (regressors)
      -- features ``["activity", "time:timestamp"]``

    NOTE(review): feature and target columns are passed to scikit-learn
    unchanged, so they are presumably already encoded in a form it accepts
    -- confirm with the caller.
    """

    def __init__(self):
        self.n_estimators = 10  # TODO: tune min_samples_split and max_depth?
        self.clf1 = RandomForestClassifier(n_estimators=self.n_estimators)
        self.clf2 = RandomForestClassifier(n_estimators=self.n_estimators)
        self.clf3 = RandomForestClassifier(n_estimators=self.n_estimators)
        self.clf4 = RandomForestClassifier(n_estimators=self.n_estimators)
        self.clf5 = RandomForestRegressor(n_estimators=self.n_estimators)
        self.clf6 = RandomForestRegressor(n_estimators=self.n_estimators)
        self.clf7 = RandomForestClassifier(n_estimators=self.n_estimators)
        # Used by train()/predict() for the resource suffix; it was never
        # created before, which made train() fail with AttributeError.
        self.clf8 = RandomForestClassifier(n_estimators=self.n_estimators)

    def train(self, data):
        """Fit all eight models on *data* and return them as a tuple.

        *data* is modified in place: missing "duration to next event" values
        are replaced by 0.0 and "time:timestamp" is converted to float.
        """
        # Make sure there are no NaN values in the regression target.
        # Assign the result back instead of a chained ``inplace=True`` call,
        # which does not reliably write through to ``data``.
        data["duration to next event"] = data["duration to next event"].fillna(0.0)
        data["time:timestamp"] = data["time:timestamp"].values.astype("float")

        train1 = data[["activity"]]                        # naive next-activity training
        target1 = data["next activity"]
        self.clf1.fit(train1, target1)

        train2 = data[["resource", "activity"]]            # naive next-resource training
        target2 = data["next resource"]
        self.clf2.fit(train2, target2)

        target3 = data["last resource"]                    # naive last-resource training
        self.clf3.fit(train2, target3)

        target4 = data["outcome"]                          # naive outcome training
        self.clf4.fit(train1, target4)

        train3 = data[["activity", "time:timestamp"]]
        target5 = data["duration to next event"]           # naive duration-to-next-event training
        self.clf5.fit(train3, target5)

        target6 = data["duration to end"]                  # naive duration-to-end training
        self.clf6.fit(train3, target6)

        target7 = data["activity suffix"]                  # naive activity-suffix training
        self.clf7.fit(train1, target7)

        target8 = data["resource suffix"]                  # naive resource-suffix training
        self.clf8.fit(train2, target8)
        return (self.clf1, self.clf2, self.clf3, self.clf4,
                self.clf5, self.clf6, self.clf7, self.clf8)

    def predict(self, test, index=2):
        """Predict every task for each event of *test*.

        A copy of *test* (taken before the timestamp conversion) receives the
        prediction columns ``nsp``, ``nrp``, ``lrp``, ``op``, ``dtnep``,
        ``dtep``, ``asp`` and ``rsp`` and is stored as ``self.result_df``;
        ``self.cases`` is that frame grouped by ``trace_id``. *test* itself has
        its "time:timestamp" column converted to float in place. The return
        value is whatever ``PPModel.predict(test, index)`` returns.
        """
        self.result_df = test.copy()
        test["time:timestamp"] = test["time:timestamp"].values.astype("float")
        X1 = test[["activity"]]                    # nsp, op, asp
        X2 = test[["resource", "activity"]]        # nrp, lrp, rsp
        X3 = test[["activity", "time:timestamp"]]  # dtnep, dtep

        self.result_df['nsp'] = self.clf1.predict(X1)
        self.result_df['nrp'] = self.clf2.predict(X2)
        self.result_df['lrp'] = self.clf3.predict(X2)
        self.result_df['op'] = self.clf4.predict(X1)
        self.result_df['dtnep'] = self.clf5.predict(X3)
        self.result_df['dtep'] = self.clf6.predict(X3)
        self.result_df['asp'] = self.clf7.predict(X1)
        self.result_df['rsp'] = self.clf8.predict(X2)

        # The per-task methods below read their results from self.cases.
        self.cases = self.result_df.groupby('trace_id')

        return super().predict(test, index)

    def name(self):
        """Return the class name."""
        return self.__class__.__name__

    # Each task method delegates to PPModel._return_results with the name of
    # the prediction column written by predict().
    def next_step_prediction(self, test, index=2):
        return PPModel._return_results(self, index, "nsp")

    def next_resource_prediction(self, test, index=2):
        return PPModel._return_results(self, index, "nrp")

    def last_resource_prediction(self, test, index=2):
        return PPModel._return_results(self, index, "lrp")

    def outcome_prediction(self, test, index=2):
        return PPModel._return_results(self, index, "op")

    def duration_to_next_event_prediction(self, test, index=2):
        return PPModel._return_results(self, index, "dtnep")

    def duration_to_end_prediction(self, test, index=2):
        return PPModel._return_results(self, index, "dtep")

    def activity_suffix_prediction(self, test, index=2):
        return PPModel._return_results(self, index, "asp")

    def resource_suffix_prediction(self, test, index=2):
        return PPModel._return_results(self, index, "rsp")

In [11]:
! /home/lahann/anaconda3/envs/fastpm/bin/python notebook2script.py 004_PPP_RandomForestModel.ipynb

Converted 004_PPP_RandomForestModel.ipynb to exp/RandomForestModel.py
