In [None]:
import pandas as pd
from collections import Counter
import numpy as np

# base_df = pd.read_csv("fake_data.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",)
# Load the logged-in click log, keeping only the columns used downstream.
base_df = pd.read_csv(
    "BPI2016_Clicks_Logged_In.csv",
    sep=";",
    encoding_errors="ignore",
    on_bad_lines='skip',
    usecols=[
        'CustomerID', 'AgeCategory', 'Gender', 'Office_U', 'Office_W',
        'SessionID', 'IPID', 'TIMESTAMP', 'VHOST', 'URL_FILE', 'PAGE_NAME',
        'REF_URL_category', 'page_load_error', 'page_action_detail', 'tip',
        'service_detail', 'xps_info',
    ],
)

# The requested URL is what the rest of the notebook treats as the "Activity".
base_df = base_df.rename(columns={'URL_FILE': 'Activity'})

# NOTE(review): `infer_datetime_format` is deprecated in pandas 2.x - confirm the
# installed pandas version before dropping the argument.
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"], infer_datetime_format=True)

# Order the clicks chronologically inside each session.
base_df = base_df.sort_values(by=["SessionID", "TIMESTAMP"])

In [None]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  """Count how often each activity is directly followed by another activity.

  Within every ``SessionID`` group each row's ``Activity`` is paired with the
  ``Activity`` of the next row. A row without a successor in its session is
  paired with the sentinel ``"end_session"``.

  Args:
    df: Frame with ``SessionID`` and ``Activity`` columns. Rows are used in
      their existing order and the frame is not modified.

  Returns:
    DataFrame with the columns ``level_0`` (activity), ``level_1`` (following
    activity or ``"end_session"``) and ``value`` (number of occurrences of
    that pair), one row per distinct pair.
  """
  following = df.groupby("SessionID")["Activity"].shift(periods=-1)

  # Put the sentinel in place *before* counting. Counting tuples that still
  # contain NaN only groups them correctly when every NaN is the same object.
  # A missing ``Activity`` is mapped to the sentinel as well, which is what the
  # previous whole-frame ``fillna`` did.
  pairs = pd.DataFrame({
      "level_0": df["Activity"].fillna("end_session").to_numpy(),
      "level_1": following.fillna("end_session").to_numpy(),
  })

  # sort=False: the pairs do not need to be ordered for the pivot that follows.
  return (
      pairs.groupby(["level_0", "level_1"], sort=False)
      .size()
      .reset_index(name="value")
  )
  
def make_pivot(df, index_names, column_names):
  """Turn pair counts into a row-normalised probability table.

  Args:
    df: Long-format frame with a numeric ``value`` column holding counts, plus
      the columns named by ``index_names`` and ``column_names``.
    index_names: Column name(s) whose values become the rows of the result.
    column_names: Column name(s) whose values become the columns of the result.

  Returns:
    DataFrame in which entry ``[row, col]`` is the share of ``row``'s total
    count that falls on ``col``. Combinations that do not occur in ``df`` are 0,
    so every row with a positive total sums to 1.
  """
  # Counts of a repeated (index, column) combination have to be added up;
  # pivot_table's default aggregation would average them instead.
  counts = df.pivot_table(index=index_names, columns=column_names,
                          values='value', aggfunc="sum")

  # Divide every row by its own total. Keeping the totals in a separate Series
  # avoids adding (and later dropping) a helper column on the table itself.
  row_totals = counts.sum(axis=1)
  return counts.div(row_totals, axis=0).fillna(0)

  
# Create df for pivot
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

# Probability of each activity being the first one performed in a session.
# The first row of every session is looked up once and the counts are reused
# for both the numerator and the normalising total.
first_activity_counts = base_df.groupby("SessionID").nth(0)["Activity"].value_counts()
df_chances_1["start_session_chance"] = first_activity_counts / first_activity_counts.sum()
# Activities that never open a session have no entry in the counts above.
df_chances_1["start_session_chance"] = df_chances_1["start_session_chance"].fillna(0)

df_chances_total = df_chances_1

# Split the start probabilities from the transition table
start_chances = df_chances_total["start_session_chance"][df_chances_total["start_session_chance"] > 0]

# Transition table without the start probabilities. The column is dropped by
# name so the result does not depend on it being the last column.
final_df = df_chances_total.drop(columns="start_session_chance")
final_df

In [None]:
def skip_event(_df:pd.DataFrame,event:str,deviation_rate):
  """Return a copy of a transition table in which ``event`` is reached less often.

  For every row other than ``event`` that moves to ``event`` with probability
  ``p > 0``, the share ``p * deviation_rate / 100`` is taken away from that
  transition and added to the row again, weighted by ``event``'s own row (so a
  part flows back to ``event`` when it has a self-loop).

  If ``event`` has a self-loop, the same share of the self-loop probability is
  removed and split evenly over the other transitions of ``event`` that already
  have a positive probability.

  Args:
    _df: Table of transition probabilities; ``event`` must be both a row label
      and a column label. The table is not modified.
    event: Label of the activity to reach less often.
    deviation_rate: Reduction in percent (``50`` removes half of the
      probability).

  Returns:
    A modified copy of ``_df``.

  Raises:
    KeyError: If ``event`` is not a row label or not a column label.
  """
  df = _df.copy()
  rate = deviation_rate / 100

  # Snapshots taken before any edit. The first pass never writes to the event
  # row, and each row's probability of reaching the event is read only once.
  event_row = df.loc[event, :].copy()
  into_event = df.loc[:, event].copy()

  # Pass 1: rows that lead into the event.
  for origin, prob in into_event.items():
    if origin == event or not prob > 0:
      continue
    removed = prob - prob * (1 - rate)
    df.loc[origin, event] -= removed
    # Hand the removed mass to the event's successors in one row-wise update
    # instead of one scalar write per column.
    df.loc[origin, :] += removed * event_row

  # Pass 2: the event's own row (only relevant when it loops onto itself).
  self_loop = event_row.loc[event]
  if self_loop > 0:
    receivers = (event_row > 0) & (event_row.index != event)
    n_receivers = int(receivers.sum())
    # Without any other successor there is nowhere to move the probability to;
    # lowering the self-loop anyway would leave a row that no longer sums to 1.
    if n_receivers > 0:
      removed = self_loop - self_loop * (1 - rate)
      updated_row = event_row.copy()
      updated_row.loc[event] -= removed
      updated_row.loc[receivers] += removed / n_receivers
      df.loc[event, :] = updated_row

  return df

In [None]:
def skip_multiple_events(list,prop_matrix):
  """Apply ``skip_event`` once per entry, each time on the same input table.

  Args:
    list: Iterable of ``(activity, deviation_rate)`` entries; only positions 0
      and 1 of each entry are read. (The name shadows the builtin but is kept
      because it is part of the signature.)
    prop_matrix: Table handed unchanged to every ``skip_event`` call.

  Returns:
    One dict per entry with the keys ``"new_df"`` (result of ``skip_event``),
    ``"activity"`` and ``"dev"``.
  """
  return [
      {
          "new_df": skip_event(prop_matrix, entry[0], entry[1]),
          "activity": entry[0],
          "dev": entry[1],
      }
      for entry in list
  ]
    

In [None]:
# events_to_skip = [('Accept_offer',10)]
# Activities to reach less often, each paired with its deviation rate in percent.
events_to_skip = [
    (activity, 50)
    for activity in (
        '/werk_nl/werknemer/mijn_werkmap/doorgeven/taken',
        '/werk_nl/werknemer/mijn_werkmap/werk-zoeken/vacatures_bij_mijn_cv',
        '/werk_nl/werknemer/mijn_werkmap/werk-zoeken/mijn_cv',
    )
]
# One adjusted transition table per entry above.
skipped = skip_multiple_events(events_to_skip, final_df)


In [None]:
def generate_markov_chain(amount:int,start_chances,df_dict:pd.DataFrame) -> list:
  """Sample random sessions by walking a transition table.

  Every session starts with an activity drawn from ``start_chances`` and keeps
  drawing the next activity from the current activity's row of ``df_dict``
  until ``'end_session'`` is drawn. The walk does not stop on its own if
  ``'end_session'`` is never drawn.

  Args:
    amount: Sessions are produced for as long as fewer than ``amount`` exist.
    start_chances: Series mapping activity -> probability of starting with it.
    df_dict: Transition table; row = current activity, columns = candidates
      for the next step (including ``'end_session'``).

  Returns:
    List of sessions, each a list of activities whose last element is
    ``'end_session'``.
  """
  generated_sessions = []
  while len(generated_sessions) < amount:
    # Draw the opening activity of the session.
    opening = np.random.choice(
        a=np.array(start_chances.index),
        size=1,
        p=np.array(start_chances.values),
    )
    walk = [opening[0]]

    while True:
      current = walk[-1]
      if current == 'end_session':
        break
      transitions = df_dict.loc[current, :]
      drawn = np.random.choice(
          a=list(transitions.index),
          size=1,
          p=list(transitions.values),
      )
      walk.append(drawn[0])

    generated_sessions.append(walk)

  return generated_sessions

In [None]:
def generate_sessions(skipping_list:list,total_amount:int):
  """Generate ``total_amount`` sessions, spread over the adjusted tables.

  The total is split as evenly as possible: every entry gets
  ``total_amount // len(skipping_list)`` sessions and the first
  ``total_amount % len(skipping_list)`` entries get one more, so the overall
  count is exactly ``total_amount`` (a plain division would hand a fractional
  quota to every entry and overshoot the total).

  Start activities are drawn from the module-level ``start_chances``.

  Args:
    skipping_list: Dicts with a ``"new_df"`` transition table each.
    total_amount: Total number of sessions to generate.

  Returns:
    Flat list of generated sessions; empty when ``skipping_list`` is empty.
  """
  if not skipping_list:
    return []

  base_quota, leftover = divmod(int(total_amount), len(skipping_list))
  sessions = []
  for position, item in enumerate(skipping_list):
    quota = base_quota + (1 if position < leftover else 0)
    sessions.extend(generate_markov_chain(quota, start_chances, item["new_df"]))
  return sessions
  

# Generate the synthetic sessions and report how many were produced.
n_sessions_requested = 2500
markov_result = generate_sessions(skipped, n_sessions_requested)
print(len(markov_result))

In [None]:
def markov_result_to_df(list_markov_result, name_index, name_csv):
    """Flatten generated sessions into a two-column table and write it to CSV.

    Each session gets the id ``name_index`` followed by its position in
    ``list_markov_result``; every activity except ``"end_session"`` becomes one
    row.

    Args:
        list_markov_result: List of sessions, each a list of activities.
        name_index: Prefix of the generated session ids.
        name_csv: Path of the CSV file to write (row index included).
    """
    # Iterate the argument itself, not a notebook-level variable, so the
    # function exports whatever list it is given.
    records = [
        (f"{name_index}{session_nbr}", activity)
        for session_nbr, session in enumerate(list_markov_result)
        for activity in session
        if activity != "end_session"
    ]

    df_generated = pd.DataFrame(records, columns=['SessionID', 'URL_FILE'])
    df_generated.to_csv(name_csv)
    
# Write the generated sessions to disk.
markov_result_to_df(
    markov_result,
    name_index="generated_top3_50%_",
    name_csv="gen_sessions/skipped/generated_withCircle_top3_50%_2500.csv",
)