In [33]:
import pandas as pd
from collections import Counter
import numpy as np

In [34]:
# base_df = pd.read_csv("../no_repeated.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",)
base_df = pd.read_csv("../BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",usecols=['SessionID', 'TIMESTAMP', 'URL_FILE', ])
base_df.rename(columns={'URL_FILE':'Activity'}, inplace=True)
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"], infer_datetime_format=True)
base_df = base_df.sort_values(["SessionID", "TIMESTAMP"])
base_df.head()

Unnamed: 0,SessionID,TIMESTAMP,Activity
3273278,46,2015-11-06 08:07:22.780,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken
5333642,46,2015-11-06 08:07:40.767,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...
3733243,46,2015-11-06 08:07:51.390,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...
5904405,46,2015-11-06 08:08:06.003,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...
5573282,46,2015-11-06 08:08:19.343,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...


In [35]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  df = df.copy()
  #Create consecutive column with the consecutive activity and count how many time each pair occurs
  df["Consecutive_1"] = df.groupby("SessionID")["Activity"].shift(periods=-1)

  #Count all the pairs and safe in new DataFrame
  df_for_pivot_1 = pd.DataFrame(Counter(list(zip(df['Activity'], df['Consecutive_1'],))), index=["value"]).T.reset_index()

  df_for_pivot_1.fillna("end_session", inplace=True)
  return df_for_pivot_1
  
def make_pivot(df, index_names, column_names):
  df = df.copy()
  """
  Functions that takes in a DataFrame and returns a pivot table with all the chances
  
  Create pivot table where chances are calculated that each row is succeded by activity that is represented in the column
  """
  df_chances = df.pivot_table(index=index_names, columns=column_names, values='value')
  
  #Calculate what the probabilities are by summing the row and dividing all the values in the row by total sum of the row
  df_chances["total_row_count"] = df_chances.sum(axis=1)
  df_chances = df_chances.div(df_chances["total_row_count"], axis=0)
  
  df_chances.fillna(0, inplace=True)
  df_chances.drop("total_row_count", axis=1, inplace=True)

  return df_chances

  
# Create df for pivot
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

#Calculate the probability of each activity that it is the first activity performed in the session
df_chances_1["start_session_chance"] = base_df.groupby("SessionID").nth(0)["Activity"].value_counts() / base_df.groupby("SessionID").nth(0)["Activity"].value_counts().sum()
df_chances_1.fillna(0, inplace=True)

df_chances_total = df_chances_1 
df_chances_total

#Split the start probabilities from the normal DataFrame
start_chances = df_chances_total["start_session_chance"][df_chances_total["start_session_chance"] > 0]

#Create df with all chances except the starting chance.
final_df = df_chances_total.iloc[:, :-1]
# final_df = remove_circles(final_df)
final_df

level_1,/,//,//portal/page/portal/werk_nl/werknemer/solliciteren/solliciteren-werk-zoeken/sollicitatiebrief,//werk_nl/werknemer/contact,//werk_nl/werknemer/contact/email_uw_vraag,//werk_nl/werknemer/contact/vestiging-zoeken,//werk_nl/werknemer/home,//werk_nl/werknemer/over-werk-nl/handleiding,//werk_nl/werknemer/over-werk-nl/handleiding/cv-plaatsen,//werk_nl/werknemer/over-werk-nl/handleiding/sollicitatie-wijziging,...,/xpsitem/wdo_013389,/xpsitem/wdo_013407,/xpsitem/wdo_013827,/xpsitem/wdo_014041,/zoeken_portlet/ajax/addVacatureToGereageerd,/zoeken_portlet/ajax/zoekAantalIndicatief,/zoeken_portlet/ajax/zoekBeroep,/zoeken_portlet_wg/ajax/selectContactpersoon,/zoeken_portlet_wg/ajax/selectVacature,end_session
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.073834
//,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.035714
//portal/page/portal/werk_nl/werknemer/solliciteren/solliciteren-werk-zoeken/sollicitatiebrief,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
//werk_nl/werknemer/contact,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
//werk_nl/werknemer/contact/email_uw_vraag,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.058824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/zoeken_portlet/ajax/addVacatureToGereageerd,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017588,0.002513,0.005025,0.0,0.0,0.055276
/zoeken_portlet/ajax/zoekAantalIndicatief,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000107,0.537184,0.051269,0.0,0.0,0.007539
/zoeken_portlet/ajax/zoekBeroep,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.441592,0.489910,0.0,0.0,0.003979
/zoeken_portlet_wg/ajax/selectContactpersoon,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000


In [36]:
patterns = pd.read_csv("./gen_patterns/Swapped.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",)
patterns.sort_values("Activity_0_x", inplace=True)

# change column names
patterns_df = patterns.rename(columns={'Activity_0_x':'A','Activity_1_x':'B'})
patterns_df
grouped = patterns_df.drop(["count_x"	,"count_y",	"dif","fac"],axis=1).groupby("A")['B'].apply(list)
group_dict = grouped.to_dict()
# group_dict

In [50]:
def swapping(_df:pd.DataFrame,event_1:str,event_2:str,deviation_rate):
  df = _df.copy()
  swapped = []
  deviation_rate = deviation_rate / 100
  print(f"swapping event {event_1} with {event_2} with deviation rate: {deviation_rate}")
  for i,val in df.loc[:,event_1].items():
    if val > 0:
      old_val = df.loc[i,event_1]
      new_val = old_val * (1 - deviation_rate)
      df.loc[i,event_1] = new_val

      df.loc[i,event_2] += old_val - df.loc[i,event_1]
      row = {'from':i,'to':event_2,'inject':event_1,}
      swapped.append(row)
        
  
  return (df,swapped)

In [48]:
def swap_multiple_events(dic,deviation_rate,_matrix):
  res = []
  _matrix = _matrix.copy()
  
  for key, value in dic.items():
    len_list = len(value)
    dev = deviation_rate / len_list
    for i in range(len_list):
      mat,row = swapping(_matrix,key,value[i],dev)
      _matrix = mat
      res.append(row)
    
  return res
    

In [51]:
swap_multiple_events(group_dict,final_df,final_df)

swapping event //werk_nl/werknemer/werkmap with /werk_nl/werknemer/mijn_werkmap/doorgeven/taken with deviation rate: level_1                                               /   //  \
level_0                                                        
/                                                   0.0  0.0   
//                                                  0.0  0.0   
//portal/page/portal/werk_nl/werknemer/sollicit...  0.0  0.0   
//werk_nl/werknemer/contact                         0.0  0.0   
//werk_nl/werknemer/contact/email_uw_vraag          0.0  0.0   
...                                                 ...  ...   
/zoeken_portlet/ajax/addVacatureToGereageerd        0.0  0.0   
/zoeken_portlet/ajax/zoekAantalIndicatief           0.0  0.0   
/zoeken_portlet/ajax/zoekBeroep                     0.0  0.0   
/zoeken_portlet_wg/ajax/selectContactpersoon        0.0  0.0   
/zoeken_portlet_wg/ajax/selectVacature              0.0  0.0   

level_1                                           

ValueError: Incompatible indexer with DataFrame

In [6]:
def generate_markov_chain(amount:int,start_chances,df_dict:pd.DataFrame,bookkeeping:pd.DataFrame) -> list:
  generated_sessions = []
  while len(generated_sessions) < amount:
    new_session = []
    # choose start activity
    start_activity = np.random.choice(a = np.array(start_chances.index), size = 1, p = np.array(start_chances.values))
    # start_activity = np.random.choice(a = np.array(df_dict.iloc[0,:].index), size = 1, p = np.array(df_dict.iloc[0,:].values))
    new_session.append(start_activity[0])

    
    while new_session[-1] != 'end_session':
    # while new_session[-1] != 'E':
      curr_activity = new_session[-1]
      row_activity = df_dict.loc[curr_activity,:]
      activity = np.random.choice(a = list(row_activity.keys()), size = 1, p = list(row_activity.values))
      new_session.append(activity[0])
      
      # check if last 2 activities are found in the bookkeeping
      index = bookkeeping[((bookkeeping['from'] == curr_activity) & (bookkeeping['to'] == activity[0]))].index.tolist()
      if index:
        # pattern has been reached
        # next activity should be injected from bookkeeping
        injected_activity = bookkeeping.iloc[index]['inject'].values[0]
        curr_activity = new_session[-1]
        row_activity = df_dict.loc[curr_activity,:]
        activity = np.random.choice(a = list(row_activity.keys()), size = 1, p = list(row_activity.values))
        new_session.append(injected_activity)
        new_session.append(activity[0])

    generated_sessions.append(new_session)        
    
    if len(generated_sessions) % 250 == 0: print(len(generated_sessions))
  
  return generated_sessions

In [7]:
def generate_sessions(skipping_list:list,total_amount:int):
  session_per_activity = total_amount / len(skipping_list)
  sessions = []
  for item in skipping_list:
    markov = generate_markov_chain(session_per_activity,start_chances,item["new_df"],item['book'])
    sessions += markov
  return sessions
  

markov_result = generate_sessions(swapped,2500)
# res
print(len(markov_result))

250
500
750
250
500
750
250
500
750
2502


In [8]:
def markov_result_to_df(list_markov_result, name_index, name_csv):
    activity_list_final = []
    session_id_list = []
    for nbr in range(len(markov_result)):
        string_generated = name_index + str(nbr)
        for i in markov_result[nbr]:
            if i != "end_session":
                activity_list_final.append(i)
                session_id_list.append(string_generated)
                
    df_generated = pd.DataFrame(list(zip(session_id_list, activity_list_final)),
               columns =['SessionID', 'URL_FILE'])
    df_generated.to_csv(name_csv)
    
markov_result_to_df(markov_result, "generated_top3_50%_", "gen_sessions/swapped/generated__top3_50%_2500.csv")