In [1]:
import pandas as pd
from collections import Counter
import numpy as np

# base_df = pd.read_csv("../no_repeated.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",)
# Load only the three columns used below: the session key, the click time and the visited URL.
base_df = pd.read_csv(
    "../BPI2016_Clicks_Logged_In.csv",
    encoding_errors="ignore",
    on_bad_lines='skip',
    sep=";",
    usecols=['SessionID', 'TIMESTAMP', 'URL_FILE'],
)
# The visited URL is used as the "Activity" label in the rest of the notebook.
base_df = base_df.rename(columns={'URL_FILE': 'Activity'})
# `infer_datetime_format` was dropped here: it is deprecated since pandas 2.0, where it has
# no effect and only raises a warning.
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"])
# Order the clicks chronologically inside each session so that consecutive rows of one
# session are consecutive activities.
base_df = base_df.sort_values(["SessionID", "TIMESTAMP"])
base_df.head()

Unnamed: 0,SessionID,TIMESTAMP,Activity
3273278,46,2015-11-06 08:07:22.780,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken
5333642,46,2015-11-06 08:07:40.767,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...
3733243,46,2015-11-06 08:07:51.390,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...
5904405,46,2015-11-06 08:08:06.003,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...
5573282,46,2015-11-06 08:08:19.343,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...


In [2]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  """
  Count how often each activity is directly followed by another one inside a session.

  Parameters
  ----------
  df : DataFrame with at least the columns "SessionID" and "Activity", already ordered
       so that consecutive rows of one session are consecutive activities.
       The frame is not modified.

  Returns
  -------
  DataFrame with one row per observed pair and the columns
    level_0 : the activity
    level_1 : the activity that directly follows it, or "end_session" when level_0
              was the last activity of its session
    value   : how many times the pair occurs
  Pairs appear in order of first occurrence. Rows whose "Activity" is missing are not
  counted as an origin activity.
  """
  # The successor is looked up per session, so the last activity of a session gets no
  # successor; label that explicitly instead of carrying NaN through the counting step.
  successor = df.groupby("SessionID")["Activity"].shift(periods=-1).fillna("end_session")

  # Plain arrays avoid any index alignment between the two columns.
  pairs = pd.DataFrame({
    "level_0": df["Activity"].to_numpy(),
    "level_1": successor.to_numpy(),
  })

  # Vectorised pair count; sort=False keeps the pairs in order of first occurrence.
  pair_counts = pairs.groupby(["level_0", "level_1"], sort=False).size()
  return pair_counts.reset_index(name="value")
  
def make_pivot(df, index_names, column_names):
  """
  Turn a table of pair counts into a table of transition probabilities.

  Parameters
  ----------
  df           : DataFrame holding the counts in a column named "value".
  index_names  : column(s) of `df` that become the rows of the result (the current activity).
  column_names : column(s) of `df` that become the columns of the result (the next activity).

  Returns
  -------
  DataFrame in which every cell is the count of that (row, column) pair divided by the
  total count of its row. Pairs that never occur are 0, so every row with at least one
  observation sums to 1.
  """
  # Counts must be added up when a pair is listed more than once; the pivot_table
  # default (mean) would average them instead.
  counts = df.pivot_table(index=index_names, columns=column_names, values='value', aggfunc="sum")

  # Keep the row totals in a separate Series rather than in a helper column, so they
  # can never clash with a real activity name.
  row_totals = counts.sum(axis=1)

  # Unobserved pairs (NaN) and rows without any observation (0 / 0) become probability 0.
  df_chances = counts.div(row_totals, axis=0).fillna(0)

  return df_chances

  
# Count the consecutive activity pairs and convert the counts into transition probabilities
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

# Share of sessions that open with each activity: count of the first activity per session
# divided by the number of counted first activities
first_activity_counts = base_df.groupby("SessionID").nth(0)["Activity"].value_counts()
df_chances_1["start_session_chance"] = first_activity_counts / first_activity_counts.sum()
# Activities that never open a session get a start probability of 0
df_chances_1.fillna(0, inplace=True)

df_chances_total = df_chances_1

# Start distribution: keep only the activities that opened at least one session
opens_a_session = df_chances_total["start_session_chance"] > 0
start_chances = df_chances_total.loc[opens_a_session, "start_session_chance"]

# Transition matrix: every column except the last one (the start probability added above)
final_df = df_chances_total.iloc[:, :-1]

final_df

level_1,/,//,//portal/page/portal/werk_nl/werknemer/solliciteren/solliciteren-werk-zoeken/sollicitatiebrief,//werk_nl/werknemer/contact,//werk_nl/werknemer/contact/email_uw_vraag,//werk_nl/werknemer/contact/vestiging-zoeken,//werk_nl/werknemer/home,//werk_nl/werknemer/over-werk-nl/handleiding,//werk_nl/werknemer/over-werk-nl/handleiding/cv-plaatsen,//werk_nl/werknemer/over-werk-nl/handleiding/sollicitatie-wijziging,...,/xpsitem/wdo_013389,/xpsitem/wdo_013407,/xpsitem/wdo_013827,/xpsitem/wdo_014041,/zoeken_portlet/ajax/addVacatureToGereageerd,/zoeken_portlet/ajax/zoekAantalIndicatief,/zoeken_portlet/ajax/zoekBeroep,/zoeken_portlet_wg/ajax/selectContactpersoon,/zoeken_portlet_wg/ajax/selectVacature,end_session
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.073834
//,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.035714
//portal/page/portal/werk_nl/werknemer/solliciteren/solliciteren-werk-zoeken/sollicitatiebrief,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
//werk_nl/werknemer/contact,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
//werk_nl/werknemer/contact/email_uw_vraag,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.058824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/zoeken_portlet/ajax/addVacatureToGereageerd,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017588,0.002513,0.005025,0.0,0.0,0.055276
/zoeken_portlet/ajax/zoekAantalIndicatief,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000107,0.537184,0.051269,0.0,0.0,0.007539
/zoeken_portlet/ajax/zoekBeroep,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.441592,0.489910,0.0,0.0,0.003979
/zoeken_portlet_wg/ajax/selectContactpersoon,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000


In [3]:
def change_matrix(_matrix:pd.DataFrame, _from:str, _to:list[str],_replace:list[str],dev:int):
  """
  Shift probability mass in one row of a transition matrix.

  For the row `_from`, the probability of every activity in `_replace` is reduced by
  `dev` percent, and the removed mass is divided equally over the activities in `_to`.
  The total of the row therefore stays the same.

  Parameters
  ----------
  _matrix  : transition matrix (rows: current activity, columns: next activity).
             It is not modified; a changed copy is returned.
  _from    : row label whose outgoing probabilities are changed.
  _to      : column labels that receive the removed probability mass.
  _replace : column labels whose probability is reduced. Any length is accepted.
  dev      : deviation rate in percent (100 removes the transitions completely).

  Returns
  -------
  A copy of `_matrix` with the adjusted row. When `_to` is empty there is nowhere to
  move the mass to, so the copy is returned unchanged.
  """
  matrix = _matrix.copy()

  # list() also accepts a numpy array, whose truth value would otherwise be ambiguous.
  receivers = list(_to)
  if not receivers:
    return matrix

  deviation_rate = dev / 100
  receiver_count = len(receivers)

  # The labels are handled one after another, so a label that is both reduced and
  # receiving is treated with its value at that moment.
  for replaced in _replace:
    old_prob = matrix.loc[_from, replaced]
    new_prob = old_prob * (1 - deviation_rate)
    matrix.loc[_from, replaced] = new_prob

    # Equal share of the removed mass for every receiving activity.
    increase_prob = (old_prob - new_prob) / receiver_count
    for receiver in receivers:
      matrix.loc[_from, receiver] += increase_prob

  return matrix

In [4]:
# import patterns
patterns = pd.read_csv("./gen_patterns/Replaced.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",)
patterns.head()
patterns_df = patterns.copy()
patterns_df

Unnamed: 0,Activity_0,Activity_1,Activity_2,replace
0,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,//werk_nl/werknemer/home,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/mi...
1,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/portal/page/portal/werk_nl/werknemer/contact,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/mi...
2,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/portal/page/portal/werk_nl/werknemer/eintake/...,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/mi...
3,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/portal/page/portal/werk_nl/werknemer/over-wer...,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/mi...
4,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/portal/page/portal/werk_nl/werknemer/over-wer...,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/mi...
...,...,...,...,...
86,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/xpsimage/wdo_014521,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/ta...
87,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/xpsitem/wdo212424,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/ta...
88,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/xpsitem/wdo_013142,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/ta...
89,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/zoeken_portlet/ajax/addVacatureToGereageerd,/werk_nl/werknemer/home,['/werk_nl/werknemer/mijn_werkmap/doorgeven/ta...


In [5]:
# def gen_patters(_patterns:pd.DataFrame) -> list:
#   patterns = _patterns.copy()
#   res = []
#   # loop over the rows in the dataframe
#   for row in patterns.iterrows():
#       # get the values of the row
#       res.append([row[1][0], row[1][1], row[1][2]])
        
        
#   return res

# test = gen_patters(patterns_df)
# test
# markov_result_to_df(test, 'only_patterns','gen_sessions/only_patterns.csv')

In [6]:
from ast import literal_eval
def create_obj(_df:pd.DataFrame):
  """
  Group a patterns table by its first activity.

  Parameters
  ----------
  _df : DataFrame with the columns "Activity_0", "Activity_1" and "replace", where
        "replace" holds the text form of a Python literal (e.g. "['a', 'b', 'c']").

  Returns
  -------
  dict that maps every distinct "Activity_0" value (in order of first appearance) to
    {"Activity_1": array of the distinct "Activity_1" values of its rows,
     "replace":    the parsed "replace" literal of its first row}
  """
  obj = {}
  # Group the frame that was passed in (not a notebook-level variable), so the result
  # depends only on the argument.
  for item, rows in _df.groupby("Activity_0", sort=False):
    obj[item] = {
      "Activity_1": rows["Activity_1"].unique(),
      # literal_eval only parses literals; it does not execute code.
      "replace": literal_eval(rows["replace"].iloc[0]),
    }
  return obj
    
    

In [7]:
# Group the loaded patterns per first activity:
# {Activity_0: {"Activity_1": distinct second activities, "replace": parsed replace list}}
patterns_obj = create_obj(patterns_df)


In [8]:
def change_multiple_patterns(_matrix:pd.DataFrame, _obj:dict, _dev:int):
  """
  Apply `change_matrix` once for every entry of a patterns dictionary.

  Parameters
  ----------
  _matrix : transition matrix; it is copied first, so the argument is left untouched.
  _obj    : dict mapping a row label to a dict with the keys "Activity_1" and "replace".
  _dev    : deviation rate that is handed to `change_matrix` for every entry.

  Returns
  -------
  The matrix after all entries have been applied, in the iteration order of `_obj`.
  """
  adjusted = _matrix.copy()
  for origin, spec in _obj.items():
    # Each call works on the result of the previous one
    adjusted = change_matrix(adjusted, origin, spec["Activity_1"], spec["replace"], _dev)
  return adjusted

In [9]:
def gen_markov_chain(amount:int,start_chances,df_dict:pd.DataFrame, patterns_df:pd.DataFrame,inject:str, seed=None) -> list:
  """
  Generate sessions by walking a Markov chain and injecting an activity after a pattern.

  A session starts with an activity drawn from `start_chances` and is extended by drawing
  the next activity from the row of the current activity in `df_dict`, until
  'end_session' is drawn. The first time a drawn step (current activity, next activity)
  equals a row (Activity_0, Activity_1) of `patterns_df`, `inject` is appended right
  after it and the walk continues from `inject`. Only sessions in which this happened
  are kept.

  Parameters
  ----------
  amount        : number of sessions to return.
  start_chances : Series of start probabilities, indexed by activity.
  df_dict       : transition matrix (rows: current activity, columns: next activity,
                  including 'end_session'). It is only read.
  patterns_df   : DataFrame with the columns "Activity_0" and "Activity_1".
  inject        : activity appended after the first matched pattern; it must be a row
                  label of `df_dict` because the walk continues from it.
  seed          : optional seed for numpy's global random state, for reproducible
                  output. The default (None) leaves the random state untouched.

  Returns
  -------
  list of sessions; each session is a list of activities that ends with 'end_session'.

  Notes
  -----
  Sessions without a pattern are discarded and generation goes on until `amount`
  sessions were collected, so the function does not return when no pattern can be
  generated from the given matrix. The running count is printed every 50 sessions.
  """
  if seed is not None:
    np.random.seed(seed)

  # Everything that does not change between draws is prepared once, outside the loops.
  start_activities = np.array(start_chances.index)
  start_probs = np.array(start_chances.values)
  activities = np.array(list(df_dict.columns))
  # A set lookup replaces a scan over the whole patterns frame on every step.
  pattern_pairs = set(zip(patterns_df['Activity_0'], patterns_df['Activity_1']))

  generated_sessions = []
  while len(generated_sessions) < amount:
    pattern_seen = False
    # choose start activity
    new_session = [np.random.choice(a = start_activities, size = 1, p = start_probs)[0]]

    while new_session[-1] != 'end_session':
      curr_activity = new_session[-1]
      transition_probs = df_dict.loc[curr_activity, :].to_numpy()
      new_activity = np.random.choice(a = activities, size = 1, p = transition_probs)[0]
      new_session.append(new_activity)
      # only the first pattern of a session triggers an injection
      if not pattern_seen and (curr_activity, new_activity) in pattern_pairs:
        pattern_seen = True
        new_session.append(inject)

    if pattern_seen:
      generated_sessions.append(new_session)
      if len(generated_sessions) % 50 == 0:
        print(len(generated_sessions))

  return generated_sessions


In [10]:
def markov_result_to_df(markov_result, name_index, name_csv):
    """
    Flatten generated sessions into (SessionID, URL_FILE) rows and write them as CSV.

    Parameters
    ----------
    markov_result : list of sessions, each a list of activities.
    name_index    : prefix of the session id; the position of the session in
                    `markov_result` is appended to it.
    name_csv      : target that is handed to `DataFrame.to_csv`.

    The "end_session" marker is left out of the output. Nothing is returned.
    """
    rows = [
        (name_index + str(position), activity)
        for position, session in enumerate(markov_result)
        for activity in session
        if activity != "end_session"
    ]
    generated = pd.DataFrame(rows, columns=['SessionID', 'URL_FILE'])
    generated.to_csv(name_csv, index=False)

In [11]:
# Deviation rates to generate sessions for. change_matrix divides the rate by 100,
# so these values are percentages.
deviation_rates = [5,10,25,50,75,100]

In [12]:
def gen_all_sessions(rates:list[int],amount:int,base_df=final_df, start_chances_df=start_chances, patterns_obj=patterns_obj, patterns_df=patterns_df,inject="/werk_nl/werknemer/home"):
  """
  Generate and store `amount` sessions for every deviation rate in `rates`.

  For each rate the transition matrix is adjusted with `change_multiple_patterns`,
  sessions are generated with `gen_markov_chain`, and the result is written with
  `markov_result_to_df` to 'gen_sessions/<rate>_<amount>.csv', using 'rate_<rate>_'
  as session id prefix. Progress messages are printed along the way.

  Parameters
  ----------
  rates            : deviation rates to process, in this order.
  amount           : number of sessions to generate per rate.
  base_df          : unmodified transition matrix.
  start_chances_df : start probabilities per activity.
  patterns_obj     : patterns grouped per first activity (input of `change_multiple_patterns`).
  patterns_df      : patterns table (input of `gen_markov_chain`).
  inject           : activity that is injected after a generated pattern.

  The default arguments are the notebook-level objects as they were when this cell was run.
  """
  for rate in rates:
    session_prefix = f'rate_{rate}_'
    output_csv = f'gen_sessions/{rate}_{amount}.csv'

    print(f'start with rate: {rate}')
    print(f'Generate the prob matrix for rate: {rate}')
    # Every rate starts again from a copy of the unmodified matrix
    prob_matrix = change_multiple_patterns(base_df.copy(), patterns_obj, rate)

    print(f'Generate sessions rate: {rate}')
    sessions = gen_markov_chain(amount,start_chances_df,prob_matrix,patterns_df,inject)
    print(f'Sessions Generated')

    markov_result_to_df(sessions, session_prefix, output_csv)
    print(f'done with {rate}')
    

In [13]:
# Generate 5000 sessions for every deviation rate; each rate is written to
# gen_sessions/<rate>_5000.csv.
# NOTE(review): no random seed is set before this call, so the generated sessions
# differ between runs.
gen_all_sessions(deviation_rates,5000)

start with rate: 5
Generate the prob matrix for rate: 5
Generate sessions rate: 5
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
Sessions Generated
done with 5
start with rate: 10
Generate the prob matrix for rate: 10
Generate sessions rate: 10
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
34