In [2]:
import pandas as pd
from collections import Counter
import numpy as np

# Load the event log; undecodable bytes are ignored and malformed rows are skipped.
df = pd.read_csv(
    "fake_data.csv",
    sep=",",
    encoding_errors="ignore",
    on_bad_lines='skip',
)
# Alternative input (kept for reference): the BPI 2016 click log, semicolon separated.
# df = pd.read_csv("BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",
#                 usecols=['CustomerID', 'AgeCategory', 'Gender', 'Office_U', 'Office_W',
#        'SessionID', 'IPID', 'TIMESTAMP', 'VHOST', 'URL_FILE', 'PAGE_NAME',
#        'REF_URL_category', 'page_load_error', 'page_action_detail', 'tip',
#        'service_detail', 'xps_info'])

# The rest of the notebook refers to the URL_FILE column as "Activity".
df = df.rename(columns={'URL_FILE': 'Activity'})

In [3]:
# Parse the timestamps. `infer_datetime_format` is deprecated since pandas 2.0
# (format inference is the default there), so it is no longer passed.
df["TIMESTAMP"] = pd.to_datetime(df["TIMESTAMP"])

# Order the events chronologically within each session; the per-session shift
# that builds consecutive activity pairs depends on this row order.
df = df.sort_values(["SessionID", "TIMESTAMP"])

In [4]:
# For every event, look up the activity that follows it within the same session.
# The last event of a session has no successor and gets NaN.
next_activity = df.groupby("SessionID")["Activity"].shift(periods=-1)
df["Consecutive_1"] = next_activity

# Count how often each (activity, next activity) pair occurs and store the
# counts in a new DataFrame with one row per pair.
pair_counts = Counter(zip(df['Activity'], df['Consecutive_1']))
df_for_pivot_1 = pd.DataFrame(pair_counts, index=["value"]).T.reset_index()

# A missing successor means the session ended after that activity.
df_for_pivot_1 = df_for_pivot_1.fillna("end_session")
df_for_pivot_1

Unnamed: 0,level_0,level_1,value
0,Start_application,Input_info,2
1,Input_info,Send_application,2
2,Send_application,Accept_offer,3
3,Accept_offer,end_session,4
4,Input_info,end_session,1


In [5]:
def make_pivot(df, index_names, column_names):
    """
    Build a row-normalised table of transition chances from counted pairs.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format counts. Must contain a numeric ``value`` column plus the
        columns named by ``index_names`` and ``column_names``.
    index_names : str or list of str
        Column(s) whose values become the rows of the result.
    column_names : str or list of str
        Column(s) whose values become the columns of the result.

    Returns
    -------
    pd.DataFrame
        Each cell holds the cell's count divided by the total count of its
        row, i.e. the chance that the row value is succeeded by the column
        value. Combinations that never occur are 0.
    """
    # pivot_table averages by default; sum instead so that a pair listed on
    # several rows contributes its full count.
    df_counts = df.pivot_table(index=index_names, columns=column_names,
                               values='value', aggfunc='sum')

    # Divide every row by its own total to turn counts into probabilities.
    row_totals = df_counts.sum(axis=1)
    df_chances = df_counts.div(row_totals, axis=0)

    # Missing combinations (and rows whose total is 0) come out as NaN.
    return df_chances.fillna(0)

In [6]:
df_chances_1 = make_pivot(df_for_pivot_1, "level_0", "level_1")

# Chance that an activity is the first one performed in a session:
# count the first activity of every session and divide by the number counted.
first_activity_counts = df.groupby("SessionID").nth(0)["Activity"].value_counts()
df_chances_1["start_session_chance"] = first_activity_counts / first_activity_counts.sum()

# Activities that never open a session have no count and therefore get chance 0.
df_chances_1 = df_chances_1.fillna(0)

In [7]:
# Second name for the same table (no copy is made).
df_chances_total = df_chances_1

# Start probabilities, restricted to activities that open at least one session.
start_chances = df_chances_total["start_session_chance"].loc[lambda chance: chance > 0]

# Everything except the last column (the start chance): the transition chances.
df_chances_rest = df_chances_total.iloc[:, :-1]

In [8]:
# Display the transition chances: rows (level_0) are the current activity,
# columns (level_1) the activity that follows it.
df_chances_rest

level_1,Accept_offer,Input_info,Send_application,end_session
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accept_offer,0.0,0.0,0.0,1.0
Input_info,0.0,0.0,0.666667,0.333333
Send_application,1.0,0.0,0.0,0.0
Start_application,0.0,1.0,0.0,0.0


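Try to find paths of length 3, e.g. A → B → C.
Sort the paths by their chance of occurring.
The chance of a path is the product of its transitions:
P(A → B → C) = P(A → B) · P(B → C)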

In [104]:
def create_test_df_circle():
  """
  Return a toy transition matrix that contains cycles.

  Rows A-D are the current state, columns A-E the next state. Row B has a
  0.2 chance of staying in B and row D a 0.2 chance of going back to B.
  """
  states = ['A', 'B', 'C', 'D']
  chances_by_target = {
    'A': [0, 0, 0, 0],
    'B': [1, 0.2, 0, 0.2],
    'C': [0, 0.4, 0, 0],
    'D': [0, 0.4, 1, 0],
    'E': [0, 0, 0, 0.8],
  }
  return pd.DataFrame(chances_by_target, index=states)

def create_test_df():
  """
  Return a toy transition matrix with branching.

  Rows A-D are the current state, columns A-E the next state. Row B splits
  between C (0.6) and D (0.4); row D goes to E (0.8) or back to B (0.2).
  """
  states = ['A', 'B', 'C', 'D']
  chances_by_target = {
    'A': [0, 0, 0, 0],
    'B': [1, 0, 0, 0.2],
    'C': [0, 0.6, 0, 0],
    'D': [0, 0.4, 1, 0],
    'E': [0, 0, 0, 0.8],
  }
  return pd.DataFrame(chances_by_target, index=states)

def create_test_df_simple():
  """
  Return a deterministic toy transition matrix.

  Rows A-D are the current state, columns A-E the next state. Every row has
  a single transition with chance 1: A -> B -> C -> D -> E.
  """
  states = ['A', 'B', 'C', 'D']
  chances_by_target = {
    'A': [0, 0, 0, 0],
    'B': [1, 0, 0, 0],
    'C': [0, 1, 0, 0],
    'D': [0, 0, 1, 0],
    'E': [0, 0, 0, 1],
  }
  return pd.DataFrame(chances_by_target, index=states)

# Work with the deterministic chain (every transition has chance 1).
test_df = create_test_df_simple() 
# Alternative: the branching matrix.
# test_df = create_test_df() 
test_df 

Unnamed: 0,A,B,C,D,E
A,0,1,0,0,0
B,0,0,1,0,0
C,0,0,0,1,0
D,0,0,0,0,1


In [105]:
def replacing(_df:pd.DataFrame,event_1:str,event_2:str,deviation_rate):
  """
  Move part of every transition into ``event_1`` over to ``event_2``.

  Parameters
  ----------
  _df : pd.DataFrame
      Transition matrix (rows: current activity, columns: next activity).
      It is not modified; the function works on a copy.
  event_1 : str
      Column whose chances are reduced.
  event_2 : str
      Column that receives the chance taken away from ``event_1``.
  deviation_rate : number
      Percentage (0-100) of each ``event_1`` chance that is moved.

  Returns
  -------
  tuple of (pd.DataFrame, pd.DataFrame)
      The modified matrix, and a bookkeeping frame with one row per changed
      matrix row and the columns ``from``, ``to``, ``replaced`` and ``%``.
      ``%`` is the moved chance divided by the resulting ``event_2`` chance.
      The bookkeeping frame has these columns even when nothing was changed.
  """
  df = _df.copy()
  # The two touched columns receive fractional values; make them float up
  # front instead of writing floats into integer columns cell by cell.
  for column in (event_1, event_2):
    df[column] = df[column].astype(float)

  deviation_rate = deviation_rate / 100
  print(f"replacing event {event_1} with {event_2} with deviation rate: {deviation_rate}")

  replaced = []
  # Iterate over the untouched input so the loop never reads a cell it has
  # already rewritten.
  for i, old_val in _df[event_1].items():
    if old_val > 0:
      kept = old_val * (1 - deviation_rate)
      moved = old_val - kept

      df.loc[i, event_1] = kept
      df.loc[i, event_2] += moved

      new_total = df.loc[i, event_2]
      # Guard against 0/0 when nothing was moved and event_2 had no chance.
      share = moved / new_total if new_total else 0.0
      replaced.append({'from': i, 'to': event_2, 'replaced': event_1, '%': share})

  # Fix the columns explicitly so an empty result still has the schema that
  # consumers index into.
  replaced_bookkeeping = pd.DataFrame(replaced, columns=['from', 'to', 'replaced', '%'])
  return (df, replaced_bookkeeping)

# Move 50% of every transition into 'C' over to 'D'. The second return value
# records which matrix rows were changed.
replaced_df,replaced_bookkeeping = replacing(test_df,'C','D',50)
print(replaced_bookkeeping)
replaced_df

replacing event C with D with deviation rate: 0.5
  from to replaced    %
0    B  D        C  1.0


Unnamed: 0,A,B,C,D,E
A,0,1,0.0,0.0,0
B,0,0,0.5,0.5,0
C,0,0,0.0,1.0,0
D,0,0,0.0,0.0,1


In [119]:
def _draw_activity(distribution):
  """Sample one label from a Series whose index holds the labels and whose values hold their probabilities."""
  return np.random.choice(a = np.array(distribution.index), size = 1, p = np.array(distribution.values))[0]


def generate_markov_chain(amount:int,start_chances,df_dict:pd.DataFrame,bookkeeping:pd.DataFrame,end_activity='E') -> list:
  """
  Generate random sessions by walking a transition matrix.

  Parameters
  ----------
  amount : int
      Number of sessions to generate.
  start_chances
      Currently unused: the first activity is drawn from the first row of
      ``df_dict`` instead. Kept so existing calls keep working.
  df_dict : pd.DataFrame
      Transition matrix (rows: current activity, columns: next activity).
      Each row is used as the probability vector for ``np.random.choice``.
  bookkeeping : pd.DataFrame
      Frame with the columns ``from``, ``to`` and ``replaced``. May be
      ``None`` or empty, in which case no replacement handling happens.
  end_activity : str, default 'E'
      Activity that terminates a session.

  Returns
  -------
  list of list
      One list of activities per session, each ending with ``end_activity``.

  Notes
  -----
  Whenever a drawn transition (current, next) is listed in ``bookkeeping``,
  one extra activity is drawn from the row of the replaced activity and
  appended as well. The walk then continues from that extra activity.
  NOTE(review): the ``%`` column of ``bookkeeping`` is not consulted, so a
  listed transition is always treated as a replacement - confirm intended.
  """
  # Map (from, to) -> replaced activity once, instead of scanning the frame
  # on every step. For duplicate pairs the first row wins.
  replaced_lookup = {}
  if bookkeeping is not None and not bookkeeping.empty:
    for source, target, original in zip(bookkeeping['from'], bookkeeping['to'], bookkeeping['replaced']):
      replaced_lookup.setdefault((source, target), original)

  generated_sessions = []
  while len(generated_sessions) < amount:
    # choose start activity from the first matrix row
    new_session = [_draw_activity(df_dict.iloc[0,:])]

    while new_session[-1] != end_activity:
      curr_activity = new_session[-1]
      activity = _draw_activity(df_dict.loc[curr_activity,:])
      new_session.append(activity)

      # check if the last 2 activities are found in the bookkeeping
      replaced_activity = replaced_lookup.get((curr_activity, activity))
      if replaced_activity is not None:
        # pattern has been reached: next activity comes from the replaced row
        new_session.append(_draw_activity(df_dict.loc[replaced_activity,:]))

    generated_sessions.append(new_session)

    # progress indicator
    if len(generated_sessions) % 250 == 0: print(len(generated_sessions))

  return generated_sessions

# Generate 2000 sessions from the modified matrix. generate_markov_chain
# currently ignores start_chances.
test = generate_markov_chain(2000,start_chances,replaced_df,replaced_bookkeeping)
# NOTE(review): this displays all 2000 sessions; consider showing a slice such as test[:10].
test


250
500
750
1000
1250
1500
1750
2000


[['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'D', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C',