In [1]:
import pandas as pd
from collections import Counter
import numpy as np

df = pd.read_csv("fake_data.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",)
# df = pd.read_csv("BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",
#                 usecols=['CustomerID', 'AgeCategory', 'Gender', 'Office_U', 'Office_W',
#        'SessionID', 'IPID', 'TIMESTAMP', 'VHOST', 'URL_FILE', 'PAGE_NAME',
#        'REF_URL_category', 'page_load_error', 'page_action_detail', 'tip',
#        'service_detail', 'xps_info'])
df.rename(columns={'URL_FILE':'Activity'}, inplace=True)

In [2]:
df["TIMESTAMP"] = pd.to_datetime(df["TIMESTAMP"], infer_datetime_format=True)
df = df.sort_values(["SessionID", "TIMESTAMP"])

In [3]:
#Create consecutive column with the consecutive activity and count how many time each pair occurs
df["Consecutive_1"] = df.groupby("SessionID")["Activity"].shift(periods=-1)

#Count all the pairs and safe in new DataFrame
df_for_pivot_1 = pd.DataFrame(Counter(list(zip(df['Activity'], df['Consecutive_1'],))), index=["value"]).T.reset_index()

df_for_pivot_1.fillna("end_session", inplace=True)
df_for_pivot_1

Unnamed: 0,level_0,level_1,value
0,Start_application,Input_info,2
1,Input_info,Send_application,2
2,Send_application,Accept_offer,3
3,Accept_offer,end_session,4
4,Input_info,end_session,1


In [4]:
def make_pivot(df, index_names, column_names):
    """
    Functions that takes in a DataFrame and returns a pivot table with all the chances
    
    Create pivot table where chances are calculated that each row is succeded by activity that is represented in the column
    """
    df_chances = df.pivot_table(index=index_names, columns=column_names, values='value')
    
    #Calculate what the probabilities are by summing the row and dividing all the values in the row by total sum of the row
    df_chances["total_row_count"] = df_chances.sum(axis=1)
    df_chances = df_chances.div(df_chances["total_row_count"], axis=0)
    
    df_chances.fillna(0, inplace=True)
    df_chances.drop("total_row_count", axis=1, inplace=True)

    return df_chances

In [5]:
df_chances_1 = make_pivot(df_for_pivot_1, "level_0", "level_1")

#Calculate the probability of each activity that it is the first activity performed in the session
df_chances_1["start_session_chance"] = df.groupby("SessionID").nth(0)["Activity"].value_counts() / df.groupby("SessionID").nth(0)["Activity"].value_counts().sum()
df_chances_1.fillna(0, inplace=True)

In [6]:
df_chances_total = df_chances_1 
df_chances_total

#Split the start probabilities from the normal DataFrame
start_chances = df_chances_total["start_session_chance"][df_chances_total["start_session_chance"] > 0]

#Create df with all chances except the starting chance.
df_chances_rest = df_chances_total.iloc[:, :-1]



In [7]:
df_chances_rest
# start_chances

level_1,Accept_offer,Input_info,Send_application,end_session
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accept_offer,0.0,0.0,0.0,1.0
Input_info,0.0,0.0,0.666667,0.333333
Send_application,1.0,0.0,0.0,0.0
Start_application,0.0,1.0,0.0,0.0


Try to find paths of length 3.
A - B - C
Sort on chance. 
So chance a to b * b to c 

In [8]:
def find_paths(_df:pd.DataFrame,len:int = 3):
  df = _df.copy()
  col = df.columns.values
  row  = df.index.values
  s  = row.size
  res = []
  while s > 0:
    current_row = df.iloc[row.size - s]
    i = 0
    while i < current_row.size:
      current_col = df.columns[i]
      chance = current_row.iloc[i]
      if chance > 0:
        a = current_row.name
        b = current_col
        
        if (df.index == b).any() :
          _r = df.loc[b]
          index = 0
          for r in _r:
            if r > 0:
              res.append([a,b,col.item(index),r*chance,chance,r])  
            index += 1        
            
      i += 1
    s -= 1

  return res

# paths = find_paths(df_chances_rest)

In [16]:
def create_test_df_circle():
  d = {'A': [0,0,0,0],'B': [1,0.2,0,0.2], 'C': [0,0.4,0,0],'D':[0,0.4,1,0],'E':[0,0,0,0.8]}
  df = pd.DataFrame(data=d)
  df.index = ['A','B','C','D']
  return df

def create_test_df():
  d = {'A': [0,0,0,0],'B': [1,0,0,0.2], 'C': [0,1,0,0],'D':[0,0,1,0],'E':[0,0,0,0.8]}
  df = pd.DataFrame(data=d)
  df.index = ['A','B','C','D']
  return df

test_df = create_test_df() 
# test_df = create_test_df_circle() 
test_df 

Unnamed: 0,A,B,C,D,E
A,0,1.0,0,0,0.0
B,0,0.0,1,0,0.0
C,0,0.0,0,1,0.0
D,0,0.2,0,0,0.8


In [17]:
def skip_event_all_possibilities(_df:pd.DataFrame,event:str,deviation_rate):
  df = _df.copy()
  print(f"Decrease the probability to reach event: {event} with deviation rate: {deviation_rate}")
  old_probability = df.loc[:,event]
  deviation_rate = deviation_rate / 100
  
  # decrease the probability
  for i,prob in old_probability.items():
    # skip row of event
    if i == event: continue
    
    # start changing the prob 
    if prob > 0:
      decrease = prob * (1-deviation_rate)
      df.loc[i,event] -= prob - decrease      
      for _i,_prob in df.loc[i,:].items():
        df.loc[i,_i] += (prob - decrease) * df.loc[event,_i]
  
  
  # change row of event
  if df.loc[event,event] > 0:
    decrease = df.loc[event,event] * (1 - deviation_rate)
    original_value = df.loc[event,event]
    count = df.loc[event]
    count = count[count > 0].__len__() - 1
    
    
    for i,prob in  df.loc[event,:].items():
      
      if df.loc[event,i] > 0:
        # Circle case
        if i == event: 
          
          df.loc[event,i] -= prob - decrease
        else: df.loc[event,i] += (original_value - decrease) / count
        
    
      
  return df
       

# new_df = skip_event_all_possibilities(df_chances_rest,"/portal/page/portal/home/diensten/aanvragen-ww",4)
new_df = skip_event_all_possibilities(test_df,"C",50)

new_df
  

Decrease the probability to reach event: C with deviation rate: 50


Unnamed: 0,A,B,C,D,E
A,0,1.0,0.0,0.0,0.0
B,0,0.0,0.5,0.5,0.0
C,0,0.0,0.0,1.0,0.0
D,0,0.2,0.0,0.0,0.8


In [18]:
sum_check = new_df.copy()
if not "Sum" in sum_check: sum_check['Sum'] = sum_check.sum(axis=1)


if (sum_check['Sum'] == sum_check['Sum'][0]).all():
    print("All values are equal in column 'Sum'")
else:
    print("All values are not equal  in column 'Sum'")

sum_check.sort_values(['Sum'])
# sum_check['Sum'].to_csv("test.csv")

All values are equal in column 'Sum'


Unnamed: 0,A,B,C,D,E,Sum
A,0,1.0,0.0,0.0,0.0,1.0
B,0,0.0,0.5,0.5,0.0,1.0
C,0,0.0,0.0,1.0,0.0,1.0
D,0,0.2,0.0,0.0,0.8,1.0


In [30]:
def generate_markov_chain(amount:int,start_chances,df_dict:pd.DataFrame) -> list:
  generated_sessions = []
  while len(generated_sessions) < amount:
    new_session = []
    # choose start activity
    # start_activity = np.random.choice(a = np.array(start_chances.index), size = 1, p = np.array(start_chances.values))
    start_activity = np.random.choice(a = np.array(df_dict.iloc[0,:].index), size = 1, p = np.array(df_dict.iloc[0,:].values))
    new_session.append(start_activity[0])

    
    # while new_session[-1] != 'end_session':
    while new_session[-1] != 'E':
      curr_activity = new_session[-1]
      row_activity = df_dict.loc[curr_activity,:]
      activity = np.random.choice(a = list(row_activity.keys()), size = 1, p = list(row_activity.values))
      new_session.append(activity[0])
    
    generated_sessions.append(new_session)        
    
    if len(generated_sessions) % 250 == 0: print(len(generated_sessions))
  
  return generated_sessions

test = generate_markov_chain(100,start_chances,new_df)
test


[['B', 'D', 'B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'D', 'B', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'B', 'D', 'E'],
 ['B', 'D', 'B', 'D', 'B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'B', 'C', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'D', 'E'],
 ['B', 'D', 'B', 'D', 'E'],
 ['B', 'C', 'D', 'E'],
 ['B', 'C', 'D', 'B

In [None]:
def markov_chain(amount, start_chances, df_dict):
    lijst_aantal_gegenereerde = []

    #Loop through the df with all the chances and based on this a new session is generated
    aanvragen_ww_list = []
    while len(aanvragen_ww_list) <= amount:
        activity_list = []
        activity = np.random.choice(a = np.array(start_chances.index), size = 1, p = np.array(start_chances.values))
        activity_list.append(activity[0])

        while activity != "end_session":
            row_activity = df_dict[activity[0]]
            activity = np.random.choice(a = list(row_activity.keys()), size = 1, p = list(row_activity.values()))
            activity_list.append(activity[0])

        if "/portal/page/portal/home/diensten/aanvragen-ww" in activity_list:
            aanvragen_ww_list.append(activity_list)

        #print how many sessions are already generated by the Markov chain. Only print every 250 generated sessions
        if len(aanvragen_ww_list) % 250 == 0 and len(aanvragen_ww_list) not in lijst_aantal_gegenereerde:
            print(len(aanvragen_ww_list))
            lijst_aantal_gegenereerde.append(len(aanvragen_ww_list))
    
    return aanvragen_ww_list

markov_result = markov_chain(2000, start_chances, df_dict)
# markov_result

KeyError: 'Accept_offer'