In [None]:
from mlxtend.frequent_patterns import apriori
import pandas as pd
from itertools import combinations
import os
import json
import csv
from itertools import product
from collections import Counter
import warnings

In [None]:

def GSPG(single_sequence, sequences_list):
    """Locate every embedding of ``single_sequence`` inside the given sequences.

    Args:
        single_sequence: Ordered list of item values to look for. Items are
            compared against the first field of each event (``event[0]``).
        sequences_list: List of sequences; every sequence is an indexable
            collection of events whose first field is the item value.

    Returns:
        A list with one entry per sequence that contains ``single_sequence``
        as an ordered (not necessarily contiguous) subsequence. Each entry is
        the list of all embeddings found in that sequence; an embedding has
        the same length as its host sequence, keeps the matched events in
        place and holds ``(0, '0:00-0:00')`` at every other position.
    """
    placeholder = (0, '0:00-0:00')
    pattern_length = len(single_sequence)

    def contains_pattern(host):
        # Greedy left-to-right scan: advance through the pattern each time the
        # next expected item shows up in the host sequence.
        matched = 0
        for event in host:
            if event[0] == single_sequence[matched]:
                matched += 1
                if matched == pattern_length:
                    return True
        return False

    def all_embeddings(host):
        found = []

        def extend(depth, first_candidate, partial):
            # ``depth`` pattern items are already placed; try every later
            # host position for the next one.
            if depth == pattern_length:
                found.append(list(partial))
                return
            for position in range(first_candidate, len(host)):
                if host[position][0] != single_sequence[depth]:
                    continue
                grown = list(partial)
                grown[position] = host[position]
                extend(depth + 1, position + 1, grown)

        extend(0, 0, [placeholder] * len(host))
        return found

    return [all_embeddings(host) for host in sequences_list if contains_pattern(host)]


In [None]:
### Median time gap (in minutes) between each pair of consecutive elements of a pattern (`se1`)

import datetime
import statistics

def parse_time_interval(interval):
    """Split an ``'H:MM-H:MM'`` string into its two endpoints.

    Args:
        interval: Text made of a start and an end time joined by ``'-'``.

    Returns:
        A ``(start, end)`` tuple of ``datetime.datetime`` objects parsed with
        the ``'%H:%M'`` format.
    """
    begin_text, finish_text = interval.split('-')
    time_format = '%H:%M'
    return (
        datetime.datetime.strptime(begin_text, time_format),
        datetime.datetime.strptime(finish_text, time_format),
    )

def calculate_time_distance(start_time, end_time):
    """Return the signed difference ``end_time - start_time`` in minutes.

    The result is negative when ``end_time`` precedes ``start_time``.
    """
    elapsed_seconds = (end_time - start_time).total_seconds()
    return elapsed_seconds / 60

def get_time_distances(dataset, se1):
    """Collect the gaps observed between consecutive elements of ``se1``.

    Args:
        dataset: Iterable of groups; each group is an iterable of sequences
            of ``(value, 'H:MM-H:MM')`` events.
        se1: The pattern; every pair of neighbouring values in it becomes a
            key of the result.

    Returns:
        A dict mapping each ``(se1[i], se1[i+1])`` pair to the list of gaps
        (minutes from the end of the first event to the start of the second)
        found between adjacent events carrying those values. Within one
        group, an identical pair of events is counted only once.
    """
    gaps_by_pair = {}
    for position in range(1, len(se1)):
        gaps_by_pair[(se1[position - 1], se1[position])] = []

    for group_number, group in enumerate(dataset):
        # Reset per group so the same events showing up in several sequences
        # of one group contribute a single gap.
        recorded = set()
        for events in group:
            for position in range(1, len(events)):
                first_value, first_interval = events[position - 1]
                second_value, second_interval = events[position]
                key = (first_value, second_value)
                if key not in gaps_by_pair:
                    continue
                _, first_end = parse_time_interval(first_interval)
                second_start, _ = parse_time_interval(second_interval)
                gap = calculate_time_distance(first_end, second_start)
                occurrence = (first_value, second_value, first_interval, second_interval, group_number)
                if occurrence in recorded:
                    continue
                recorded.add(occurrence)
                gaps_by_pair[key].append(gap)

    return gaps_by_pair

def calculate_median_distances(distances):
    """Reduce each list of gaps to its median.

    Args:
        distances: Dict mapping a pair to a list of numeric gaps.

    Returns:
        A dict with the same keys, in the same order, whose values are the
        median of the corresponding list, or ``None`` when the list is empty.
    """
    return {
        pair: (statistics.median(samples) if samples else None)
        for pair, samples in distances.items()
    }

def analyze_time_distances(dataset, se1):
    """Summarise the median gap between consecutive elements of ``se1``.

    Gathers the gaps with ``get_time_distances`` and reduces them with
    ``calculate_median_distances``.

    Returns:
        A list of ``(first_value, second_value, median_gap)`` tuples, one per
        distinct pair of neighbouring values in ``se1``; ``median_gap`` is
        ``None`` when no gap was observed for that pair.
    """
    medians = calculate_median_distances(get_time_distances(dataset, se1))
    summary = []
    for (first_value, second_value), median_gap in medians.items():
        summary.append((first_value, second_value, median_gap))
    return summary



In [None]:
### median duration and median starting time of extracted frequent patterns
import numpy as np
from datetime import timedelta

def duration_starting(dataset, sequence):
    """Median start time and median duration for each position of a pattern.

    Args:
        dataset: Sequence of groups; each group is an iterable of occurrences
            of the pattern, and ``occurrence[idx]`` is the
            ``(value, 'H:MM-H:MM')`` event matched to ``sequence[idx]``.
        sequence: The pattern whose positions are summarised.

    Returns:
        One ``(value, median_start, median_duration)`` tuple per element of
        ``sequence``. ``median_start`` is an ``'H:MM'`` string built from the
        median start (minutes since midnight, truncated to whole minutes) and
        ``median_duration`` is the median length in minutes. Both are ``None``
        when ``dataset`` holds no occurrence, instead of failing on the
        median of an empty list.
    """
    def time_to_minutes(t):
        # 'H:MM' -> minutes since midnight.
        h, m = map(int, t.split(':'))
        return h * 60 + m

    def duration_to_minutes(start, end):
        start_minutes = time_to_minutes(start)
        end_minutes = time_to_minutes(end)
        if end_minutes < start_minutes:
            end_minutes += 24 * 60  # Interval crosses midnight.
        return end_minutes - start_minutes

    result = []
    for idx, val in enumerate(sequence):
        start_times = []
        durations = []
        # (group_index, event) keys: the same event reached through several
        # occurrences of one group is only counted once.
        unique_intervals = set()
        for group_index, group in enumerate(dataset):
            for occurrence in group:
                interval = occurrence[idx]
                unique_key = (group_index, interval)
                if unique_key in unique_intervals:
                    continue
                unique_intervals.add(unique_key)
                start, end = interval[1].split('-')
                start_times.append(time_to_minutes(start))
                durations.append(duration_to_minutes(start, end))

        if not start_times:
            # np.median([]) is NaN and int(NaN) raises; report "no data".
            result.append((val, None, None))
            continue

        median_start_time = np.median(start_times)
        median_duration = np.median(durations)

        # str(timedelta) is 'H:MM:SS'; drop the trailing ':SS'.
        median_start_time_str = str(timedelta(minutes=int(median_start_time)))[:-3]

        result.append((val, median_start_time_str, median_duration))

    return result



In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from itertools import combinations

def HGSP(len_GSP, min_support, data):
    """Mine frequent sequential patterns of one length and attach timing statistics.

    Each input sequence is reduced to its item values. For ``len_GSP == 1``
    the candidates are the items themselves; otherwise they are all ordered
    ``len_GSP``-combinations of each sequence. Every candidate becomes one
    indicator column and ``apriori`` keeps the columns whose support reaches
    ``min_support``.

    Args:
        len_GSP: Length of the patterns to mine.
        min_support: Minimum support forwarded to ``apriori``.
        data: List of sequences of ``(item, 'H:MM-H:MM')`` events.

    Returns:
        The ``apriori`` result restricted to single-column itemsets, with
        ``itemsets`` converted to lists and these extra columns:
        ``GSP`` (the pattern as a list, only when ``len_GSP != 1``),
        ``Time Gaps`` (output of ``analyze_time_distances``) and
        ``Starting Time and Duration`` (output of ``duration_starting``).
        Returns ``None`` after printing ``"no output"`` when nothing is
        frequent or when mining fails; a failure is also reported through
        ``warnings.warn`` so it is not silently hidden.
    """
    zero_event = (0, '0:00-0:00')

    def remove_zero_elements(embedded_groups):
        # Drop the placeholders GSPG inserts so only matched events remain.
        return [
            [[event for event in embedding if event != zero_event] for embedding in group]
            for group in embedded_groups
        ]

    try:
        dataset = [[item[0] for item in sublist] for sublist in data]

        if len_GSP == 1:
            candidates_per_sequence = dataset
        else:
            candidates_per_sequence = [
                list(combinations(sequence, len_GSP)) for sequence in dataset
            ]

        unique_items = set(item for sublist in candidates_per_sequence for item in sublist)

        # One boolean indicator row per input sequence.
        binary_data = []
        for candidates in candidates_per_sequence:
            present = set(candidates)
            binary_data.append({item: item in present for item in unique_items})

        df = pd.DataFrame(binary_data)

        # Each column already encodes a whole pattern, so only single-column
        # itemsets are meaningful; max_len=1 avoids enumerating larger ones.
        frequent_patterns = apriori(df, min_support=min_support, use_colnames=True, max_len=1)
        if frequent_patterns.empty:
            print("no output")
            return None

        # .copy() so the column assignments below act on an independent frame
        # rather than on a slice of ``frequent_patterns``.
        FP = frequent_patterns[frequent_patterns['itemsets'].apply(lambda x: len(x) == 1)].copy()
        if FP.empty:
            print("no output")
            return None

        FP['itemsets'] = FP['itemsets'].apply(list)
        if len_GSP == 1:
            patterns = list(FP['itemsets'])
        else:
            # Each itemset holds one tuple column label; unpack it to a list.
            FP['GSP'] = FP['itemsets'].apply(lambda itemset: list(itemset[0]))
            patterns = list(FP['GSP'])

        Time_Gap = []
        StartingT_Duration = []
        for sequ in patterns:
            # Compute the embeddings once and reuse them for both statistics.
            result_ses = remove_zero_elements(GSPG(sequ, data))
            Time_Gap.append(analyze_time_distances(result_ses, sequ))
            StartingT_Duration.append(duration_starting(result_ses, sequ))
        FP['Time Gaps'] = Time_Gap
        FP['Starting Time and Duration'] = StartingT_Duration

        return FP
    except Exception as e:
        # Keep the best-effort contract (print and return None) but surface
        # the cause instead of discarding it.
        warnings.warn("HGSP failed: {!r}".format(e))
        print("no output")
        return None



In [None]:
### Example input: a list of sequences, each a list of (item, 'start-end') events
df1 = [
    [(3, '10:46-14:21'), (1, '14:22-14:54'), (7, '15:01-15:58'), (8, '18:53-20:04')],
    [(9, '10:04-16:44'), (3, '11:33-19:34')],
    [(10, '8:04-8:05'), (6, '9:01-11:33'), (3, '11:54-11:55')],
    [(3, '10:01-12:02'), (7, '14:41-15:52'), (8, '19:53-19:54')],
    [(3, '6:51-7:45'), (7, '9:44-10:01'), (1, '12:34-13:45'), (4, '14:22-17:54'), (8, '16:53-19:34')],
    [(3, '7:46-8:21'), (6, '9:01-12:02'), (7, '14:41-15:52')],
    [(3, '7:11-7:35'), (10, '7:44-8:12'), (7, '9:34-13:45'), (2, '17:22-19:54'), (8, '19:59-21:04')],
]

In [None]:
# Length-1 patterns appearing in at least half of the example sequences.
GP = HGSP(len_GSP=1, min_support=0.5, data=df1)
print(GP)

In [None]:
# Length-2 patterns appearing in at least half of the example sequences.
GP = HGSP(len_GSP=2, min_support=0.5, data=df1)
print(GP)

In [None]:
# Length-3 patterns appearing in at least half of the example sequences.
GP = HGSP(len_GSP=3, min_support=0.5, data=df1)
print(GP)