In [192]:
import time
import scipy.io
import pandas as pd
from prefixspan import PrefixSpan

In [193]:
# Bize .mat dosyası verilmiş.
mat_file_path = 'data/faults.mat' 
mat_data = scipy.io.loadmat(mat_file_path)

# .mat dosyasını dönüştürüyoruz.
transaction_data = mat_data['TransactionDatabase']
char_matrix = []
for row in transaction_data:
    row_str = ''.join(chr(c) if isinstance(c, (int, float)) else c for c in row).strip()
    char_matrix.append(row_str)

split_transactions = [list(seq) for seq in char_matrix]

# .mat dosyasını DataFrame formatına sonunda dönüştürdük.
df = pd.DataFrame(split_transactions)

print(df.shape)
df.head(10)

(10000, 34)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
1,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
2,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
3,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
4,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
5,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
6,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
7,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
8,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,
9,#,$,%,&,(,),*,+,/,0,...,,,,,,,,,,


In [194]:
# Kaç farklı dizi var bakalım
df_uniques = df.drop_duplicates(keep=False, inplace=False)
print(df_uniques.shape)
df_uniques.head(10)
# 44 farklı dizi var.

(44, 34)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
175,#,$,%,&,(,),+,/,0,1,...,,,,,,,,,,
511,#,$,%,&,(,),/,0,1,2,...,,,,,,,,,,
599,#,$,%,&,(,),*,/,0,1,...,,,,,,,,,,
831,#,%,&,(,),1,2,3,5,<,...,,,,,,,,,,
1181,#,%,&,(,),/,1,2,8,9,...,,,,,,,,,,
1760,#,$,%,&,(,),2,5,8,:,...,,,,,,,,,,
2042,#,%,&,(,),/,0,1,2,3,...,,,,,,,,,,
2076,#,%,&,(,),/,0,1,2,3,...,,,,,,,,,,
2203,#,%,&,(,),1,2,9,:,<,...,,,,,,,,,,
2712,#,%,&,(,),2,9,:,<,@,...,,,,,,,,,,


In [195]:
# Data'yı PrefixSpan için hazırlama
prefixspan_ready_sequences = []
for row in df.itertuples(index=False):
    sequence = [str(item) for item in row if pd.notna(item) and item != '']
    prefixspan_ready_sequences.append(sequence)

In [196]:
# Örnek bir dizi
print("Örnek dizi:", prefixspan_ready_sequences[0])

Örnek dizi: ['#', '$', '%', '&', '(', ')', '*', '+', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':']


In [197]:
# Formatlamamız doğru çalışıyor.

In [198]:
# Süreyi başlat
ps_start_time = time.time()

# PrefixSpan çalıştır
ps = PrefixSpan(prefixspan_ready_sequences)

# Opsiyonel: Minimum ve maksimum dizi uzunluğu
ps.minlen = 2  
ps.maxlen = 10

# En çok beliren (support'ı en yüksek) 20 diziyi al
top_patterns = ps.topk(20) 

# Süreyi bitir
ps_end_time = time.time()

ps_time = ps_end_time - ps_start_time
print(f"PrefixSpan runtime: {ps_time:.5f} seconds")

# Buradaki 'support' bize gerçek support'ı değil, frequency'i veriyor 
print("\nTop 20 Frequent Sequential Patterns:\n")
for support, pattern in top_patterns:
    readable = ' '.join(pattern)
    print(f"Pattern: {readable}\nSupport: {support}\n")

PrefixSpan runtime: 2.69944 seconds

Top 20 Frequent Sequential Patterns:

Pattern: # %
Support: 10000

Pattern: # % &
Support: 10000

Pattern: # % & )
Support: 10000

Pattern: # % )
Support: 10000

Pattern: # &
Support: 10000

Pattern: # & )
Support: 10000

Pattern: # )
Support: 10000

Pattern: % &
Support: 10000

Pattern: % & )
Support: 10000

Pattern: % )
Support: 10000

Pattern: & )
Support: 10000

Pattern: # % & ( )
Support: 9949

Pattern: # % (
Support: 9949

Pattern: # % ( )
Support: 9949

Pattern: # & (
Support: 9949

Pattern: # & ( )
Support: 9949

Pattern: # (
Support: 9949

Pattern: # ( )
Support: 9949

Pattern: % & (
Support: 9949

Pattern: % & ( )
Support: 9949



In [199]:
# Ancak PrefixSpan ile rule (kurallar) bulamıyoruz. Confidence değeri olan kurallar bulmak için RuleGrowth kullanacağız.

In [200]:
# SPMF formatına dönüştür ve kaydet (sembolleri ASCII'ye çevir (çünkü SPMF formatı itemset olarak integer kabul ediyor), itemsetler arasına -1 , dizi sonuna -2 koy)
with open("rulegrowth_input.txt", "w") as f:
    for _, row in df.iterrows():
        sequence = [str(ord(item)) for item in row if pd.notna(item) and item != '']
        spmf_sequence = ' -1 '.join(sequence) + ' -1 -2\n'
        f.write(spmf_sequence)

print("SPMF input file saved as rulegrowth_input.txt")

SPMF input file saved as rulegrowth_input.txt


In [201]:
# SPMF programında RuleGrowth'u çalıştırıyoruz ve output dosyamızı elde ediyoruz. Parametrelerimiz:

    # Min support (%) = 0.6
    # Min confidence (%)  = 0.9
    # Max antecedent size: 10 
    # Max consequent size: 10

# Sonuç
    # Sequential rules count: 49
    # Total time: 1703 ms
    # Max memory: 172.16748046875

# Şimdi bu dosyayı okuyup okunabilir hale getireceğiz.
# (ASCII'leri karakterlere çevireceğiz)

In [202]:
import pandas as pd

def decode_rulegrowth_output(input_file='C:/Users/hzxki/Desktop/Ders/Veri Bilimi/SecondMidterm2/odev/rulegrowth_output.txt'):
    decoded_rules = []

    with open(input_file, 'r') as f:
        for line in f:
            if "==>" not in line:
                continue

            parts = line.strip().split("==>")

            # Handle LHS (antecedent), may contain comma-separated itemsets
            lhs_items = parts[0].strip().split()
            lhs = []
            for item in lhs_items:
                for subitem in item.split(','):
                    lhs.append(chr(int(subitem)))

            # Handle RHS (consequent)
            rhs_and_metrics = parts[1].strip().split("#")
            rhs_items = rhs_and_metrics[0].strip().split()
            rhs = []
            for item in rhs_items:
                for subitem in item.split(','):
                    rhs.append(chr(int(subitem)))

            # Extract support and confidence
            support = None
            confidence = None
            for metric in rhs_and_metrics[1:]:
                if "SUP" in metric:
                    support = metric.split(":")[1].strip()
                if "CONF" in metric:
                    confidence = metric.split(":")[1].strip()

            rule = {
                "Rule": f"{' '.join(lhs)} → {' '.join(rhs)}",
                "Support": support,
                "Confidence": confidence
            }
            decoded_rules.append(rule)

    return decoded_rules

decoded = decode_rulegrowth_output()

# Confidence'a göre sırala
decoded_sorted = sorted(decoded, key=lambda r: float(r['Confidence']), reverse=True)

# Confidence'ı en yüksek 20 kuralı göster
print("Top 20 Rules by Confidence:\n")
for rule in decoded_sorted[:20]:
    print(f"Rule: {rule['Rule']}")
    print(f"Support: {rule['Support']}\nConfidence: {rule['Confidence']}\n")

pd.DataFrame(decoded_sorted).to_csv("decoded_rulegrowth_rules.csv", index=False)
pd.DataFrame(decoded_sorted).to_excel("decoded_rulegrowth_rules.xlsx", index=False)

Top 20 Rules by Confidence:

Rule: # → %
Support: 10000
Confidence: 1.0

Rule: # $ → %
Support: 2809
Confidence: 1.0

Rule: # → % &
Support: 10000
Confidence: 1.0

Rule: # → % & )
Support: 10000
Confidence: 1.0

Rule: # $ → % & )
Support: 2809
Confidence: 1.0

Rule: # $ → % &
Support: 2809
Confidence: 1.0

Rule: # → % )
Support: 10000
Confidence: 1.0

Rule: # $ → % )
Support: 2809
Confidence: 1.0

Rule: # → &
Support: 10000
Confidence: 1.0

Rule: # $ → &
Support: 2809
Confidence: 1.0

Rule: # $ % → &
Support: 2809
Confidence: 1.0

Rule: # % → &
Support: 10000
Confidence: 1.0

Rule: # → & )
Support: 10000
Confidence: 1.0

Rule: # $ → & )
Support: 2809
Confidence: 1.0

Rule: # $ % → & )
Support: 2809
Confidence: 1.0

Rule: # % → & )
Support: 10000
Confidence: 1.0

Rule: # → )
Support: 10000
Confidence: 1.0

Rule: # $ → )
Support: 2809
Confidence: 1.0

Rule: # $ % → )
Support: 2809
Confidence: 1.0

Rule: # $ % & → )
Support: 2809
Confidence: 1.0



In [203]:
# CMRules
# Save in SPMF format for CMRules
with open("cmrules_input.txt", "w") as f:
    for _, row in df.iterrows():
        sequence = [ord(str(item)) for item in row if pd.notna(item) and item != '']
        if len(sequence) < 2:
            continue  # skip short sequences
        class_label = sequence[-1]  # last event is the class
        features = sequence[:-1]
        line = ' -1 '.join(map(str, features + [class_label])) + ' -1 -2\n'
        f.write(line)

print("SPMF input file for CMRules saved as cmrules_input.txt")

SPMF input file for CMRules saved as cmrules_input.txt


In [204]:
# SPMF programında CMRules'ı çalıştırıyoruz ve output dosyamızı elde ediyoruz. Parametrelerimiz:

    # Min support (%) = 0.6
    # Min confidence (%)  = 0.9
    # Max antecedent size: 10 
    # Max consequent size: 10

# Sonuç

    # Association rules count: 1446
    # Sequential rules count: 49
    # Total time : 2226 ms

# Bu dosyayı da dekode etmemiz lazım.

In [205]:
def decode_cmrules_output(input_file='C:/Users/hzxki/Desktop/Ders/Veri Bilimi/SecondMidterm2/odev/cmrules_output.txt'):
    decoded_rules = []

    with open(input_file, 'r') as f:
        for line in f:
            if "==>" not in line:
                continue

            parts = line.strip().split("==>")

            
            lhs_items = parts[0].strip().split()
            lhs = []
            for item in lhs_items:
                for subitem in item.split(','):
                    lhs.append(chr(int(subitem)))

            
            rhs_and_metrics = parts[1].strip().split("#")
            rhs_items = rhs_and_metrics[0].strip().split()
            rhs = []
            for item in rhs_items:
                for subitem in item.split(','):
                    rhs.append(chr(int(subitem)))

            
            support = None
            confidence = None
            for metric in rhs_and_metrics[1:]:
                if "SUP" in metric:
                    support = metric.split(":")[1].strip()
                if "CONF" in metric:
                    confidence = metric.split(":")[1].strip()

            rule = {
                "Rule": f"{' '.join(lhs)} → {' '.join(rhs)}",
                "Support": support,
                "Confidence": confidence
            }
            decoded_rules.append(rule)

    return decoded_rules

decoded = decode_cmrules_output()
decoded_sorted = sorted(decoded, key=lambda r: float(r['Confidence']), reverse=True)

print("20 CMRules with highest confidence:")
for rule in decoded_sorted[:20]:
    print(f"Rule: {rule['Rule']}")
    print(f"Support: {rule['Support']}\nConfidence: {rule['Confidence']}\n")

pd.DataFrame(decoded_sorted).to_csv("decoded_cmrules_rules.csv", index=False)
pd.DataFrame(decoded_sorted).to_excel("decoded_cmrules_rules.xlsx", index=False)

20 CMRules with highest confidence:
Rule: # → %
Support: 10000
Confidence: 1.0

Rule: # → &
Support: 10000
Confidence: 1.0

Rule: # → )
Support: 10000
Confidence: 1.0

Rule: % → &
Support: 10000
Confidence: 1.0

Rule: % → )
Support: 10000
Confidence: 1.0

Rule: & → )
Support: 10000
Confidence: 1.0

Rule: ( → )
Support: 9949
Confidence: 1.0

Rule: # % → &
Support: 10000
Confidence: 1.0

Rule: # → % &
Support: 10000
Confidence: 1.0

Rule: # % → )
Support: 10000
Confidence: 1.0

Rule: # → % )
Support: 10000
Confidence: 1.0

Rule: # & → )
Support: 10000
Confidence: 1.0

Rule: # → & )
Support: 10000
Confidence: 1.0

Rule: # ( → )
Support: 9949
Confidence: 1.0

Rule: % & → )
Support: 10000
Confidence: 1.0

Rule: % → & )
Support: 10000
Confidence: 1.0

Rule: % ( → )
Support: 9949
Confidence: 1.0

Rule: & ( → )
Support: 9949
Confidence: 1.0

Rule: # % & → )
Support: 10000
Confidence: 1.0

Rule: # % → & )
Support: 10000
Confidence: 1.0



In [206]:
# GSP

with open("gsp_input.txt", "w") as f:
    for _, row in df.iterrows():
        sequence = [str(ord(str(item))) for item in row if pd.notna(item) and item != '']
        line = ' -1 '.join(sequence) + ' -1 -2\n'
        f.write(line)

print("GSP input file saved as gsp_input.txt")

GSP input file saved as gsp_input.txt


In [207]:
# SPMF programında GSP'i çalıştırıyoruz ve output dosyamızı elde ediyoruz. Parametremiz:

    # Min support (%) = 0.6

# Sonuç
    # Total time ~ 697 ms
    #  Frequent sequences count : 223
    #  Max memory (mb):156.79229736328125

# Bu dosyayı da dekode etmemiz lazım.

In [208]:
def decode_gsp_output(input_file='gsp_output.txt'):
    decoded_patterns = []

    with open(input_file, 'r') as f:
        for line in f:
            if "#SUP:" not in line:
                continue

            parts = line.strip().split("#SUP:")
            sequence_part = parts[0].strip()
            support = parts[1].strip()

            ascii_items = [int(x) for x in sequence_part.split() if x not in ['-1', '-2']]
            sequence = [chr(code) for code in ascii_items]

            decoded_patterns.append({
                "Pattern": ' → '.join(sequence),
                "Support": support
            })

    return decoded_patterns

# Burada confidence seçeneğimiz yok (pattern mining yapıyoruz)
decoded_gsp = decode_gsp_output()
decoded_gsp_sorted = sorted(decoded_gsp, key=lambda x: x['Support'], reverse=True)

for pattern in decoded_gsp_sorted[:20]:
    print(f"Pattern: {pattern['Pattern']}\nSupport: {pattern['Support']}\n")

pd.DataFrame(decoded_gsp_sorted).to_csv("decoded_gsp.csv", index=False)
pd.DataFrame(decoded_gsp_sorted).to_excel("decoded_gsp.xlsx", index=False)

Pattern: (
Support: 9949

Pattern: # → (
Support: 9949

Pattern: % → (
Support: 9949

Pattern: & → (
Support: 9949

Pattern: ( → )
Support: 9949

Pattern: # → % → (
Support: 9949

Pattern: # → & → (
Support: 9949

Pattern: # → ( → )
Support: 9949

Pattern: % → & → (
Support: 9949

Pattern: % → ( → )
Support: 9949

Pattern: & → ( → )
Support: 9949

Pattern: # → % → & → (
Support: 9949

Pattern: # → % → ( → )
Support: 9949

Pattern: # → & → ( → )
Support: 9949

Pattern: % → & → ( → )
Support: 9949

Pattern: # → % → & → ( → )
Support: 9949

Pattern: <
Support: 8961

Pattern: # → <
Support: 8961

Pattern: % → <
Support: 8961

Pattern: & → <
Support: 8961



In [209]:
# SPADE
# SPADE inputu GSP ile aynı
with open("spade_input.txt", "w") as f:
    for _, row in df.iterrows():
        sequence = [str(ord(str(item))) for item in row if pd.notna(item) and item != '']
        line = ' -1 '.join(sequence) + ' -1 -2\n'
        f.write(line)

print("SPADE input file saved as spade_input.txt")

SPADE input file saved as spade_input.txt


In [210]:
# SPMF programında SPADE'i çalıştırıyoruz ve output dosyamızı elde ediyoruz. Parametremiz:

    # Min support (%) = 0.6

# Sonuç
     # Total time ~ 385 ms
     # Frequent sequences count : 223
     # Join count : 862
     # Max memory (mb):59.689247131347656

# Bu dosyayı da dekode etmemiz lazım.

In [211]:
def decode_spade_output(input_file='spade_output.txt'):
    decoded_patterns = []

    with open(input_file, 'r') as f:
        for line in f:
            if "#SUP:" not in line:
                continue

            parts = line.strip().split("#SUP:")
            sequence_part = parts[0].strip()
            support = parts[1].strip()

            ascii_items = [int(x) for x in sequence_part.split() if x not in ['-1', '-2']]
            sequence = [chr(code) for code in ascii_items]

            decoded_patterns.append({
                "Pattern": ' → '.join(sequence),
                "Support": int(support)
            })

    return decoded_patterns

decoded_spade = decode_spade_output()
decoded_spade_sorted = sorted(decoded_spade, key=lambda x: x['Support'], reverse=True)

print("20 SPADE patterns with highest support:\n")
for pattern in decoded_spade_sorted[:20]:
    print(f"Pattern: {pattern['Pattern']}\nSupport: {pattern['Support']}\n")

pd.DataFrame(decoded_spade_sorted).to_csv("decoded_spade_patterns.csv", index=False)
pd.DataFrame(decoded_spade_sorted).to_excel("decoded_spade_patterns.xlsx", index=False)

20 SPADE patterns with highest support:

Pattern: #
Support: 10000

Pattern: %
Support: 10000

Pattern: &
Support: 10000

Pattern: )
Support: 10000

Pattern: & → )
Support: 10000

Pattern: % → )
Support: 10000

Pattern: # → )
Support: 10000

Pattern: % → &
Support: 10000

Pattern: # → &
Support: 10000

Pattern: # → %
Support: 10000

Pattern: % → & → )
Support: 10000

Pattern: # → % → &
Support: 10000

Pattern: # → % → )
Support: 10000

Pattern: # → % → & → )
Support: 10000

Pattern: # → & → )
Support: 10000

Pattern: (
Support: 9949

Pattern: ( → )
Support: 9949

Pattern: & → (
Support: 9949

Pattern: % → (
Support: 9949

Pattern: # → (
Support: 9949



In [212]:
# Süre karşılaştırması

# Rule mining: (Minsup: 0.6, Minconf: 0.9, Max antecedent size: 10, Max consequent size: 10)
    # RuleGrowth: 1,703 seconds
    # CMRules: 2,226 seconds

# Pattern mining:
    # SPADE: 0,385 seconds
    # GSP: 0,697 seconds
    # PrefixSpan: 2.739 seconds