In [1]:
import pandas as pd
import json
import openpyxl
import time
import os.path
import glob
import natsort
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from scipy import spatial
from preprocess import Preprocess

# algorithm parameter setting
# MIN_SUPPORT / MAX_LEN are passed to fpgrowth() as min_support / max_len;
# MIN_LEN is the smallest itemset size kept by rule_mining().
MIN_SUPPORT = 0.2
MAX_LEN = 10
MIN_LEN = 2

# rule construction setting
# Forwarded to Preprocess(min_times=...).
MIN_TIMES = 5

# rule filter setting
# Maximum number of itemsets kept by rule_mining() and scored per file.
K = 20

# file read setting
# The first TRAIN entries of DATA_FILE_NAME are the training files and the
# following TEST entries are the testing files.
TRAIN = 75
TEST = 25
DATA_PATH = 'Datafile_0222/'
# Forwarded to Preprocess(sheetname=...).
SHEET_NAME = 'Data_1'

# default setting
# Both lists are filled by the script cell that globs DATA_PATH for *.xlsx:
# DATA_FILE_LIST holds the sorted paths, DATA_FILE_NAME their base names.
DATA_FILE_NAME = []
DATA_FILE_LIST = []


def estimate_file_fail_train():
    '''
    Estimate the fail rate of the training files.

    The training files are the first TRAIN entries of DATA_FILE_NAME.  The
    combined result is cached in Result/train_fail_rate_result_file.json;
    when that file already exists it is loaded instead of re-estimating.

    Returns:
        dict keyed by training file name, holding whatever
        Preprocess.estimate_fail() returned for that file (or the JSON
        round-trip of it when loaded from the cache).
    '''
    print("\n##############  1. Estimate Training File Fail Rate Process ##############\n")
    path = 'Result'
    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(path, exist_ok=True)

    train_fail_rate_result_file = 'Result/train_fail_rate_result_file.json'

    # the training files have been estimated before: reuse the cached result
    if os.path.isfile(train_fail_rate_result_file):
        print('--------> Training fail rate has been estimated before.')
        # context manager guarantees the handle is closed
        with open(train_fail_rate_result_file, "r") as cache_file:
            return json.load(cache_file)

    all_train_fail_rate_result_file_dic = {}

    for a in range(TRAIN):
        cur_file = DATA_FILE_NAME[a]
        cur_path = DATA_PATH + cur_file
        print(" Estimating training file " +
              cur_file + " ")
        # estimate the fail rate of each training file
        p = Preprocess(
            file_name=cur_path,
            min_times=MIN_TIMES,
            sheetname=SHEET_NAME)

        all_train_fail_rate_result_file_dic[cur_file] = p.estimate_fail()

    # write the cache file; closing the handle flushes it to disk
    with open(train_fail_rate_result_file, "w") as cache_file:
        json.dump(all_train_fail_rate_result_file_dic, cache_file)

    return all_train_fail_rate_result_file_dic


def estimate_file_fail_test():
    '''
    Estimate the fail rate of every testing file.

    The testing files are the TEST entries of DATA_FILE_NAME that follow
    the first TRAIN entries.  Nothing is cached on disk here.

    Returns:
        dict keyed by testing file name, holding whatever
        Preprocess.estimate_fail() returned for that file.
    '''
    print("\n##############  5. Estimate testing file fail rate process ##############\n")

    results = {}

    for position in range(TRAIN, TRAIN + TEST):
        cur_file = DATA_FILE_NAME[position]
        workbook_path = DATA_PATH + cur_file
        progress_message = " Estimating testing file " + cur_file + " "
        print(progress_message)

        # one Preprocess instance per workbook
        preprocessor = Preprocess(
            file_name=workbook_path,
            min_times=MIN_TIMES,
            sheetname=SHEET_NAME)
        results[cur_file] = preprocessor.estimate_fail()

    return results


def formulate_input(dic):
    '''
    Turn a fail-rate dictionary into the transaction list fed to the miner.

    Each value of ``dic`` is itself iterated (for a dict that yields its
    keys) and becomes one inner list, in the order the files appear in
    ``dic``.

    Returns:
        list of lists, one inner list per entry of ``dic``.
    '''
    print("\n##############  2. Input the dataset formula  ############################")
    transactions = [list(dic[name]) for name in dic]
    print("\n  Dataset formula process finished.  ")
    return transactions


def rule_mining(dataset):
    '''
    Run FP-growth on ``dataset`` and keep the strongest frequent itemsets.

    ``dataset`` is one-hot encoded with TransactionEncoder and mined with
    fpgrowth(min_support=MIN_SUPPORT, max_len=MAX_LEN).  The itemsets are
    then ordered by descending support and at most K of those having at
    least MIN_LEN items are kept.

    Returns:
        DataFrame with columns 'support' and 'itemsets' (each itemset
        converted to a list), indexed 0..n-1 with n <= K.
    '''
    print("\n##############  3. Implement FP-growth process  ##########################")
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    df_rules = fpgrowth(df, min_support=MIN_SUPPORT,
                        max_len=MAX_LEN, use_colnames=True)
    print(df_rules)
    # sorted by support values
    df_rules = df_rules.sort_values(by='support', ascending=False)
    # create new df
    df_filter_rules = pd.DataFrame(columns=['support', 'itemsets'])

    # filter with the minimum length
    for index, row in df_rules.iterrows():
        # K itemsets collected: nothing further can be added, so stop
        # instead of walking the remaining (possibly thousands of) rows
        if len(df_filter_rules.index) >= K:
            break
        if len(row['itemsets']) >= MIN_LEN:
            df_filter_rules.loc[len(df_filter_rules.index)] = \
                [row['support'], list(row['itemsets'])]
    print("\n  FP-growth process finished.  ")
    return df_filter_rules


def train_score_calculation(df_filter_rules, fail_result_dic):
    '''
    Estimate the score of training files.

    For each of the first TRAIN entries of DATA_FILE_NAME, one score is
    produced per rule in ``df_filter_rules['itemsets']`` (at most K rules
    are used).  A rule's score is the product of the file's values in
    ``fail_result_dic`` for every item of the rule, or 0.0 as soon as one
    item is missing for that file.

    Args:
        df_filter_rules: object whose 'itemsets' entry iterates over the
            rules, each rule being a sequence of item names.
        fail_result_dic: dict mapping file name -> dict of item -> value.

    Returns:
        dict mapping training file name -> list of scores, one per rule.
    '''
    print("\n##############  4. Estimate the score of training files  ##################")

    # Iterate the rules that actually exist: indexing with range(K) raised
    # KeyError whenever fewer than K rules had been mined.
    itemsets = list(df_filter_rules['itemsets'])[:K]

    training_file_score_dic = {}
    for r in range(TRAIN):
        cur_file = DATA_FILE_NAME[r]
        file_rates = fail_result_dic.get(cur_file)
        scoreList = []
        for itemset in itemsets:
            score = 1
            for item in itemset:
                rate = file_rates.get(item)
                if rate is None:
                    # a missing item zeroes the whole rule; later factors
                    # cannot change that, so stop early
                    score = 0.0
                    break
                score *= rate
            scoreList.append(score)
        training_file_score_dic[cur_file] = scoreList
    print("\n  Estimation training score Finished.  ")

    return training_file_score_dic


def test_score_calculation(df_filter_rules, fail_result_dic):
    '''
    Estimate the score of testing files.

    For each of the TEST entries of DATA_FILE_NAME that follow the first
    TRAIN entries, one score is produced per rule in
    ``df_filter_rules['itemsets']`` (at most K rules are used).  A rule's
    score is the product of the file's values in ``fail_result_dic`` for
    every item of the rule, or 0.0 as soon as one item is missing for
    that file.

    Args:
        df_filter_rules: object whose 'itemsets' entry iterates over the
            rules, each rule being a sequence of item names.
        fail_result_dic: dict mapping file name -> dict of item -> value.

    Returns:
        dict mapping testing file name -> list of scores, one per rule.
    '''

    print("\n##############  6. Estimate the score of testing files  ###################")

    # Iterate the rules that actually exist: indexing with range(K) raised
    # KeyError whenever fewer than K rules had been mined.
    itemsets = list(df_filter_rules['itemsets'])[:K]

    test_file_score_dic = {}
    for r in range(TEST):
        cur_file = DATA_FILE_NAME[r+TRAIN]
        file_rates = fail_result_dic.get(cur_file)
        scoreList = []
        for itemset in itemsets:
            score = 1
            for item in itemset:
                rate = file_rates.get(item)
                if rate is None:
                    # a missing item zeroes the whole rule; later factors
                    # cannot change that, so stop early
                    score = 0.0
                    break
                score *= rate
            scoreList.append(score)
        test_file_score_dic[cur_file] = scoreList

    print("\n  Estimation testing score Finished.  ")

    return test_file_score_dic


def similar(score_file_score_dic, test_file_score_dic):
    '''
    Similarity comparison and ranking.

    For every testing file, computes 1 - cosine distance between its score
    vector and the score vector of each training file, sorts the training
    files by that similarity in descending order, and writes the complete
    ranking to Result/rankingResult.json.

    Args:
        score_file_score_dic: dict mapping training file name -> scores.
        test_file_score_dic: dict mapping testing file name -> scores.
    '''

    print("\n##############  7. Estimate the similarity between the Training/Testing file #####################")
    totalRanking_dic = {}
    for w in range(TEST):
        cur_file = DATA_FILE_NAME[w+TRAIN]
        test_scores = test_file_score_dic.get(cur_file)
        rankingList = {}
        for p in range(TRAIN):
            train_file = DATA_FILE_NAME[p]
            train_scores = score_file_score_dic.get(train_file)
            # NOTE(review): an all-zero score vector presumably makes the
            # cosine distance undefined (NaN) - confirm how such files
            # should be ranked.
            rankingList[train_file] = \
                1 - spatial.distance.cosine(test_scores, train_scores)
        totalRanking_dic[cur_file] = dict(sorted(rankingList.items(),
                                                 key=lambda item: item[1],
                                                 reverse=True))

    # Do not rely on another step having created the output directory
    os.makedirs("Result", exist_ok=True)
    # context manager closes the file even if serialisation fails
    with open("Result/rankingResult.json", "w") as jsonFile:
        jsonFile.write(json.dumps(totalRanking_dic, indent=2))
    print("\n  Estimation of similarity finished. \n ")


In [2]:
# Collect the workbooks in natural (human) sort order
DATA_FILE_LIST = glob.glob(os.path.join(DATA_PATH, "*.xlsx"))
DATA_FILE_LIST = natsort.natsorted(DATA_FILE_LIST)
# Rebuild the name list in place: appending would duplicate every name each
# time this cell is re-run and silently shift the train/test split.
DATA_FILE_NAME[:] = [os.path.basename(file_path)
                     for file_path in DATA_FILE_LIST]

#========================= TRAIN ===============================#
# Estimate fail rates of training files
all_train_fail_rate_result_file_dic = estimate_file_fail_train()
# Transfer the dictionary to algorithm input
input_dataset_train = formulate_input(all_train_fail_rate_result_file_dic)
    


##############  1. Estimate Training File Fail Rate Process ##############

--------> Training fail rate has been estimated before.

##############  2. Input the dataset formula  ############################

  Dataset formula process finished.  


In [3]:
# Mine the training transactions; keeps at most K itemsets of MIN_LEN+ items
df_rules = rule_mining(input_dataset_train)


##############  3. Implement FP-growth process  ##########################
       support                                           itemsets
0     0.400000                        (Init_Test_PvluLrGxFSeQXxs)
1     0.386667                        (Init_Test_nKhtqDhAqhpXRoB)
2     0.386667                        (Init_Test_PeYvNviBcAOXcoQ)
3     0.386667                        (Init_Test_GisqJogMpeIqvRH)
4     0.386667                        (Init_Test_vOPjhtZSHmDWEtj)
...        ...                                                ...
4527  0.200000  (Init_Test_JFrdeOvNotvnOWX, Init_Test_YBEdStIC...
4528  0.200000  (Init_Test_PvluLrGxFSeQXxs, Init_Test_JFrdeOvN...
4529  0.200000  (Init_Test_VWiekrNVjEqDjhe, Init_Test_bqMaiLbA...
4530  0.200000  (Init_Test_PeYvNviBcAOXcoQ, Init_Test_JAHLIeaJ...
4531  0.200000  (Init_Test_wgBjpWsLnFBWkRf, Init_Test_wepnFjTZ...

[4532 rows x 2 columns]

  FP-growth process finished.  


In [4]:
def countValue(array,target,dict_all):
    '''
    Count the transactions of ``array`` that contain every item of ``target``.

    Results are memoised in ``dict_all`` under ``frozenset(target)``; a
    cached value is returned without rescanning ``array``.

    Args:
        array: iterable of transactions, each supporting the ``in`` test.
        target: a list of items, or a single non-list item (wrapped into a
            one-element list).
        dict_all: dict used as the cache; it is updated in place.

    Returns:
        Number of transactions containing all items of ``target``.  An
        empty ``target`` is contained in every transaction.
    '''
    if not isinstance(target, list):
        target = [target]

    key = frozenset(target)
    if key in dict_all:
        return dict_all[key]

    # all() tests each transaction exactly once, so a target holding the
    # same item twice is no longer counted more than once per transaction
    count = sum(
        1 for transaction in array
        if all(item in transaction for item in target)
    )
    dict_all[key] = count
    return count


In [5]:
def createTargetSource(subList,allList):
    '''
    Split ``allList`` into the items of ``subList`` and the remaining items.

    Works on a copy, so the caller's ``allList`` is left untouched.

    Args:
        subList: items to take out; each must occur in ``allList``.
        allList: the full item list.

    Returns:
        Tuple ``(frozenset(subList), frozenset(remaining items))``.

    Raises:
        ValueError: if an item of ``subList`` is not present in ``allList``.
    '''
    remaining = list(allList)
    for item in subList:
        remaining.remove(item)
    return (frozenset(subList), frozenset(remaining))

In [6]:
import itertools
def genAllRule(container,frequent_set, count_dict = None):
    '''
    Generate every antecedent -> consequent rule from the frequent itemsets.

    Each itemset of ``frequent_set`` that is a list with at least two items
    is split in every possible way into a non-empty antecedent and the
    remaining consequent.  The confidence of a rule is
    count(itemset) / count(antecedent), both counted over ``container``.

    Args:
        container: the transactions the counts are taken from.
        frequent_set: iterable of itemsets; entries that are not lists or
            have fewer than two items are skipped.
        count_dict: optional cache of itemset counts shared with
            countValue().  A fresh cache is created per call when omitted,
            so counts from one ``container`` never leak into a call made
            with another one.

    Returns:
        Tuple ``(rule, answer)``: ``rule`` maps
        ``(frozenset(antecedent), frozenset(consequent))`` to the
        confidence; ``answer`` is a list of
        ``[antecedent list, consequent list, label]`` where ``label`` is
        the confidence rounded to one decimal and expressed in tenths as a
        string ('0' .. '10').
    '''
    if count_dict is None:
        count_dict = {}
    rule = {}
    answer = []
    for choose_list in frequent_set:
        if not isinstance(choose_list, list) or len(choose_list) <= 1:
            continue

        children = countValue(container, choose_list, count_dict)
        for size in range(1, len(choose_list)):
            for antecedent in itertools.combinations(choose_list, size):
                # pass a copy: the helper may consume the list it is given
                temp = createTargetSource(list(antecedent), choose_list[:])
                if temp in rule:
                    # already produced by an earlier occurrence of this
                    # itemset; the rest of this size is skipped as before
                    break

                parent = countValue(container, list(antecedent), count_dict)
                print(children,parent)
                if parent == 0:
                    # antecedent never occurs in container: confidence is
                    # undefined, so no rule is emitted (was a
                    # ZeroDivisionError)
                    continue
                confidence = children / parent
                print(f"{choose_list} / {antecedent} = {children} / {parent} = {confidence}")
                rule[temp] = confidence
                # Tenths of the rounded confidence.  Taking the digit after
                # the decimal point of str(round(confidence, 1)) labelled a
                # confidence of 1.0 as '0', same as the lowest bucket.
                label = str(int(round(round(confidence, 1) * 10)))
                answer.append([list(antecedent), list(temp[1]), label])
    return rule, answer

In [7]:
# Materialise the kept itemsets as a plain list and derive the rules
setList = list(df_rules['itemsets'])
rule, answer = genAllRule(input_dataset_train,setList)


17 26
['Init_Test_isEmPDzHZNkqweG', 'Init_Test_iTnkyIDAvUXgZBD'] / ('Init_Test_isEmPDzHZNkqweG',) = 17 / 26 = 0.6538461538461539
17 27
['Init_Test_isEmPDzHZNkqweG', 'Init_Test_iTnkyIDAvUXgZBD'] / ('Init_Test_iTnkyIDAvUXgZBD',) = 17 / 27 = 0.6296296296296297
17 28
['Init_Test_DIYXVNmJHMsGaCH', 'Init_Test_bWTcEiwjInFaGFt'] / ('Init_Test_DIYXVNmJHMsGaCH',) = 17 / 28 = 0.6071428571428571
17 25
['Init_Test_DIYXVNmJHMsGaCH', 'Init_Test_bWTcEiwjInFaGFt'] / ('Init_Test_bWTcEiwjInFaGFt',) = 17 / 25 = 0.68
17 29
['Init_Test_PeYvNviBcAOXcoQ', 'Init_Test_SEChSSazcWoSnbg'] / ('Init_Test_PeYvNviBcAOXcoQ',) = 17 / 29 = 0.5862068965517241
17 24
['Init_Test_PeYvNviBcAOXcoQ', 'Init_Test_SEChSSazcWoSnbg'] / ('Init_Test_SEChSSazcWoSnbg',) = 17 / 24 = 0.7083333333333334
17 30
['Init_Test_JnQThYkFhklnBkE', 'Init_Test_DrqobvatwhashXq'] / ('Init_Test_JnQThYkFhklnBkE',) = 17 / 30 = 0.5666666666666667
17 30
['Init_Test_JnQThYkFhklnBkE', 'Init_Test_DrqobvatwhashXq'] / ('Init_Test_DrqobvatwhashXq',) = 17 / 30 = 0

In [8]:
import csv

# One CSV row per generated rule: [antecedent, consequent, confidence label].
# The loop variable is not named `rule`, so the rule dictionary returned by
# genAllRule() is no longer overwritten.
proc_data = [[a, c, conf] for a, c, conf in answer]

filename = "./output.csv"
# newline='' lets the csv module control line endings; without it every
# row is followed by a blank line on Windows.
with open(filename, 'w', newline='') as f:
    writer = csv.writer(f)
    # rule format: antecedent --> consequent
    writer.writerow(["antecedent", "consequent",
                    "confidence"])
    writer.writerows(proc_data)

# Generate rules and their confidences directly from the full fpgrowth output

In [9]:
# Re-run FP-growth on the training transactions, this time keeping every
# frequent itemset (no top-K / minimum-length filtering as in rule_mining()).
te = TransactionEncoder()
# one-hot encode the transactions: one boolean column per distinct item
te_ary = te.fit(input_dataset_train).transform(input_dataset_train)
df = pd.DataFrame(te_ary, columns=te.columns_)
df_rules_all = fpgrowth(df, min_support=MIN_SUPPORT,
                        max_len=MAX_LEN, use_colnames=True)
    

In [10]:
# Number of training transactions (formulate_input() emits one per file)
len(input_dataset_train)

75

In [11]:
# Keep only the multi-item itemsets, converted to lists: genAllRule()
# skips anything that is not a list of at least two items.
setList_all = [
    list(itemset)
    for itemset in df_rules_all['itemsets']
    if len(itemset) > 1
]
rule_all, answer_all = genAllRule(input_dataset_train,setList_all)


15 30
['Init_Test_DrqobvatwhashXq', 'Init_Test_nKhtqDhAqhpXRoB'] / ('Init_Test_DrqobvatwhashXq',) = 15 / 30 = 0.5
15 29
['Init_Test_DrqobvatwhashXq', 'Init_Test_nKhtqDhAqhpXRoB'] / ('Init_Test_nKhtqDhAqhpXRoB',) = 15 / 29 = 0.5172413793103449
15 29
['Init_Test_vOPjhtZSHmDWEtj', 'Init_Test_GisqJogMpeIqvRH'] / ('Init_Test_vOPjhtZSHmDWEtj',) = 15 / 29 = 0.5172413793103449
15 29
['Init_Test_vOPjhtZSHmDWEtj', 'Init_Test_GisqJogMpeIqvRH'] / ('Init_Test_GisqJogMpeIqvRH',) = 15 / 29 = 0.5172413793103449
16 30
['Init_Test_PvluLrGxFSeQXxs', 'Init_Test_OHcXnbpHqAOjgYK'] / ('Init_Test_PvluLrGxFSeQXxs',) = 16 / 30 = 0.5333333333333333
16 28
['Init_Test_PvluLrGxFSeQXxs', 'Init_Test_OHcXnbpHqAOjgYK'] / ('Init_Test_OHcXnbpHqAOjgYK',) = 16 / 28 = 0.5714285714285714
15 29
['Init_Test_GisqJogMpeIqvRH', 'Init_Test_zeZZnfovuhbQPES'] / ('Init_Test_GisqJogMpeIqvRH',) = 15 / 29 = 0.5172413793103449
15 28
['Init_Test_GisqJogMpeIqvRH', 'Init_Test_zeZZnfovuhbQPES'] / ('Init_Test_zeZZnfovuhbQPES',) = 15 / 28 = 0.

In [12]:
import csv

# One CSV row per generated rule: [antecedent, consequent, confidence label].
# The loop variable is not named `rule`, so the rule dictionary returned by
# genAllRule() is no longer overwritten.
proc_data_all = [[a, c, conf] for a, c, conf in answer_all]

filename = "./output_all.csv"
# newline='' lets the csv module control line endings; without it every
# row is followed by a blank line on Windows.
with open(filename, 'w', newline='') as f:
    writer = csv.writer(f)
    # rule format: antecedent --> consequent
    writer.writerow(["antecedent", "consequent",
                    "confidence"])
    writer.writerows(proc_data_all)

# Build KG

In [13]:
from torchkge.data_structures import KnowledgeGraph as KG
def loadCsvandBuildKG(pathName):
    '''
    Load a rule CSV and build a torchkge KnowledgeGraph from it.

    The CSV is expected to start with a header row followed by three
    columns per line, which are mapped to 'from' (head), 'to' (tail) and
    'rel' (relation).

    Args:
        pathName: path of the CSV file.

    Returns:
        The KnowledgeGraph built from the loaded triples.
    '''
    # header=0 consumes the header row and replaces it with `names`; with
    # header=None that row was loaded as a bogus triple of the graph.
    # dtype=str keeps every label textual.
    df = pd.read_csv(pathName, header=0, names=['from','to','rel'],
                     dtype=str)
    kg = KG(df)
    return kg

  from tqdm.autonotebook import tqdm


In [14]:
# kg_all: graph of the rules from every frequent itemset (output_all.csv)
kg_all = loadCsvandBuildKG('./output_all.csv')
# kg_finish: graph of the rules from the filtered itemsets (output.csv)
kg_finish = loadCsvandBuildKG("./output.csv")

# Build KGE

In [15]:
from torch import cuda
from torch.optim import Adam
from torchkge.models import TransEModel
from torchkge.sampling import BernoulliNegativeSampler
from torchkge.utils import MarginLoss, DataLoader
from tqdm.autonotebook import tqdm

In [16]:
def train(hypermaterDict,kg_train,setCuda = False):
    '''
    Train a TransE embedding model on a knowledge graph.

    Args:
        hypermaterDict: dict with the keys "emb_dim", "lr", "n_epochs",
            "b_size" and "margin".
        kg_train: knowledge graph to train on; its n_ent / n_rel size the
            model and it feeds the negative sampler and the data loader.
        setCuda: request GPU training.  The GPU is used only when it is
            also available; otherwise training runs on the CPU.

    Returns:
        The trained model, after normalize_parameters() has been applied.
    '''
    # Define some hyper-parameters for training
    emb_dim = hypermaterDict["emb_dim"]
    lr = hypermaterDict["lr"]
    n_epochs = hypermaterDict["n_epochs"]
    b_size = hypermaterDict["b_size"]
    margin = hypermaterDict["margin"]

    # Define the model and criterion
    model = TransEModel(emb_dim, kg_train.n_ent, kg_train.n_rel, dissimilarity_type='L2')
    criterion = MarginLoss(margin)

    # One flag decides the device for the model AND the batches.  Before,
    # setCuda=True without a GPU left the model on the CPU while the
    # loader was still asked to put the data on CUDA.
    use_gpu = setCuda and cuda.is_available()
    if use_gpu:
        cuda.empty_cache()
        model.cuda()
        criterion.cuda()

    # Define the torch optimizer to be used
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    sampler = BernoulliNegativeSampler(kg_train)
    if use_gpu:
        dataloader = DataLoader(kg_train, batch_size=b_size, use_cuda='all')
    else:
        dataloader = DataLoader(kg_train, batch_size=b_size)
    n_batches = len(dataloader)

    iterator = tqdm(range(n_epochs), unit='epoch')
    for epoch in iterator:
        running_loss = 0.0
        for batch in dataloader:
            h, t, r = batch[0], batch[1], batch[2]
            # corrupted heads/tails serve as the negative examples
            n_h, n_t = sampler.corrupt_batch(h, t, r)

            optimizer.zero_grad()

            # forward + backward + optimize
            pos, neg = model(h, t, r, n_h, n_t)
            loss = criterion(pos, neg)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        iterator.set_description(
            'Epoch {} | mean loss: {:.5f}'.format(epoch + 1,
                                                running_loss / n_batches))

    model.normalize_parameters()
    return model

In [17]:
# Hyper-parameters read by train()
hypermaterDict = {
    "emb_dim" : 100,     # embedding dimension given to TransEModel
    "lr" : 0.0004,       # learning rate of the Adam optimizer
    "n_epochs" : 1000,   # number of passes over the data loader
    "b_size" : 8,        # batch_size of the DataLoader
    "margin" : 0.5       # margin given to MarginLoss
}

In [18]:
# Train one TransE model per knowledge graph (CPU: setCuda defaults to False)
model_all = train(hypermaterDict,kg_all)
model_finish = train(hypermaterDict,kg_finish)

  0%|          | 0/1000 [00:00<?, ?epoch/s]

Epoch 1000 | mean loss: 0.07331: 100%|██████████| 1000/1000 [00:41<00:00, 23.86epoch/s]
Epoch 1000 | mean loss: 0.20441: 100%|██████████| 1000/1000 [00:08<00:00, 119.15epoch/s]
