In [1]:
import pandas as pd
import json
import openpyxl
import time
import os.path
import glob
import natsort
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from scipy import spatial
from preprocess import Preprocess

# algorithm parameter setting
MIN_SUPPORT = 0.2
MAX_LEN = 10
MIN_LEN = 2

# rule construction setting
MIN_TIMES = 5

# rule filter setting
K = 20

# file read setting
TRAIN = 75
TEST = 25
DATA_PATH = 'Datafile_0222/'
SHEET_NAME = 'Data_1'

# default setting
DATA_FILE_NAME = []
DATA_FILE_LIST = []


def estimate_file_fail_train():
    '''
    Estimate the file rate of training files.
    '''
    print("\n##############  1. Estimate Training File Fail Rate Process ##############\n")
    path = 'Result'
    if not os.path.isdir(path):
        os.makedirs(path)

    train_fail_rate_result_file = 'Result/train_fail_rate_result_file.json'

    # if the training files have not been estimated before
    if not os.path.isfile(train_fail_rate_result_file):
        all_train_fail_rate_result_file_dic = {}

        for a in range(TRAIN):
            cur_file = DATA_FILE_NAME[a]
            cur_path = DATA_PATH + cur_file
            print(" Estimating training file " +
                  cur_file + " ")
            # estimate the fail rate of each training file
            p = Preprocess(
                file_name=cur_path,
                min_times=MIN_TIMES,
                sheetname=SHEET_NAME)

            train_fail_rate_dic = p.estimate_fail()
            all_train_fail_rate_result_file_dic[cur_file] = train_fail_rate_dic

        # write file
        json.dump(all_train_fail_rate_result_file_dic, open(
            train_fail_rate_result_file, "w"))

    else:
        print('--------> Training fail rate has been estimated before.')
        all_train_fail_rate_result_file_dic = json.load(
            open(train_fail_rate_result_file, "r"))

    return all_train_fail_rate_result_file_dic


def estimate_file_fail_test():
    '''
    Estimate the file rate of test files.
    '''
    print("\n##############  5. Estimate testing file fail rate process ##############\n")

    all_test_fail_rate_result_file_dic = {}

    for cut_point in range(TEST):
        cur_file = DATA_FILE_NAME[cut_point+TRAIN]
        cur_path = DATA_PATH + cur_file
        print(" Estimating testing file " +
              cur_file + " ")
        # estimate the fail rate of each testing file
        p = Preprocess(
            file_name=cur_path,
            min_times=MIN_TIMES,
            sheetname=SHEET_NAME)

        test_fail_rate_dic = p.estimate_fail()
        # print('testData_' + str(a+1), test_fail_rate_dic)

        all_test_fail_rate_result_file_dic[cur_file] = test_fail_rate_dic

    # write file
    #json.dump(all_test_fail_rate_result_file_dic, open(test_fail_rate_result_file_file, "w"))
    return all_test_fail_rate_result_file_dic


def formulate_input(dic):
    '''
    Formulate fail rate dictionary to the input of the algorithm.
    '''
    print("\n##############  2. Input the dataset formula  ############################")
    input_dataset = []

    for file in list(dic):
        input_dataset.append(list(dic[file]))
    print("\n  Dataset formula process finished.  ")
    return input_dataset


def rule_mining(dataset):
    '''
    Implement fp-growth.
    '''
    print("\n##############  3. Implement FP-growth process  ##########################")
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    df_rules = fpgrowth(df, min_support=MIN_SUPPORT,
                        max_len=MAX_LEN, use_colnames=True)
    print(df_rules)
    # sorted by support values
    df_rules = df_rules.sort_values(by='support', ascending=False)
    # create new df
    df_filter_rules = pd.DataFrame(columns=['support', 'itemsets'])

    # filer with the minimum length
    for index, row in df_rules.iterrows():
        if len(df_filter_rules.index) < K and len(list(row['itemsets'])) >= MIN_LEN:
            df_filter_rules.loc[len(df_filter_rules.index)] = \
                [row['support'], list(row['itemsets'])]
    print("\n  FP-growth process finished.  ")
    return df_filter_rules


def train_score_calculation(df_filter_rules, fail_result_dic):
    '''
    Estimate the score of training files.
    '''
    print("\n##############  4. Estimate the score of training files  ##################")

    training_file_score_dic = {}
    for r in range(TRAIN):
        cur_file = DATA_FILE_NAME[r]
        scoreList = []
        # print(fail_result_dic.get(cur_file))
        for j in range(K):
            score = 1
            for i in range(len(df_filter_rules['itemsets'][j])):
                tmp = df_filter_rules['itemsets'][j][i]
                if fail_result_dic.get(cur_file).get(tmp) is not None:
                    score *= fail_result_dic.get(cur_file).get(tmp)
                else:
                    score = 0.0
            scoreList.append(score)
        training_file_score_dic[cur_file] = scoreList
    print("\n  Estimation training score Finished.  ")

    return training_file_score_dic


def test_score_calculation(df_filter_rules, fail_result_dic):
    '''
    Estimate the score of testing files.
    '''

    print("\n##############  6. Estimate the score of testing files  ###################")
    test_file_score_dic = {}
    for r in range(TEST):
        cur_file = DATA_FILE_NAME[r+TRAIN]
        scoreList = []
        for j in range(K):
            score = 1
            for i in range(len(df_filter_rules['itemsets'][j])):
                tmp = df_filter_rules['itemsets'][j][i]
                if fail_result_dic.get(cur_file).get(tmp) is not None:
                    score *= fail_result_dic.get(cur_file).get(tmp)
                else:
                    score = 0.0
            scoreList.append(score)
        test_file_score_dic[cur_file] = scoreList

    print("\n  Estimation testing score Finished.  ")

    return test_file_score_dic


def similar(score_file_score_dic, test_file_score_dic):
    '''
    Similarity comparison and ranking.
    '''

    print("\n##############  7. Estimate the similarity between the Training/Testing file #####################")
    training_file_score = score_file_score_dic
    totalRanking_dic = {}
    for w in range(TEST):
        cur_file = DATA_FILE_NAME[w+TRAIN]
        dataSetI = test_file_score_dic.get(cur_file)
        rankingList = {}
        for p in range(TRAIN):
            train_file = DATA_FILE_NAME[p]
            dataSetII = training_file_score.get(train_file)
            result = 1 - spatial.distance.cosine(dataSetI, dataSetII)
            rankingList[train_file] = result
        afterRanking = dict(sorted(rankingList.items(),
                                   key=lambda item: item[1], reverse=True))
        totalRanking_dic[cur_file] = afterRanking

    jsonFile = open("Result/rankingResult.json", "w")
    jsonFile.write(json.dumps(totalRanking_dic, indent=2))
    jsonFile.close()
    print("\n  Estimation of similarity finished. \n ")


In [2]:
DATA_FILE_LIST = glob.glob(os.path.join(DATA_PATH, "*.xlsx"))
DATA_FILE_LIST = natsort.natsorted(DATA_FILE_LIST)
for file_path in DATA_FILE_LIST:
    DATA_FILE_NAME.append(os.path.basename(file_path))

#========================= TRAIN ===============================#
# Estimate fail rates of training files
all_train_fail_rate_result_file_dic = estimate_file_fail_train()
# Transfer the dictionary to algorithm input
input_dataset_train = formulate_input(all_train_fail_rate_result_file_dic)
    


##############  1. Estimate Training File Fail Rate Process ##############

--------> Training fail rate has been estimated before.

##############  2. Input the dataset formula  ############################

  Dataset formula process finished.  


In [3]:
df_rules = rule_mining(input_dataset_train)


##############  3. Implement FP-growth process  ##########################
       support                                           itemsets
0     0.400000                        (Init_Test_PvluLrGxFSeQXxs)
1     0.386667                        (Init_Test_nKhtqDhAqhpXRoB)
2     0.386667                        (Init_Test_PeYvNviBcAOXcoQ)
3     0.386667                        (Init_Test_GisqJogMpeIqvRH)
4     0.386667                        (Init_Test_vOPjhtZSHmDWEtj)
...        ...                                                ...
4527  0.200000  (Init_Test_YBEdStICRTVhROS, Init_Test_JFrdeOvN...
4528  0.200000  (Init_Test_PvluLrGxFSeQXxs, Init_Test_JFrdeOvN...
4529  0.200000  (Init_Test_bqMaiLbADYmLGPg, Init_Test_VWiekrNV...
4530  0.200000  (Init_Test_PeYvNviBcAOXcoQ, Init_Test_JAHLIeaJ...
4531  0.200000  (Init_Test_wepnFjTZBcLruVd, Init_Test_wgBjpWsL...

[4532 rows x 2 columns]

  FP-growth process finished.  


In [5]:
te = TransactionEncoder()
te_ary = te.fit(input_dataset_train).transform(input_dataset_train)
df = pd.DataFrame(te_ary, columns=te.columns_)
df_rules_all = fpgrowth(df, min_support=MIN_SUPPORT,
                        max_len=MAX_LEN, use_colnames=True)

In [70]:
from mlxtend.frequent_patterns import association_rules

rule = association_rules(df_rules_all, metric="confidence", min_threshold=0.5)

In [35]:
rule.to_csv('rule.csv')

In [71]:
ruleCopy = rule.copy()
df = ruleCopy.iloc[:, [0,1,5]]
answer = []
for i in df.iloc:
    if len(list(i['antecedents'])) > 1:
        antecedentsArray = list(i['antecedents'])
        length = len(antecedentsArray)
        for j in range(length):
            for z in range(j+1,length):
                answer.append([list(antecedentsArray[j]), int(str(round(i['confidence'],1)).split('.')[1]),list(antecedentsArray[z])])       
                answer.append([list(antecedentsArray[z]), int(str(round(i['confidence'],1)).split('.')[1]),list(antecedentsArray[j])])       
            answer.append([list(antecedentsArray[j]), int(str(round(i['confidence'],1)).split('.')[1]),list(i['consequents'])])
    else:
        answer.append([list(i['antecedents']), int(str(round(i['confidence'],1)).split('.')[1]),list(i['consequents'])])
print(answer)
    

[[['Init_Test_nKhtqDhAqhpXRoB'], 5, ['Init_Test_DrqobvatwhashXq']], [['Init_Test_DrqobvatwhashXq'], 5, ['Init_Test_nKhtqDhAqhpXRoB']], [['Init_Test_vOPjhtZSHmDWEtj'], 5, ['Init_Test_GisqJogMpeIqvRH']], [['Init_Test_GisqJogMpeIqvRH'], 5, ['Init_Test_vOPjhtZSHmDWEtj']], [['Init_Test_PvluLrGxFSeQXxs'], 5, ['Init_Test_OHcXnbpHqAOjgYK']], [['Init_Test_OHcXnbpHqAOjgYK'], 6, ['Init_Test_PvluLrGxFSeQXxs']], [['Init_Test_zeZZnfovuhbQPES'], 5, ['Init_Test_GisqJogMpeIqvRH']], [['Init_Test_GisqJogMpeIqvRH'], 5, ['Init_Test_zeZZnfovuhbQPES']], [['Init_Test_GisqJogMpeIqvRH'], 6, ['Init_Test_LjOhIHgKkRAwewb']], [['Init_Test_LjOhIHgKkRAwewb'], 6, ['Init_Test_GisqJogMpeIqvRH']], [['Init_Test_zeZZnfovuhbQPES'], 5, ['Init_Test_LjOhIHgKkRAwewb']], [['Init_Test_LjOhIHgKkRAwewb'], 5, ['Init_Test_zeZZnfovuhbQPES']], [['Init_Test_zunyUarEjTVTaVo'], 6, ['Init_Test_PvluLrGxFSeQXxs']], [['Init_Test_PvluLrGxFSeQXxs'], 5, ['Init_Test_zunyUarEjTVTaVo']], [['Init_Test_zunyUarEjTVTaVo'], 5, ['Init_Test_vOPjhtZSHmDWEt

In [40]:
proc_data = []
for rule in answer:
    PREC = 2
    a, c, conf = rule
    proc_data.append([a, c, conf])

filename = "./output.csv"
import csv
with open(filename, 'w') as f:
    writer = csv.writer(f)
    # rule format: antecedent --> consequent
    writer.writerow(["antecedent", "confidence",
                    "consequent"])
    writer.writerows(proc_data)

# Build KG

In [58]:
from torchkge.data_structures import KnowledgeGraph as KG
def loadCsvandBuildKG(pathName):
    df = pd.read_csv(pathName,header=None, names=['from','rel','to'],skiprows=1)
    kg = KG(df)
    return kg

In [59]:
kg = loadCsvandBuildKG("./output.csv")

# Build KGE

In [60]:
from torch import cuda
from torch.optim import Adam
from torchkge.models import TransEModel
from torchkge.sampling import BernoulliNegativeSampler
from torchkge.utils import MarginLoss, DataLoader
from tqdm.autonotebook import tqdm

In [61]:
def train(hypermaterDict,kg_train,setCuda = False):
    # Define some hyper-parameters for training
    emb_dim = hypermaterDict["emb_dim"]
    lr = hypermaterDict["lr"]
    n_epochs = hypermaterDict["n_epochs"]
    b_size = hypermaterDict["b_size"]
    margin = hypermaterDict["margin"]

    # Define the model and criterion
    model = TransEModel(emb_dim, kg_train.n_ent, kg_train.n_rel, dissimilarity_type='L2')
    criterion = MarginLoss(margin)

    #Move everything to CUDA if available
    if setCuda:
        if cuda.is_available():
            cuda.empty_cache()
            model.cuda()
            criterion.cuda()

    # Define the torch optimizer to be used
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    sampler = BernoulliNegativeSampler(kg_train)
    if setCuda:
        dataloader = DataLoader(kg_train, batch_size=b_size, use_cuda='all')
    else:
        dataloader = DataLoader(kg_train, batch_size=b_size)

    iterator = tqdm(range(n_epochs), unit='epoch')
    for epoch in iterator:
        running_loss = 0.0
        for i, batch in enumerate(dataloader):
            h, t, r = batch[0], batch[1], batch[2]
            n_h, n_t = sampler.corrupt_batch(h, t, r)

            optimizer.zero_grad()

            # forward + backward + optimize
            pos, neg = model(h, t, r, n_h, n_t)
            loss = criterion(pos, neg)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        iterator.set_description(
            'Epoch {} | mean loss: {:.5f}'.format(epoch + 1,
                                                running_loss / len(dataloader)))

    model.normalize_parameters()
    return model

In [67]:
import numpy as np
def _save_emb_results(G, model):
    ent_emb, rel_emb = model.get_embeddings()
    ent_emb = ent_emb.cpu().numpy()
    rel_emb = rel_emb.cpu().numpy()

    # index to IDs of entities and relations
    ix2ent_dict = {value: key for key, value in G.ent2ix.items()}
    ix2rel_dict = {value: key for key, value in G.rel2ix.items()}

    np.savetxt("./Result/ent_emb.csv", ent_emb, delimiter="\t")
    np.savetxt("./Result/rel_emb.csv", rel_emb, delimiter="\t")
    return ix2ent_dict, ix2rel_dict, rel_emb, ent_emb

In [63]:
hypermaterDict = {
    "emb_dim" : 100,
    "lr" : 0.0004,
    "n_epochs" : 1000,
    "b_size" : 8,
    "margin" : 0.5
}

In [68]:
model = train(hypermaterDict,kg)
ix2ent_dict, ix2rel_dict, rel_emb, ent_emb = _save_emb_results(kg,model)

Epoch 1000 | mean loss: 0.05256: 100%|██████████| 1000/1000 [00:40<00:00, 24.84epoch/s]


In [65]:
print(ix2ent_dict)
print(ix2rel_dict)

{0: "['Init_Test_BLsEsxPUJKHkuhg']", 1: "['Init_Test_CEAtnglhQIUaeCb']", 2: "['Init_Test_CqJtQQHCjcPvzrt']", 3: "['Init_Test_DIYXVNmJHMsGaCH']", 4: "['Init_Test_DMrSWGOzxPHFIVZ']", 5: "['Init_Test_DrqobvatwhashXq']", 6: "['Init_Test_DznmCuKTLzOmkmM']", 7: "['Init_Test_ESrRXcDjxQyzIue']", 8: "['Init_Test_FIfauipIynCmuqZ']", 9: "['Init_Test_FaWiZOhkQOwodyQ']", 10: "['Init_Test_FmaHzuORbxDYJBz']", 11: "['Init_Test_FrzttUlpKMzEMDB']", 12: "['Init_Test_FuqDTdfadicjACh']", 13: "['Init_Test_FxpkRAMNRiqWfMC']", 14: "['Init_Test_GLnJcxTzltwYWcH']", 15: "['Init_Test_GNPrlltbrDiECBb']", 16: "['Init_Test_GVoUuIBODnykVOK']", 17: "['Init_Test_GisqJogMpeIqvRH']", 18: "['Init_Test_HOXNuzuGqdeTFGo']", 19: "['Init_Test_HSAPVGNhfjOvyJc']", 20: "['Init_Test_HUrZvDcSsjBImmA']", 21: "['Init_Test_IBevczmnbTZVAUk']", 22: "['Init_Test_IGztaxHaFZfoNZx']", 23: "['Init_Test_IRpbLqpiSoAJLQt']", 24: "['Init_Test_IWiMGlYhhuHGzmh']", 25: "['Init_Test_IdFoplWtHsIjtMd']", 26: "['Init_Test_IlEKbkUHNsqDigi']", 27: "['Ini