In [1]:
import sys
cpath = !pwd
sys.path.append('/usr/src/app/algorithms/')
sys.path.append('/usr/src/app/')

In [2]:
from lionforests import LionForests
from algorithms.simpleSurrogate import GlobalSurrogateTree, LocalSurrogateTree
from algorithms.DefragTrees.defragTrees import DefragModel
from IPython.display import clear_output

from scipy import sparse

from datasets.dataset import Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd 
import numpy as np
np.seterr(invalid='ignore')
import warnings
warnings.filterwarnings("ignore")
import time

from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.metrics import f1_score, precision_score, mean_absolute_error

In [3]:
wine_q = Dataset()
X, y, feature_names, class_names = wine_q.load_wine_quality()

In [4]:
y.std()

0.8731880644450568

In [5]:
y = np.array(y)
#one_h_percent = int(min(10*len(X)/100,100))
#print("Instances:",one_h_percent)
new_fn = []
for i in feature_names:
    new_fn.append(i.replace(' ','_'))
feature_names = new_fn
feature_names

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [6]:
from IPython.display import clear_output
def measure(X_train, X_test, y_train, y_test, feature_names, class_names, random_state=10):
    parameters = [{
            'max_depth': [5],
            'max_features': [None],
            'bootstrap': [True],
            'min_samples_leaf' : [5],
            'n_estimators': [500]#1000
    }]
    lf = LionForests(None, False, None, feature_names, class_names)
    lf.fit(X_train, y_train, params=parameters)    
    train = lf.utilizer.transform(X_train)
    test = lf.utilizer.transform(X_test)

    predictions = lf.model.predict(train)
    test_predictions = lf.model.predict(test)

    def techniques(model, train, y_train, predictions, test, feature_names, class_names, lf, task, random_state=10):

        #BaselineTechnique ==============================================================================
        
        gt = GlobalSurrogateTree(train, predictions, feature_names, task, random_state)
        lt = LocalSurrogateTree(train, predictions, feature_names, task, 50, random_state)
        #print('    GT and LT Ready')

        #DefragTechnique ================================================================================
        Kmax = 10
        splitter = DefragModel.parseSLtrees(model) # parse sklearn tree ensembles into the array of (feature index, threshold)
        #print('    Done Splitting Starting Defrag')
        #mdl = DefragModel(modeltype=task, maxitr=10, qitr=0, tol=1e-6, restart=2, verbose=0, njobs=7)
        mdl = DefragModel(modeltype=task, maxitr=4, qitr=0, tol=1e-6, restart=2, njobs=7, seed=random_state)
        mdl.fit(train, predictions, splitter, Kmax, fittype='FAB', featurename=feature_names)
        def def_cov(instances):
            ts = time.time()
            score, cover, coll = mdl.evaluate(instances[1:],lf.model.predict(instances[1:]))
            def_predictions = mdl.predict(instances[1:])
            length, nodes = mdl.find_rule_length(instances[0])
            max_len = len(length)
            comp = {1:'>',0:'<='}
            rules = {}
            for f in feature_names:
                rules[f] = []
            counter = 0
            for rule, node in list(zip(mdl.rule_,nodes)):
                for conj in range(len(node)):
                    if node[conj] and counter<=max_len-1:
                        rules[feature_names[int(rule[conj][0]-1)]].append([comp[int(rule[conj][1])],rule[conj][2]])
                        counter += 1
            new_rules = {}
            for k, v in rules.items():
                if len(v) == 1:
                    new_rules[k] = v
                else:
                    mmin = None
                    mmax = None
                    for value in v:
                        if value[0]=='<=':
                            if mmin is None or mmin > value[1]:
                                mmin = value[1]
                        if value[0]=='>':
                            if mmax is None or mmax < value[1]:
                                mmax = value[1]
                    if mmin is not None:
                        new_rules[k] = [['<=', mmin]]
                    if mmax is not None:
                        new_rules[k] = [['>', mmax]]
            te = time.time()
            #print(mdl)
            return length, cover, def_predictions, te-ts, new_rules
    
        #LionForests
        def lf_rule(instance):
            temp = lf.explain(instance, instance_qe=0.4, method='R1', instance_random_state=random_state)
            error = temp[-1]
            rule = {}
            for key,value in temp[5].items():
                rule[key] = [['<=',value[1]],['>',value[0]]]
            return rule, error
        print('    LF Ready')

        return {'gs':gt.rule,'ls':lt.rule,'lf':lf_rule, 'df': def_cov}
        
    interpretation = techniques(lf.model, train, y_train, predictions, test, feature_names, class_names, lf, 'regression', random_state)
    def rule_cov(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] <= rule[feature][1][1]:#THIS <=
                        #covered = False
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        #covered = False
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        #covered = False
                        return 0
        
        return 1
    def rule_cov_LF(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] < rule[feature][1][1]:#THIS <=
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
        return 1

    rule_generator = interpretation
    full_coverage = {'gs':0, 'ls':0, 'lf':0, 'df':0}
    rule_length = {'gs':0, 'ls':0, 'lf':0, 'df':0}
    f_mae = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    time_response = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    rules = {'gs':[], 'ls':[], 'lf':[], 'df':[]}

    x_train_temp = train
    x_test_temp = test

    y_train_temp = predictions
    y_test_temp = test_predictions
    
    x_train_temp_lf = lf.utilizer.inverse_transform(x_train_temp)
    x_test_temp_lf = lf.utilizer.inverse_transform(x_test_temp)    
    ktime = time.time()
    for test_ind in range(len(test)):
        if test_ind % 10 == 0:
            print(round(test_ind/(len(test))*100,2),'in:', time.time()-ktime)
        for name, method in rule_generator.items():
            if name == 'df':
                #FIX RULE LENGTH!
                rule, cover, predictions, te, new_rules = method(np.concatenate((x_test_temp[test_ind:test_ind+1],x_test_temp)))
                f_mae[name].append(mean_absolute_error(predictions,y_test_temp))
                full_coverage[name] = full_coverage[name] + cover
                rules['df'].append(new_rules)
            elif name == 'lf':
                ts = time.time()
                rule, error = method(x_test_temp_lf[test_ind])
                te = time.time() - ts
                coverage = 0
                mae = []
                co = 0
                for i in x_test_temp_lf:
                    res = rule_cov_LF(i, feature_names, rule)
                    coverage = coverage + res
                    if res == 1:
                        if str(error) != 'nan' and str(error) != 'None':
                            mae.append(error)
                    co = co + 1
                if len(mae) >= 1:
                    mae = np.array(mae)
                    f_mae[name].append(mae.mean())
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp_lf)
                rules['lf'].append(rule)
            else:
                ts = time.time()
                rule, prediction = method(x_test_temp[test_ind])
                te = time.time() - ts
                coverage = 0
                error = []
                co = 0 
                for i in x_test_temp:
                    res = rule_cov(i, feature_names, rule)
                    coverage = coverage + res
                    error.append([prediction, y_test_temp[co]])
                    co = co + 1
                if len(error) >= 1:
                    error = np.array(error)
                    f_mae[name].append(mean_absolute_error(error[:,:1],error[:,1:]))
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp)
                rules[name].append(rule)
                #clear_output()
            time_response[name].append(te)
            rule_length[name] = rule_length[name] + len(rule)
        #clear_output
    return rule_generator, full_coverage, rule_length, f_mae, time_response, rules

In [7]:
from IPython.display import clear_output
total_results2 = []
test_size_2 = []
for rand in [7, 10, 77]: #7
    total_results = []
    test_size = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=7)
    results = measure(X_train, X_test[:500], y_train, y_test[:500], feature_names, class_names, random_state=rand)
    total_results.append(results)
    test_size.append(len(X_test[:500]))
    test_size_2.append(test_size)
    total_results2.append(total_results)
    clear_output()

In [8]:
from utilities.lionforests_utility import path_similarity
rule_variance = {'gs':[], 'ls':[],'lf':[], 'df':[]}


folds = 0
test_size = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)

min_max_feature_values = {}
for i in range(len(feature_names)):
        min_max_feature_values[feature_names[i]] = [min(X_train[:, i]), max(X_train[:, i])] 
for name in ['gs', 'ls', 'lf', 'df']:
    for k in range(len(total_results2[0][folds][-1][name])):
        r1 = total_results2[0][0][-1][name][k]
        r2 = total_results2[1][0][-1][name][k]
        r3 = total_results2[2][0][-1][name][k]
        #try:
        rule_variance[name].append((path_similarity(r1, r2, feature_names, min_max_feature_values)+
                                       path_similarity(r1, r3, feature_names, min_max_feature_values)+
                                       path_similarity(r2, r3, feature_names, min_max_feature_values))/3)

In [9]:
f_full_coverage = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
f_rule_length = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
f_f_precision = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
f_f_time = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
for b in range(3):
    full_coverage = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    rule_length = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    f_precision = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    f_time = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    k = 0
    for i in total_results2[b]:
        for name, method in i[0].items():
            full_coverage[name].append(i[1][name]/test_size_2[b][k])
            rule_length[name].append(i[2][name]/test_size_2[b][k])
            f_precision[name].append(np.array(i[3][name]).mean())
            f_time[name].append(np.array(i[4][name]).mean())
        k = + 1
    for name, method in i[0].items():
        f_full_coverage[name].append(np.array(full_coverage[name]).mean())
        f_rule_length[name].append(np.array(rule_length[name]).mean())
        f_f_precision[name].append(np.array(f_precision[name]).mean())
        f_f_time[name].append(np.array(f_time[name]).mean())
for name, method in i[0].items():
    print(name,  '| %5.4f  %5.3f | %5.4f %5.3f | %5.4f  %5.3f | %5.4f  %5.3f | %5.4f  %5.3f' 
          % (np.array(f_full_coverage[name]).mean(),np.array(f_full_coverage[name]).std(),
             np.array(f_rule_length[name]).mean(),np.array(f_rule_length[name]).std(),
             np.array(f_f_precision[name]).mean(),np.array(f_f_precision[name]).std(),
             np.array(f_f_time[name]).mean(),np.array(f_f_time[name]).std(),
             np.array(rule_variance[name]).mean(),np.array(rule_variance[name]).std()))

gs | 0.0635  0.000 | 2.8880 0.000 | 0.5426  0.000 | 0.0004  0.000 | 0.9435  0.060
ls | 0.0785  0.002 | 3.1640 0.025 | 0.5409  0.001 | 0.9941  0.017 | 0.2785  2.868
lf | 0.0021  0.000 | 8.4280 0.000 | 0.2530  0.000 | 0.3941  0.002 | 1.0000  0.000
df | 0.9993  0.001 | 9.9613 1.685 | 0.2678  0.029 | 0.4035  0.028 | 0.4982  0.102


In [10]:
import csv

with open('wine.csv', 'w', encoding='UTF8') as f:
    writer = csv.writer(f)
    for name, method in i[0].items():
        writer.writerow([name, np.array(f_full_coverage[name]).mean(),np.array(f_full_coverage[name]).std(),
             np.array(f_rule_length[name]).mean(),np.array(f_rule_length[name]).std(),
             np.array(f_f_precision[name]).mean(),np.array(f_f_precision[name]).std(),
             np.array(f_f_time[name]).mean(),np.array(f_f_time[name]).std(),
             np.array(rule_variance[name]).mean(),np.array(rule_variance[name]).std()])

In [3]:
import re
import pickle
with open('hoc', 'rb') as handle:
    hoc_dict = pickle.load(handle)
    X= hoc_dict['x']

In [16]:
temp = X[-1]
temp.encode().decode('unicode_escape')

'Aberrant glucose metabolism characterized by high levels of glycolysis , even in the presence of oxygen , is an important hallmark of cancer . This metabolic reprogramming referred to as the Warburg effect is essential to the survival of tumor cells and provides them with substrates required for biomass generation . Molecular mechanisms responsible for this shift in glucose metabolism remain elusive . As described herein , we found that aberrant expression of the proinflammatory protein transglutaminase 2 ( TG2 ) is an important regulator of the Warburg effect in mammary epithelial cells . Mechanistically , TG2 regulated metabolic reprogramming by constitutively activating nuclear factor ( NF)-κB , which binds to the hypoxia-inducible factor ( HIF)-1α promoter and induces its expression even under normoxic conditions . TG2/NF-κB-induced increase in HIF-1α expression was associated with increased glucose uptake , increased lactate production and decreased oxygen consumption by mitochon

In [90]:
text = ' piceatannol has potent anti a-inflammatory , immuno.modulatory , anticancer and antiproliferative effects . however , little is known about the mechanism by which piceatannol inhibits invasion and metastasis . the aim of the current study was to investigate the effects of piceatannol on the expression of matrix metalloproteinase 9 ( mmp 9 ) in du145 human prostate cancer cells . the results revealed that mmp 9 activity was significantly increased in response to tumor necrosis factor !\u03b1 ( tnf a\u03b1 ) . however , treatment with piceatannol reversed tnf \u03b1  and mmp 9 induced gelatin zymography and its gene expression . in addition , a matrigel invasion assay determined that piceatannol reduces the tnf \u03b1 induced invasion of du145 cells . nuclear factor \u03ba b ( nf \u03bab ) is a significant transcription factor that regulates numerous genes involved in tumor cell invasion and metastasis . therefore , whether piceatannol acts on nf \u03bab to regulate mmp 9 gene expression was analyzed . the results revealed that piceatannol attenuates mmp 9 gene expression via the suppression of nf \u03bab activity . using a specific nf \u03bab inhibitor , pyrrolidine dithiocarbamate , it was confirmed that tnf \u03b1 induced mmp 9 gene expression is primarily regulated by nf \u03bab activation . piceatannol inhibited nf \u03bab activity by suppressing nuclear translocation of the nf \u03bab p65 and p50 subunits . furthermore , tnf \u03b1 induced akt phosphorylation was significantly downregulated in the presence of piceatannol . the akt inhibitor ly294002 caused a significant decrease in tnf \u03b1 induced nf \u03bab activity and mmp 9 gene expression . overall , these data suggest that piceatannol inhibits tnf \u03b1 induced invasion by suppression of mmp 9 activation via the akt mediated nf \u03bab pathway in du145 prostate cancer cells .'
#text = text.encode()
#text.tostring()

In [108]:
text = ' piceatannol has potent anti a-inflammatory , immuno.modulatory , anticancer and antiproliferative effects . however , little is known about the mechanism by which piceatannol inhibits invasion and metastasis . the aim of the current study was to investigate the effects of piceatannol on the expression of matrix metalloproteinase 9 ( mmp 9 ) in du145 human prostate cancer cells . the results revealed that mmp 9 activity was significantly increased in response to tumor necrosis factor !\u03b1 ( tnf a\u03b1 ) . however , treatment with piceatannol reversed tnf \u03b1  and mmp 9 induced gelatin zymography and its gene expression . in addition , a matrigel invasion assay determined that piceatannol reduces the tnf \u03b1 induced invasion of du145 cells . nuclear factor \u03ba b ( nf \u03bab ) is a significant transcription factor that regulates numerous genes involved in tumor cell invasion and metastasis . therefore , whether piceatannol acts on nf \u03bab to regulate mmp 9 gene expression was analyzed . the results revealed that piceatannol attenuates mmp 9 gene expression via the suppression of nf \u03bab activity . using a specific nf \u03bab inhibitor , pyrrolidine dithiocarbamate , it was confirmed that tnf \u03b1 induced mmp 9 gene expression is primarily regulated by nf \u03bab activation . piceatannol inhibited nf \u03bab activity by suppressing nuclear translocation of the nf \u03bab p65 and p50 subunits . furthermore , tnf \u03b1 induced akt phosphorylation was significantly downregulated in the presence of piceatannol . the akt inhibitor ly294002 caused a significant decrease in tnf \u03b1 induced nf \u03bab activity and mmp 9 gene expression . overall , these data suggest that piceatannol inhibits tnf \u03b1 induced invasion by suppression of mmp 9 activation via the akt mediated nf \u03bab pathway in du145 prostate cancer cells .'
print(text)
text = text.lower()
text = text.encode('UTF-8','replace')
print(text)
text = re.sub(rb'[^\x00-\x7F]+', rb'', text)
text = re.sub(rb"\s's\b", rb"'s", text)
text = re.sub(rb'(\S)\.', rb'\g<1>,', text)
text = re.sub(rb'\.(\S)', rb',\g<1>', text)
text = re.sub(rb'\-', rb' ', text)
text = text.decode()
text

 piceatannol has potent anti a-inflammatory , immuno.modulatory , anticancer and antiproliferative effects . however , little is known about the mechanism by which piceatannol inhibits invasion and metastasis . the aim of the current study was to investigate the effects of piceatannol on the expression of matrix metalloproteinase 9 ( mmp 9 ) in du145 human prostate cancer cells . the results revealed that mmp 9 activity was significantly increased in response to tumor necrosis factor !α ( tnf aα ) . however , treatment with piceatannol reversed tnf α  and mmp 9 induced gelatin zymography and its gene expression . in addition , a matrigel invasion assay determined that piceatannol reduces the tnf α induced invasion of du145 cells . nuclear factor κ b ( nf κb ) is a significant transcription factor that regulates numerous genes involved in tumor cell invasion and metastasis . therefore , whether piceatannol acts on nf κb to regulate mmp 9 gene expression was analyzed . the results reveal

' piceatannol has potent anti a inflammatory , immuno,modulatory , anticancer and antiproliferative effects . however , little is known about the mechanism by which piceatannol inhibits invasion and metastasis . the aim of the current study was to investigate the effects of piceatannol on the expression of matrix metalloproteinase 9 ( mmp 9 ) in du145 human prostate cancer cells . the results revealed that mmp 9 activity was significantly increased in response to tumor necrosis factor ! ( tnf a ) . however , treatment with piceatannol reversed tnf   and mmp 9 induced gelatin zymography and its gene expression . in addition , a matrigel invasion assay determined that piceatannol reduces the tnf  induced invasion of du145 cells . nuclear factor  b ( nf b ) is a significant transcription factor that regulates numerous genes involved in tumor cell invasion and metastasis . therefore , whether piceatannol acts on nf b to regulate mmp 9 gene expression was analyzed . the results revealed tha

In [45]:
text.copy()

AttributeError: 'str' object has no attribute 'copy'