In [1]:
import numpy as np
import pandas as pd
import logging
import pickle
from pgmpy.sampling import GibbsSampling
from pgmpy.models import MarkovModel, BayesianModel
from pgmpy.sampling import BayesianModelSampling
from pgmpy.factors.discrete import State
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import networkx as nx
from networkx.drawing.nx_pylab import draw_networkx
from networkx.algorithms.cycles import find_cycle
from pgmpy.estimators import BdeuScore, K2Score, BicScore
from pomegranate import *
logging.basicConfig(filename='train_test.log',level=logging.DEBUG)

In [2]:
def get_accuracy(model,X_test,y_test):
    pred = model.predict(X_test,n_jobs = 16)
    return accuracy_score(y_test,pred)
    
def check_dag(edge_list):
    G = nx.DiGraph(edge_list)
    try:
        cycle = find_cycle(G, orientation='original')
    except:
        return False
    return cycle

def generate_edges(child,parents):
    edge_list = []
    for parent in parents:
        edge_list.append([parent,child])
    return edge_list

In [3]:
def get_cycle_edge(cycle,score,list_of_edges):
    cycle_edge_scores = []
    for parent,child,direction in cycle:
        cycle_edge_scores.append(score.local_score(child,[parent]))
    min_index = cycle_edge_scores.index(min(cycle_edge_scores))
    low_score_edge = cycle[min_index][:-1]
    return low_score_edge

def get_model_architecture(df):
    score = K2Score(df)
    list_of_edges = []
    for i in df.columns:
        edge_scores = []
        for j in df.columns:
            if i!=j:
                sco =score.local_score(i,[j])
                edge_scores.append((i,j,sco))
        edge_scores.sort(key = lambda x:x[2],reverse = True)
        parents = [edge_scores[0][1]]
        best_score = edge_scores[0][2]
        for v in range(1,10):
            parents.append(edge_scores[v][1])
            new_score = score.local_score(i,parents)
            if new_score > best_score:
                best_score = new_score
            else:
                parents = parents[:-1]
                break
        list_of_edges += generate_edges(i,parents)
        cycle = check_dag(list_of_edges)
        while cycle:
            low_score_edge = get_cycle_edge(cycle,score,list_of_edges)
            list_of_edges.remove(list(low_score_edge))
            cycle = check_dag(list_of_edges)
    return list_of_edges

In [4]:
columns = ['Pneumothorax','Consolidation','Fracture','Lung Lesion','Enlarged Cardiomediastinum',
            'Pneumonia','Pleural Other','No Finding','Cardiomegaly','Lung Opacity',
            'Edema','Pleural Effusion','Atelectasis','Support Devices']

string_cols = ['Sex', 'Age', 'labels']

In [5]:
for column in columns:
    print(column)
    filename = "_".join([w.lower() for w in column.split(" ")])
    sample_df = pd.read_csv('rejection_sampled_'+filename+'.csv')
    feat_cols = sample_df.columns[sample_df.columns!='labels']
    X_train,X_test,y_train,y_test = train_test_split(sample_df[feat_cols],sample_df['labels'],test_size = 0.3,random_state = 43)
    X_train[column] = y_train
    list_of_edges = get_model_architecture(X_train)
    list_of_edges = tuple([tuple(i[1:]) for i in list_of_edges])
    model = MarkovNetwork.from_structure(X_train.values,list_of_edges)
    with open('rejection_markov_'+filename.lower()+'_model.pkl','wb') as f:
        pickle.dump(model,f,protocol=4)

Pneumothorax


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices