In [1]:
import platform

print('You are using the recommended version for this script!' 
      if platform.python_version() == '3.9.6' else 
      'You are not using the recommended version for this script!')

You are using the recommended version for this script!


In [2]:
###TODO NB! There are no methods, all should be referred to as functions!!!!!

In [3]:
import pandas as pd
from tqdm import tqdm
import ast
import itertools
import operator
import math

In [4]:
### Functions for getting data from files

## Get raw data from file
def get_data_from_file(file_name):
    file = open(file_name)
    content = file.read()
    lines = content.split('\n')
    file.close()
    
    return lines

## Get trajectories (default file name is trajectories.csv)
def get_trajectories(file_name = 'trajectories.csv'):
    trajectories = dict()
    lines = get_data_from_file(file_name)
    for (index, line) in enumerate(lines):
        cols = line.split('\t')
        if index == 0 or len(cols) <= 1: #Ignore title row and empty rows
            continue
        trajectories[cols[0]] = cols[1:]
    
    return trajectories        
        
## Get sections (default file name is diagnoses.csv)
def get_sections(file_name = 'diagnoses.csv'):
    person_sections = dict()
    lines = get_data_from_file(file_name)
    for (index, line) in enumerate(lines):
        cols = line.split('\t')
        if index == 0 or len(cols) <= 1: #Ignore title row and empty rows
            continue
        sections = ast.literal_eval(cols[-2])
        person_sections[cols[0]] = sections
        
    return person_sections

In [5]:
### Create containers for data

sections = get_sections()
trajectories = get_trajectories()

# sections = get_sections('diagnoses_1.csv')
# trajectories = get_trajectories('trajectories_1.csv')

In [6]:
### Helper methods for testing comparing methods

## Return n number of best matching trajectory pairs 
## (if n is bigger than the length of trajectories, than all trajectories will be returned)
def helper_get_n_similar_trajectories(data, compare_function, n=100, name='funtion', **kwargs):
    pairs = list(itertools.combinations(data, 2)) #Get all possible pairs
    results = {}
    print('Started working on: %s' % (name))
    for pair in tqdm(pairs):
        results[pair] = compare_function(data[pair[0]], data[pair[1]], **kwargs)
        
    return dict(sorted(results.items(), key=operator.itemgetter(1), reverse=True)[:n])

In [7]:
### Function for geting the number of matches between two trajectories

def get_overall_similartity(t1, t2):
    score = 0
    for t in t1:
        if t in t2:
            score += 1
    
    return score

In [8]:
### Methods for comparing trajectories

## Import all algo implementations
from local_alignment import get_local_alignment
from model_alignment import get_model_alignment

## Import list compare methods
from compare_lists import *

0.34642857142857136


In [9]:
results = {}

In [10]:
### Test functions

n_coef = 1/3
n = math.ceil(len(sections) * n_coef) #Sample number

def alignment_template(fun, name, match=None, mismatch=None, gap_penalty=None):
    results[name] = helper_get_n_similar_trajectories(
        sections, fun, n, name, match=match, mismatch=mismatch, gap_penalty=gap_penalty)

control_set = helper_get_n_similar_trajectories(trajectories, get_overall_similartity, n, 'control_set')
# 25 - 31
# alignment_template(get_local_alignment, 'get_local_alignment_test_1_0_0', 1, 0, 0)
for i in range(30, 31):
    for j in range(99, 100):
        name = 'get_model_alignment_' + str(i+1) + '_' + str(j)
        results[name] = get_model_alignment(
                                            sections, 
                                            motive_limit=(0,999),
                                            max_motive_length=999,
                                            match_limit=(0, 999),
                                            order_reverse=False, 
                                            name=name
                                        )[:200000000000]



Started working on: control_set


100%|██████████████████████████████| 120786/120786 [00:00<00:00, 1848521.52it/s]

Started working on: get_model_alignment_31_99





In [11]:
### Find out how many inside treshold n are the same

def compare_pairs(pair1, pair2):
    return (str(pair1[0]) == str(pair2[0]) and str(pair1[1]) == str(pair2[1]) or 
                str(pair1[0]) == str(pair2[1]) and str(pair1[1]) == str(pair2[0]))

results_score = {}

for res in tqdm(control_set):
    for function in results:
        elements = results[function]
        for elem in elements:
            if compare_pairs(res, elem):
                try:
                    results_score[function] += 1
                except:
                    results_score[function] = 1
            else:
                try:
                    results_score[function] = results_score[function]
                except:
                    results_score[function] = 0
                
results_score


100%|█████████████████████████████████████████| 334/334 [01:11<00:00,  4.67it/s]


{'get_model_alignment_31_99': 334}

In [12]:
control_set

{('8', '25'): 7,
 ('8', '235'): 7,
 ('8', '378'): 7,
 ('8', '490'): 7,
 ('8', '754'): 7,
 ('8', '864'): 7,
 ('25', '235'): 7,
 ('25', '378'): 7,
 ('25', '490'): 7,
 ('25', '754'): 7,
 ('25', '864'): 7,
 ('235', '378'): 7,
 ('235', '490'): 7,
 ('235', '754'): 7,
 ('235', '864'): 7,
 ('378', '490'): 7,
 ('378', '754'): 7,
 ('378', '864'): 7,
 ('490', '754'): 7,
 ('490', '864'): 7,
 ('754', '864'): 7,
 ('8', '37'): 6,
 ('8', '59'): 6,
 ('8', '65'): 6,
 ('8', '82'): 6,
 ('8', '83'): 6,
 ('8', '98'): 6,
 ('8', '153'): 6,
 ('8', '199'): 6,
 ('8', '216'): 6,
 ('8', '270'): 6,
 ('8', '290'): 6,
 ('8', '291'): 6,
 ('8', '303'): 6,
 ('8', '304'): 6,
 ('8', '319'): 6,
 ('8', '332'): 6,
 ('8', '348'): 6,
 ('8', '375'): 6,
 ('8', '414'): 6,
 ('8', '416'): 6,
 ('8', '423'): 6,
 ('8', '446'): 6,
 ('8', '458'): 6,
 ('8', '459'): 6,
 ('8', '470'): 6,
 ('8', '499'): 6,
 ('8', '506'): 6,
 ('8', '612'): 6,
 ('8', '716'): 6,
 ('8', '731'): 6,
 ('8', '789'): 6,
 ('8', '821'): 6,
 ('8', '845'): 6,
 ('8', '85

In [13]:
count_lists_similarities([c for c in control_set], results['get_model_alignment'])

KeyError: 'get_model_alignment'

In [None]:
compare_lists_distance([c for c in control_set], results['get_model_alignment'])

In [None]:
get_spearmanr_coeficent([c for c in control_set], results['get_model_alignment'])