In [1]:
###TODO NB! There are no methods, all should be referred to as functions!!!!!

In [2]:
import pandas as pd
import ast
import itertools
import operator

In [3]:
### Functions for getting data from files

## Get raw data from file
def get_data_from_file(file_name):
    file = open(file_name)
    content = file.read()
    lines = content.split('\n')
    file.close()
    
    return lines

## Get trajectories (default file name is trajectories.csv)
def get_trajectories(file_name = 'trajectories.csv'):
    trajectories = dict()
    lines = get_data_from_file(file_name)
    for (index, line) in enumerate(lines):
        cols = line.split('\t')
        if index == 0 or len(cols) <= 1: #Ignore title row and empty rows
            continue
        trajectories[cols[0]] = cols[1:]
    
    return trajectories        
        
## Get sections (default file name is diagnoses.csv)
def get_sections(file_name = 'diagnoses.csv'):
    person_sections = dict()
    lines = get_data_from_file(file_name)
    for (index, line) in enumerate(lines):
        cols = line.split('\t')
        if index == 0 or len(cols) <= 1: #Ignore title row and empty rows
            continue
        sections = ast.literal_eval(cols[-2])
        person_sections[cols[0]] = sections
        
    return person_sections

In [4]:
### Create containers for data

sections = get_sections()
trajectories = get_trajectories()

In [5]:
### Helper methods for testing comparing methods

## Return n number of best matching trajectory pairs 
## (if n is bigger than the length of trajectories, than all trajectories will be returned)
def helper_get_n_similar_trajectories(data, compare_function, n=100, name='funtion'):
    pairs = list(itertools.combinations(data, 2)) #Get all possible pairs
    results = {}
    percent = 0
    for (index, pair) in enumerate(pairs):
        complete = int((index + 1) / len(pairs) * 100)
        if index == 0:
            print('Started working on: %s' % (name))
        elif complete == 100:
            print('%s has completed' % (name))
        elif complete % 10 == 0 and percent != complete:
            print(str(complete) + '% complete')
            percent = complete
        results[pair] = compare_function(data[pair[0]], data[pair[1]])
        
    return dict(sorted(results.items(), key=operator.itemgetter(1), reverse=True)[:n])

In [6]:
### Function for geting the number of matches between two trajectories

def get_overall_similartity(t1, t2):
    score = 0
    for t in t1:
        if t in t2:
            score += 1
    
    return score

In [7]:
### Methods for comparing trajectories

## Get local alignment between person1(horizontal) and person2(vertical)
def get_local_alignment(p1, p2):
    #Parameters
    match = 1
    mismatch = 0
    gap_penalty = -1
    
    #Initialisation
    matrix = [[0 for i in range(len(p2) + 1)] for j in range(len(p1) + 1)]
    for i in range(len(p1) + 1):
        matrix[i][0] = i * gap_penalty
    for i in range(len(p2) + 1):
        matrix[0][i] = i * gap_penalty
        
    #Fill
    for i in range(len(p1)):
        for j in range(len(p2)):
            p1_diagnosis = p1[i][0]
            p2_diagnosis = p2[j][0]
            left = matrix[i][j+1] + gap_penalty
            right = matrix[i+1][j] + gap_penalty
            diagonal = matrix[i][j] + (match if p1_diagnosis == p2_diagnosis else mismatch)
            matrix[i+1][j+1] = max(left, right, diagonal, 0)
    
    #Find the biggest alignment value
    MAX = None
    for row in matrix:
        row_max = max(row)
        MAX = row_max if MAX == None or row_max > MAX else MAX
        
    return MAX

In [8]:
### NB comparing people, there is no need for a matrix, because comparing p1 and p2 is the same as comparing p2 and p1
### Find n of the most similar ppl from trajectories and then with the algo and see how similar the result is

In [9]:
### Test functions

n = 30 #Sample number
control_set = helper_get_n_similar_trajectories(trajectories, get_overall_similartity, n, 'control_set')
test_function_1 = helper_get_n_similar_trajectories(sections, get_local_alignment, n, 'test_function_1')

Started working on: control_set
10% complete
20% complete
30% complete
40% complete
50% complete
60% complete
70% complete
80% complete
90% complete
control_set has completed
Started working on: test_function_1
10% complete
20% complete
30% complete
40% complete
50% complete
60% complete
70% complete
80% complete
90% complete
test_function_1 has completed


In [26]:
### Find out how many inside treshold n are the same

def compare_pairs(pair1, pair2):
    return pair1[0] == pair2[0] and pair1[1] == pair2[1] or pair1[0] == pair2[1] and pair1[1] == pair2[0]

results_for = {
    'test_function_1': 0
}

for res in control_set:
    for fun in results_for:
        elements = list(globals()[fun].keys())
        for elem in elements:
            print(compare_pairs(res, elem))
            if compare_pairs(res, elem):
                results_for[fun] += 1
                
results_for


False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

{'test_function_1': 0}

In [27]:
control_set

{('1', '88'): 6,
 ('1', '93'): 6,
 ('24', '113'): 6,
 ('24', '160'): 6,
 ('88', '93'): 6,
 ('113', '160'): 6,
 ('1', '24'): 5,
 ('1', '36'): 5,
 ('1', '46'): 5,
 ('1', '58'): 5,
 ('1', '60'): 5,
 ('1', '80'): 5,
 ('1', '91'): 5,
 ('1', '100'): 5,
 ('1', '113'): 5,
 ('1', '160'): 5,
 ('1', '179'): 5,
 ('1', '197'): 5,
 ('1', '212'): 5,
 ('1', '219'): 5,
 ('1', '220'): 5,
 ('2', '91'): 5,
 ('2', '242'): 5,
 ('3', '14'): 5,
 ('3', '46'): 5,
 ('3', '91'): 5,
 ('14', '46'): 5,
 ('14', '91'): 5,
 ('24', '36'): 5,
 ('24', '46'): 5}

In [28]:
test_function_1

{('156', '237'): 11,
 ('201', '237'): 11,
 ('156', '201'): 10,
 ('156', '252'): 9,
 ('237', '252'): 9,
 ('156', '245'): 8,
 ('9', '292'): 7,
 ('18', '156'): 7,
 ('18', '201'): 7,
 ('18', '252'): 7,
 ('25', '237'): 7,
 ('36', '93'): 7,
 ('60', '136'): 7,
 ('96', '136'): 7,
 ('108', '156'): 7,
 ('108', '201'): 7,
 ('201', '245'): 7,
 ('201', '252'): 7,
 ('237', '244'): 7,
 ('1', '18'): 6,
 ('2', '243'): 6,
 ('9', '10'): 6,
 ('9', '106'): 6,
 ('10', '17'): 6,
 ('14', '18'): 6,
 ('14', '201'): 6,
 ('14', '237'): 6,
 ('18', '46'): 6,
 ('18', '108'): 6,
 ('18', '237'): 6}