In [36]:
import pandas as pd
import ast
import itertools
import operator

In [21]:
### Functions for getting data from files

## Get raw data from file
def get_data_from_file(file_name):
    file = open(file_name)
    content = file.read()
    lines = content.split('\n')
    file.close()
    
    return lines

## Get trajectories (default file name is trajectories.csv)
def get_trajectories(file_name = 'trajectories.csv'):
    trajectories = dict()
    lines = get_data_from_file(file_name)
    for (index, line) in enumerate(lines):
        cols = line.split('\t')
        if index == 0 or len(cols) <= 1: #Ignore title row and empty rows
            continue
        trajectories[cols[0]] = cols[1:]
    
    return trajectories        
        
## Get sections (default file name is diagnoses.csv)
def get_sections(file_name = 'diagnoses.csv'):
    person_sections = dict()
    lines = get_data_from_file(file_name)
    for (index, line) in enumerate(lines):
        cols = line.split('\t')
        if index == 0 or len(cols) <= 1: #Ignore title row and empty rows
            continue
        sections = ast.literal_eval(cols[-2])
        person_sections[cols[0]] = sections
        
    return person_sections

In [25]:
### Create containers for data

sections = get_sections()
trajectories = get_trajectories()

In [38]:
### Generate control data set

def get_overall_similartity(t1, t2):
    score = 0
    for t in t1:
        if t in t2:
            score += 1
    
    return score

## Return n number of best matching trajectory pairs 
## (if n is bigger than the length of trajectories, than all trajectories will be returned)
def get_control_set(control_items, n=100):
    pairs = list(itertools.combinations(control_items, 2)) #Get all possible pairs
    results = {}
    for pair in pairs:
        results[pair] = get_overall_similartity(pair[0], pair[1])
        
    return dict(sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)[:n])

control_set = get_control_set(trajectories)

control_set

AttributeError: 'dict' object has no attribute 'iteritems'

In [18]:
### Methods for comparing trajectories


## Get local alignment between person1(horizontal) and person2(vertical)
def get_local_alignment(p1, p2):
    #Parameters
    match = 1
    mismatch = 0
    gap_penalty = -1
    
    #Initialisation
    matrix = [[0 for i in range(len(p2) + 1)] for j in range(len(p1) + 1)]
    for i in range(len(p1) + 1):
        matrix[i][0] = i * gap_penalty
    for i in range(len(p2) + 1):
        matrix[0][i] = i * gap_penalty
        
    #Fill
    for i in range(len(p1)):
        for j in range(len(p2)):
            p1_diagnosis = p1[i][0]
            p2_diagnosis = p2[j][0]
            left = matrix[i][j+1] + gap_penalty
            right = matrix[i+1][j] + gap_penalty
            diagonal = matrix[i][j] + (match if p1_diagnosis == p2_diagnosis else mismatch)
            matrix[i+1][j+1] = max(left, right, diagonal, 0)
    
    #Find the biggest alignment value
    MAX = None
    for row in matrix:
        row_max = max(row)
        MAX = row_max if MAX == None or row_max > MAX else MAX
        
    return MAX

get_local_alignment(sections['48'], sections['2'])

2

In [94]:
### NB comparing people, there is no need for a matrix, because comparing p1 and p2 is the same as comparing p2 and p1


4

In [105]:
### Find n of the most similar ppl from trajectories and then with the algo and see how similar the result is

TypeError: '<' not supported between instances of 'int' and 'NoneType'

{'1': ['A00', 'G43', 'H65', 'I10', 'K13', 'Z04'],
 '2': ['A00', 'G43', 'I10', 'N85', 'Z04'],
 '3': ['A00', 'I10', 'K13', 'N85', 'Z04'],
 '4': ['A00', 'G43', 'I10', 'N85'],
 '6': ['A00', 'G43', 'K13', 'N85'],
 '8': ['G43', 'I10', 'K13'],
 '9': ['A00', 'G43', 'I10', 'K13'],
 '10': ['G43', 'I10', 'K13'],
 '12': ['G43'],
 '13': ['H65', 'K13'],
 '14': ['A00', 'I10', 'K13', 'N85', 'Z04'],
 '15': ['A00', 'G43', 'H65'],
 '16': ['A00', 'H65', 'K13'],
 '17': ['A00', 'I10', 'Z04'],
 '18': ['A00', 'H65', 'I10', 'K13'],
 '19': ['A00', 'H65', 'K13', 'N85'],
 '21': ['A00', 'H65', 'I10', 'K13'],
 '23': ['A00', 'K13', 'Z04'],
 '24': ['A00', 'G43', 'H65', 'I10', 'K13', 'N85'],
 '27': ['G43', 'H65', 'I10', 'Z04'],
 '29': ['A00', 'I10', 'K13', 'Z04'],
 '30': ['A00', 'G43', 'I10', 'K13'],
 '31': ['A00', 'G43', 'H65', 'Z04'],
 '33': ['A00', 'H65', 'K13', 'Z04'],
 '35': ['A00', 'G43', 'I10', 'K13'],
 '36': ['A00', 'G43', 'H65', 'I10', 'K13'],
 '37': ['A00', 'K13', 'Z04'],
 '44': ['A00', 'G43', 'H65', 'Z04'],

In [33]:
a = [1,2,3,4]
a[:9]

[1, 2, 3, 4]