In [None]:
import platform

# Tell the reader whether the running interpreter is exactly the version
# (3.9.6) this notebook was written against.
if platform.python_version() == '3.9.6':
    print('You are using the recommended version for this script!')
else:
    print('You are not using the recommended version for this script!')

In [None]:
from tqdm import tqdm
import ast
import itertools

In [None]:
### Functions for getting data from files

## Get raw data from file
def get_data_from_file(file_name):
    """Read a text file and return its content split into lines.

    Args:
        file_name: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        list[str]: The file content split on '\n'. A file that ends with a
        newline therefore yields a trailing empty string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(file_name) as file:
        content = file.read()

    return content.split('\n')
     
        
## Get sections (default file name is diagnoses.csv)
def get_sections(file_name = 'diagnoses.csv'):
    """Load the per-person sections from a tab-separated file.

    The first line is treated as a title row and skipped, as is every line
    that does not contain at least one tab. For each remaining line the
    second-to-last column is parsed as a Python literal and stored under the
    value of the first column.

    Args:
        file_name: Path of the tab-separated file (default 'diagnoses.csv').

    Returns:
        dict: first-column value -> parsed second-to-last column.
    """
    rows = get_data_from_file(file_name)
    result = {}
    # rows[1:] drops the title row
    for row in rows[1:]:
        fields = row.split('\t')
        if len(fields) > 1:
            result[fields[0]] = ast.literal_eval(fields[-2])

    return result

In [None]:
### Create containers for data

## Trajectories used in generating the data
# NOTE(review): `trajectories` is a project-local module not visible here;
# the returned control trajectories are what the later cells align against.
from trajectories import get_control_trajectories
control_trajectories = get_control_trajectories()

## Patient disease trajectories
# Dict keyed by the first column of diagnoses.csv (presumably the person id);
# each value is the parsed list of diagnosis records for that person.
sections = get_sections()

In [None]:
### Functions for comparing trajectories

## Local alignment
from local_alignment import get_local_alignment

In [None]:
### Function for getting similar trajectories inside sections, with helper functions

## Get the count of how many times each diagnosis occurs within the data
def get_counts(persons):
    """Count how often each diagnosis code occurs across all persons.

    Args:
        persons: Mapping of person key -> iterable of diagnosis records.
            Only the first element of each record (record[0]) is used; it
            is treated as the diagnosis code.

    Returns:
        dict: code -> number of records carrying that code. Repeated codes
        within one person are counted each time. Empty input gives {}.
    """
    counts = {}
    for records in persons.values():
        for diagnose in records:
            code = diagnose[0]
            # dict.get replaces the former bare `except:` that was used as
            # control flow and would also have hidden unrelated errors.
            counts[code] = counts.get(code, 0) + 1
    return counts

## Create a list that has all the significant diagnoses
def create_motive(counts, min, max):
    """Select the diagnoses whose occurrence count lies within [min, max].

    Args:
        counts: Mapping of diagnosis -> occurrence count.
        min: Smallest count (inclusive) for a diagnosis to be kept.
        max: Largest count (inclusive) for a diagnosis to be kept.

    Returns:
        list: The kept diagnoses, in the iteration order of `counts`.
    """
    return [
        code
        for code, occurrences in counts.items()
        if occurrences >= min and occurrences <= max
    ]

## Filter out diagnoses that are not included in the motive
def find_most_frequently_used_together_diagnoses(persons, motive):
    """Reduce every person's records to the diagnosis codes listed in `motive`.

    Args:
        persons: Mapping of person key -> iterable of diagnosis records; the
            first element of each record (record[0]) is the diagnosis code.
        motive: Collection of diagnosis codes to keep.

    Returns:
        list[list]: One list per person, in the iteration order of
        `persons`, holding the kept codes in their original order. A person
        without any kept code contributes an empty list.
    """
    # Membership is tested once per record, so a hash set avoids rescanning
    # the motive list every time. Fall back to the given collection if the
    # codes turn out to be unhashable.
    try:
        wanted = frozenset(motive)
    except TypeError:
        wanted = motive

    return [
        [diagnose[0] for diagnose in persons[p] if diagnose[0] in wanted]
        for p in persons
    ]

## Cluster trajectories by comparing every trajectory against every other one
def cluster_trajectories(trajectories, min_score = 5, min_cluster_size = 6):
    """Group trajectories by pairwise local-alignment score.

    Every trajectory is compared against every other trajectory (each
    ordered pair is evaluated, so the alignment is computed for (i, j) and
    for (j, i)). Trajectory i collects all other trajectories whose score
    against it reaches `min_score`; only indices that collected at least
    `min_cluster_size` trajectories are returned.

    Args:
        trajectories: Sequence of trajectories (lists of diagnosis codes).
        min_score: Minimum get_local_alignment score for two trajectories
            to be considered similar (default 5, the previous constant).
        min_cluster_size: Minimum number of similar trajectories an index
            must have to be kept (default 6, the previous constant).

    Returns:
        dict: index into `trajectories` -> list of similar trajectories.
    """
    clusters = {}
    for (i, t1) in enumerate(tqdm(trajectories, desc="Started clustering")):
        for (j, t2) in enumerate(trajectories):
            # NOTE(review): assumes get_local_alignment returns a number
            # comparable with min_score; confirm against local_alignment.
            if i != j and get_local_alignment(t1, t2) >= min_score:
                # setdefault replaces the former bare `except:` fallback
                clusters.setdefault(i, []).append(t2)

    return {k: v for (k, v) in clusters.items() if len(v) >= min_cluster_size}

## Remove diagnoses that have no significance from trajectories
def clean_clusters_trajectories(clusters, trajectories, min_support = 2):
    """Keep, per cluster, only the diagnoses that recur among its members.

    For each cluster key `c`, the reference trajectory is `trajectories[c]`.
    Every occurrence of a diagnosis in the cluster's member trajectories is
    counted if that diagnosis is also part of the reference trajectory;
    diagnoses counted at least `min_support` times are kept.

    Args:
        clusters: Mapping of key -> list of member trajectories.
        trajectories: Container indexable by the keys of `clusters`, giving
            the reference trajectory of each cluster.
        min_support: Minimum number of occurrences for a diagnosis to be
            kept (default 2, the previous hard-coded threshold).

    Returns:
        dict: cluster key -> list of kept diagnoses, ordered by their first
        appearance in the member trajectories (not by reference order).
    """
    clean_trajectories = {}
    for c, members in clusters.items():
        trajectory = trajectories[c]
        scores = {}
        for t in members:
            for diagnose in t:
                if diagnose in trajectory:
                    # dict.get replaces the former bare `except:` fallback
                    scores[diagnose] = scores.get(diagnose, 0) + 1

        clean_trajectories[c] = [
            diagnose for diagnose, seen in scores.items() if seen >= min_support
        ]

    return clean_trajectories
    
def get_trajectories(sections, motive_limit = (2, 100)):
    """Run the full clustering pipeline on the per-person sections.

    Steps: count the diagnoses, keep those whose count lies within
    `motive_limit`, reduce every person to the kept diagnoses, cluster the
    reduced trajectories and finally clean each cluster's trajectory.

    Args:
        sections: Mapping of person key -> list of diagnosis records.
        motive_limit: (lowest, highest) inclusive occurrence count for a
            diagnosis to be considered.

    Returns:
        tuple: (clusters, clean_trajectories) as produced by
        cluster_trajectories and clean_clusters_trajectories.
    """
    diagnosis_counts = get_counts(sections)
    lowest, highest = motive_limit[0], motive_limit[1]
    kept_diagnoses = create_motive(diagnosis_counts, lowest, highest)
    reduced = find_most_frequently_used_together_diagnoses(sections, kept_diagnoses)
    grouped = cluster_trajectories(reduced)

    return grouped, clean_clusters_trajectories(grouped, reduced)



In [None]:
# `clusters` maps a trajectory index to its similar trajectories;
# `trajectories` holds the cleaned diagnosis list per cluster (same keys).
clusters, trajectories = get_trajectories(sections)

In [None]:
### Comparing how similar are the trajectories to the ones used as signal
# Histogram: alignment score -> number of (cleaned trajectory, control
# trajectory) pairs that reached exactly that score.
hits = {}
for t in trajectories:
    for c_t in control_trajectories:
        score = get_local_alignment(trajectories[t], c_t)
        # dict.get replaces the former bare `except:` fallback
        hits[score] = hits.get(score, 0) + 1
hits

In [None]:
### Comparing all possible trajectories against the signal
# Every person's full list of diagnosis codes (element 0 of each record)
all_trajectories = [[t[0] for t in sections[s]] for s in sections]
# Histogram: alignment score -> number of (person, control trajectory) pairs
all_hits = {}
for t in all_trajectories:
    for c_t in control_trajectories:
        score = get_local_alignment(t, c_t)
        # dict.get replaces the former bare `except:` fallback
        all_hits[score] = all_hits.get(score, 0) + 1
all_hits

In [None]:
### Comparing how similar are trajectories in the same cluster
# similarity_scores: cluster key -> mean pairwise alignment score of its members
# no_trajectories: number of clusters whose cleaned trajectory has >= 3 diagnoses
similarity_scores = {}
no_trajectories = 0

for c in clusters:
    if len(trajectories[c]) >= 3:
        no_trajectories += 1
        pairs = list(itertools.combinations(clusters[c], 2))
        if not pairs:
            # A cluster with fewer than two members has nothing to compare
            continue
        score = sum(get_local_alignment(first, second) for (first, second) in pairs)
        similarity_scores[c] = score / len(pairs)

# Mean over the qualifying clusters; NaN instead of a ZeroDivisionError when
# no cluster qualified.
sum(similarity_scores.values()) / len(similarity_scores) if similarity_scores else float('nan')

In [None]:
# Number of clusters whose cleaned trajectory contains at least 3 diagnoses
no_trajectories

In [None]:
### Comparing how similar are trajectories overall
# Align diagnosis codes only (element 0 of every record). That is the form
# get_local_alignment receives everywhere else in this notebook; passing the
# raw records would measure something different from the in-cluster
# similarity this baseline is compared with.
code_trajectories = {s: [t[0] for t in sections[s]] for s in sections}

score = 0
all_pairs = list(itertools.combinations(sections, 2))
for pair in tqdm(all_pairs, desc='Finding similarity in all pairs...'):
    score += get_local_alignment(code_trajectories[pair[0]], code_trajectories[pair[1]])

# Fewer than two persons means no pairs: show NaN instead of dividing by zero
score / len(all_pairs) if all_pairs else float('nan')