# COLLABORATIVE FILTERING
## Inspired by the research paper 
## "Time to CARE: a collaborative engine for practical disease prediction"

# Quick Introduction

#### *Collaborative Filtering is a recommender system designed to predict the preferences of a person based on the perferences of other similar users (sort of like netflix/amazon recommendations)*

#### *This technique is based on the assumption that people enjoy similar items as their peers*

#### *Applying it to healthcare data, we can generate predictions on other diseases based on a subset of similar patients*

#### *Well suited to disease prediction due to the known collaborative nature of diseases*

# Methods

<dt>1. Basic CARE framework</dt>

<dd>- Employ standard Collaborative Filtering applications</dd>

<dt>2. ICARE</dt>

<dd>- An iterative version of CARE that incorporates new elements of significance testing and ensemble methods</dd>

<dt>3. Time-sensitive ICARE</dt>

<dd>- ICARE system which uses exploits the known ordering of disease diagnoses</dd>

<dd>- These improvements make it applicable to long-term, diverse data</dd>

# Work Flow
![diagram](Diagram.png)

# Step 1: Structure the Data

In [1]:
import numpy as np
import pandas as pd
import time
import math
import sys
import csv
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder

# Some useful helper functions

In [2]:
def calculate_age(born):
    today = date.today()
    b_date = datetime.strptime(born, '%m/%d/%Y')
    return today.year - b_date.year - ((today.month, today.day) < (b_date.month, b_date.day))

def calculate_gender(gender):
    if gender == 'M':
        return 0
    elif gender == 'F':
        return 1
    else:
        return 2
    
def check_valid(code, disease_codes):

    if code == 0 or code == '-------':
        return 0

    if code in disease_codes:
        return code
    else:
        new_code = '0' + code
        if new_code in disease_codes:
            return new_code
        else:
            new_code2 = '0' + new_code
            if new_code2 in disease_codes:
                return new_code2
            else:
                return 0

def parse_diags(diag_list, disease_codes):
    new_list = []
    for diag in diag_list:
        new_diag = check_valid(diag, disease_codes)
        (new_list.append(new_diag) if new_diag is not 0 else 0)

    return new_list

# Visit class

In [3]:
class Visit:
    
    """Holds information about a visit for each patient"""

    def __init__(self, adj_date, visit):
        self.visit = visit
        self.adj_date = adj_date

    def getVisit(self):
        return self.visit

    def getDate(self):
        return self.adj_date

# Patient Class

In [4]:
class Patient:
    
    """Holds information regarding the details of each patient"""

    def __init__(self, mem_id, gender, age, visit, adj_date):
        self.visits = []
        self.mem_id = mem_id
        self.gender = gender
        self.age = age
        #v = Visit(adj_date, visit)
        self.visits.append(Visit(adj_date, visit))
        
    def getMemID(self):
        return self.mem_id

    def getGender(self):
        return self.gender

    def getAge(self):
        return self.age
    
    def getVisits(self):
        return self.visits
    
    def addVisit(self, adj_date, visit):
        self.visits.append(Visit(adj_date, visit))
        self.visits.sort(key=lambda x: x.adj_date)
        
    def getUnique(self):
        unique_codes = set()
        for visit in self.visits:
            unique_codes |= set(visit.getVisit())
        return unique_codes

# Sample line from a data file

#### TBD

# Parse CSV for dictionary

In [5]:
def parseCSV(categoryfile, labelfile):
    """
    USAGE
    categoryfile - The file provided by HCUP. Should be called '$dxref 2015.csv'
    labelfile - The file provided by HCUP. Should be called 'dxlabel 2015.csv'

    RETURNS
    dictionary - Dictionary mapping from {icd9 codes : HCUP category }
    i2d - Index to diagnosis.. { column index : ic9 diagnosis code }
    d2i - Diagnosis to index.. { ic0 diagnosis code : column index }
    
        * note * i2d and d2i includes the age and gender as the first 2 indexes
    """

    dictionary, better_dictionary, labels, d2i, i2d = {}, {}, {}, {}, {}
    diseases = []


    # add birthday & gender to the d2i and i2d dictionaries
    d2i['Age'] = 0
    i2d[0] = 'Age'
    d2i['Gender'] = 1
    i2d[1] = 'Gender'

    # parse the diagnosis code labels
    count = 0
    with open(labelfile, 'rb') as csvfile:
        datareader = csv.reader(csvfile)

        for row in datareader:
            if count > 3:
                labels[int(row[0])] = row[1]
                #maplabels[count-4] = int(row[0])
                diseases.append(row[1])

            count+=1

    csvfile.close()

    # parse the diagnosis codes file
    count = 0
    with open(categoryfile, 'rb') as csvfile:
        datareader = csv.reader(csvfile)

        for row in datareader:
            if count >= 3:
                row[0] = row[0].replace("'","").strip()
                row[1] = row[1].replace("'", "").strip()
                dictionary[row[0]] = labels[int(row[1])]
                better_dictionary[row[0]] = row[3]
                d2i[row[0]] = count - 1
                i2d[count - 1] = row[0]
            

            count+=1

    csvfile.close()

    return dictionary, i2d, d2i, diseases, better_dictionary


# Parsing CSV files, create the DataFrame

In [6]:
def cleanData(filename):
    relevant_columns = ['Member System ID', 'Adjudication Date', 'Patient Birth Date', \
                        'Patient Gender Code', 'Diagnosis One Code', \
                        'Diagnosis Two Code', 'Diagnosis Three Code', \
                        'Diagnosis Four Code', 'Diagnosis Five Code']

    df = pd.read_csv(file1, usecols=relevant_columns, dtype=np.str)\
            .drop_duplicates()\
            .reset_index().drop('index', axis=1).fillna(0)
    df = df[relevant_columns]
    df['Adjudication Date'] = pd.to_datetime(df['Adjudication Date'], format='%m/%d/%Y')
    df['Patient Birth Date'] = df['Patient Birth Date'].apply(calculate_age)
    df['Patient Gender Code'] = df['Patient Gender Code'].apply(calculate_gender)

    return df

# Create Patient Database

In [7]:
def createPatients(df, disease_codes):
    patients, diseases = {}, {}

    for row in df.itertuples():
        mem_id = row[1]
        adj_date = row[2]
        age = row[3]
        gender = row[4]
        visit = parse_diags(row[5:], disease_codes)
        
        for item in visit:
            if item not in diseases:
                diseases[item] = set()
            diseases[item].add(mem_id)

        if mem_id not in patients:
            p = Patient(mem_id, gender, age, visit, adj_date)
            patients[mem_id] = p
        else:
            patients[mem_id].addVisit(adj_date, visit)

    return patients, diseases

# Set up for CARE

In [8]:
def setupCARE(filename):
    categoryfile = '$dxref 2015.csv'
    labelfile = 'dxlabel 2015.csv'
    df = cleanData(filename)
    dic, i2d, d2i, foo_diseases, better_dic = parseCSV(categoryfile, labelfile)
    disease_codes = set(dic.keys())
    patients, diseases = createPatients(df, disease_codes)
    return patients, diseases, disease_codes, dic, better_dic

In [16]:
start_time = time.time()
file1 = 'file1.csv'
file2 = 'file2.csv'
file3 = 'file3.csv'
patients, diseases, disease_codes, dic, better_dic = setupCARE(file1)
print('--- %s seconds ---' %(time.time() - start_time))

--- 5.27904891968 seconds ---


# Step 2: Filtering for training set

#### *We obtain our training set by filtering out our patient database with users with at least 2 common diseases with the target patient*

#### *This serves to remove the influence of patients with little or no similarity with the target patient*

#### *Does not result in loss of information, and reduces runtime of the algorithm*

In [10]:
def train_filter(target, patients, diseases):
    # filter patients that have at least 2 common diseases
    patient_train = {}
    disease_train = {}
    target_diseases = target.getUnique()

    #for disease in target_diseases:
    #    disease_train[disease] = set()
    #    disease_train[disease].add(target.getMemID())
    
    for patient in patients.values():
        #patient_diseases = patient.getUnique()
        combined = target_diseases & patient.getUnique()
        if len(combined) >= 2:
            patient_train[patient.getMemID()] = patient
            for disease in patient.getUnique():
                if disease not in disease_train:
                    disease_train[disease] = set()
                disease_train[disease].add(patient.getMemID())
        
    return patient_train, disease_train

# C.A.R.E

# The meat of it all:

In [11]:
def implementCARE(target, patients, diseases, disease_codes):

    ### VECTOR SIMILARITY ###
    def vote(patient, disease):
        if disease in patient.getUnique():
            return 1.0
        else:
            return 0.0

    def f(j):
        """Returns: log(# of patients in database / # of patients with disease j)"""

        return np.log( (1.0)*len(patients) / len(diseases[j]) )


    # possible optimization: turn the getUnique() set into a numpy array, then do
    # array-wise multiplication...nah
    def w(a, i):
        total_sum = 0
        combined = a.getUnique() & i.getUnique()
        for disease in combined:
            first_half = f(disease) / math.sqrt(sum(f(k)**2 for k in a.getUnique()))
            second_half = f(disease) / math.sqrt(sum(f(k)**2 for k in i.getUnique()))
            total_sum += first_half * second_half
        return total_sum

    ### PREDICTION SCORE ###
    def V(j):
        return (1.0) * len(diseases[j]) / len(patients)

    def K(a):
        return 1.0 / (sum(w(a, i) for i in patients.values()))

    def p(a, j):
        return V(j) + K(a) * (1.0 - V(j)) * (sum(w(a, patients[i]) for i in diseases[j]))

    ### BEGIN PREDICTION ###
    # TODO: need to fix & filter diseases to the train_set
    #print(p(target, '6961'))
    disease_score = []
    for disease in diseases.keys():
        score = p(target, disease)
        disease_score.append([score, disease])
    
    disease_score.sort(key = lambda x: x[0], reverse=True)

    return disease_score

# Printing functions for patients

In [38]:
def printPatient(patient, dic):
    count = 1
    print('The patient has the following diseases:')
    for disease in patient.getUnique():
        print('\t%d. ' %count + dic[disease] + ' (' + disease + ')')
        count+=1
    print('\n')

In [48]:
def printDiseases(patient, predictedDiseases, dic):
    count = 1
    print('The patient has a possibility of getting the following 10 diseases:')
    for disease in predictedDiseases:
        if disease[1] in patient.getUnique():
            continue
        print('\t%d. ' %count + dic[disease[1]] + ' (' + disease[1] + \
              ') -- ' + '{0:.2f}'.format(disease[0]))
        count+=1
        if count == 11:
            return

# Step 3: Experiments

# Create our example patient to be used as the target

In [46]:
patient_zero = Patient('1', '0', '57', ['7020', '6989', '73300'], '05/31/1994')

printPatient(patient_zero, better_dic)

The patient has the following diseases:
	1. ACTINIC KERATOSIS (Begin 1991) (7020)
	2. OSTEOPOROSIS NOS (73300)
	3. PRURITIC DISORDER NOS (6989)




# Create our training sample size from the database based on our target patient

In [30]:
patient_train, disease_train = train_filter(patient_zero, patients, diseases)

# Make CARE prediction

In [49]:
predDisease = implementCARE(patient_zero, patient_train, disease_train, disease_codes)
#printPatient(patient_zero, better_dic)
printDiseases(patient_zero, predDisease[:20], better_dic)

The patient has a possibility of getting the following 10 diseases:
	1. OTHER PSORIASIS (6961) -- 1.00
	2. HYPERLIPIDEMIA NEC/NOS (2724) -- 0.32
	3. ROUTINE MEDICAL EXAM (V700) -- 0.28
	4. Routine physicl lab exam (Begin 2009) (V7262) -- 0.28
	5. BENIGN NEOPLASM SKIN NOS (2169) -- 0.26
	6. PRURIGO (6982) -- 0.26
	7. LOCAL SKIN INFECTION NOS (6869) -- 0.26
	8. GENERAL OSTEOARTHROSIS (71509) -- 0.26
	9. OTHER SKIN DISORDERS (Begin 1994) (70909) -- 0.22
	10. OTH CHRON SOLAR DERMATI (Begin 1992) (69274) -- 0.22


# OPTIMIZED VECTOR SIMILARITY (DONT RUN THIS EITHER)

In [42]:
def w_optimize(a, i):
    total_sum = 0
    combined = a.getUnique() & i.getUnique()
    print(combined)
    for disease in combined:
        first_half = f(disease) / math.sqrt(sum(f(k)**2 * vote(a, k)**2 for k in a.getUnique()))
        print(first_half)
        second_half = f(disease) / math.sqrt(sum(f(k)**2 * vote(i, k)**2 for k in i.getUnique()))
        print(second_half)
        total_sum += first_half * second_half
    return total_sum

# TRYING TO OPTIMIZE

###### Finding sum is way too expensive (need to loop thru 66,370 patients, then 4,000 possible diseases)

###### Thats a runtime of 265,480,000!!!!! (plus smaller loops not accounted for)

###### probs need to use np array, then do array multiplication then sum

###### j = 4000, worst case of J<sub>a</sub> or J<sub>i</sub> is 46. These two variables (J<sub>a</sub> & J<sub>i</sub>) are the diseases evident in each patient

###### num_patients = 8836

# VECTOR SIMILARITY (DONT RUN THIS)

In [23]:
def vote(patient, disease):
    if disease in patient.getUnique():
        return 1.0
    else:
        return 0.0

def f(j):
    """Returns: log(# of patients in database / # of patients with disease j)
    
    """
    return np.log( (1.0)*len(patient_train) / len(disease_train[j]) )

def w(a, i):
    total_sum = 0
    for j in diseases.keys():
        first_half = (f(j) * vote(a, j)) / math.sqrt(sum(f(k)**2 * vote(a, k)**2 for k in a.getUnique()))
        second_half = (f(j) * vote(i, j)) / math.sqrt(sum(f(k)**2 * vote(i, k)**2 for k in i.getUnique()))
        total_sum += first_half * second_half
    return total_sum

# PREDICTION SCORE (PLS DONT RUN THIS)

In [25]:
def V(j):
    return (1.0) * len(disease_train[j]) / len(patient_train)

def K(a):
    return 1.0 / (sum(w_optimize(a, i) for i in patient_train.values()))

def p(a, j):
    total = V(j) + K(a) * (1.0 - V(j))
    # gonna need a double for loop or something
    total = total * (sum(w_optimize(a, patient_train[i]) for i in disease_train[j]))
    return total
    #return V(j) + K(a) * (1.0 - V(j)) * (sum(w(a, i) for i in patients[diseases[j]]))

# TESTING FOR CORRECTNESS(DONT RUN THIS)

###### so we're trying to see if the training filters work correctly.

###### To do this, take the target patient, and look at all the diseases they have.

###### Now look at a patient in the training set. If more than 2 common diseases, should exist. Else, should not exist.

In [45]:
start_time = time.time()
#disease_code = ['4011', '25000', '2720', '6961', '2382', '7068', 'V5869']
disease_code = disease_train.keys()
disease_score = []
for i in disease_code:
    score = p(patient_zero, i)
    disease_score.append([score, i])

#print(score)
disease_score.sort(key=lambda x: x[0])
print(disease_score[:20])
print('--- %s seconds ---' %(time.time() - start_time))

[[0.11808826179198098, '78079'], [0.12783675801876943, 'V0481'], [0.1528603118765986, '30000'], [0.19128729945019815, '25002'], [0.23278714372129169, '6968'], [0.27434219801387738, '27800'], [0.2972912102105737, '2689'], [0.36655080203503471, '53081'], [0.58876326733091311, '2722'], [0.69235659664553695, '6960'], [1.1264787535348559, '2449'], [1.5415099458144201, 'V5869'], [1.7840775368738557, '5715'], [2.3939829504740628, 'V700'], [8.4511597775345848, '2724'], [9.9063547547541706, '4019'], [43.982989579477703, '2720'], [67.479329923218145, '25000'], [85.025261558745754, '4011'], [279.55186592532448, '6961']]
--- 308.983735085 seconds ---
