In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans


patients_df = pd.read_csv('../data/patients_data (1).csv')
doctors_df = pd.read_csv('../data/doctors-data.csv')


In [4]:
patients_df.head()

Unnamed: 0,patient_id,first_name,last_name,date_of_birth,location,language_preference,Insurance_plan,sex,ethnicity,doctor's_gender_preference,location_radious_prefernce,consultation_location,problem,budget_max,communication_preference
0,p1,Mark,Kirby,2/3/2003,Brookline,,Humana,F,Hispanic,No Preference,30 miles,virtual,,$300.00,less conversational
1,p2,Lauren,Smith,8/19/1941,Back Bay,,MassHealth,M,,F,30 miles,No Preference,fever,,more conversational
2,p3,Eric,Reynolds,12/1/2004,Cambridge,Mandarin,Humana,M,Hispanic,No Preference,50 miles,No Preference,headache,$500.00,less conversational
3,p4,Mark,Payne,5/28/1951,Roxbury,Russian,Blue Cross Blue Shield,F,Latino,F,30 miles,virtual,cough,$200.00,
4,p5,Bobby,Martinez,10/18/2005,Charlestown,Russian,MassHealth,M,White,M,No Preference,In-person,fatigue,,more conversational


In [5]:
doctors_df.head()

Unnamed: 0,doctor_id,first_name,last_name,date_of_birth,location,language_1,language_2,sex,ethnicity,availability,...,cost_min,cost_max,preventive_care,Insurance_Aetna,Insurance_Anthem,Insurance_Blue Cross Blue Shield,Insurance_Centene,Insurance_Cigna,Insurance_Humana,Insurance_Kaiser Permanente
0,d1,Crystal,Anthony,9/4/1983,Dorchester,English,Russian,F,Middle Eastern,evening,...,$30.00,$500.00,Yes,1,0,0,0,1,0,0
1,d2,Johnny,Gray,1/7/1975,South End,English,Spanish,M,,evening,...,$150.00,$200.00,No,0,0,1,0,1,0,0
2,d3,Mark,Short,1/15/1987,West End,English,French,Other,Asian,afternoon,...,$30.00,$200.00,No,1,1,0,0,1,0,1
3,d4,Roy,Burgess,10/17/1966,Dorchester,English,French,M,Hispanic,evening,...,$100.00,$200.00,Yes,0,1,1,1,1,0,1
4,d5,Richard,Watson,2/25/1976,South End,English,French,Other,Hispanic,afternoon,...,$150.00,$200.00,No,1,0,0,1,1,0,1


In [6]:
from datetime import datetime

def calculate_age(dob):
    try:
        dob = datetime.strptime(dob, "%m/%d/%Y")
        today = datetime.today()
        return today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
    except:
        return None


In [7]:
# Basic preprocessing steps for patients_data
patients_df['age'] = patients_df['date_of_birth'].apply(calculate_age)
patients_df['budget_max'] = patients_df['budget_max'].replace('[\$,]', '', regex=True).astype(float)

# Fill missing values for categorical data with 'Unknown' or the most frequent value ('mode')
categorical_columns_patients = ['location', 'language_preference', 'sex', 'ethnicity', 
                                'doctor\'s_gender_preference', 'location_radious_prefernce', 
                                'consultation_location', 'problem', 'communication_preference']
patients_df[categorical_columns_patients] = patients_df[categorical_columns_patients].fillna('Unknown')

# Basic preprocessing steps for doctors_data
doctors_df['age'] = doctors_df['date_of_birth'].apply(calculate_age)
doctors_df['cost_min'] = doctors_df['cost_min'].replace('[\$,]', '', regex=True).astype(float)
doctors_df['cost_max'] = doctors_df['cost_max'].replace('[\$,]', '', regex=True).astype(float)

# Fill missing values for categorical data with 'Unknown' or the most frequent value ('mode')
categorical_columns_doctors = ['location', 'language_1', 'language_2', 'sex', 'ethnicity', 'availability', 
                               'wait_times', 'review', 'speciality', 'hospital_affiliation1', 
                               'hospital_affiliation2', 'hospital_affiliation3', 'virtual_consultation', 
                               'accessibility', 'cleanliness', 'staff_friendliness', 'preventive_care']
doctors_df[categorical_columns_doctors] = doctors_df[categorical_columns_doctors].fillna('Unknown')



In [8]:
# Encode categorical variables using one-hot encoding for both dataframes
patients_df_encoded = pd.get_dummies(patients_df, columns=categorical_columns_patients)
doctors_df_encoded = pd.get_dummies(doctors_df, columns=categorical_columns_doctors)

# Now let's select the columns that we want to use for the collaborative filtering model
selected_columns_patients = [
    'date_of_birth', 'location', 'language_preference', 'Insurance_plan', 'sex', 
    'ethnicity', 'doctor\'s_gender_preference', 'location_radious_prefernce', 
    'consultation_location', 'problem', 'budget_max', 'communication_preference', 'age'
]

selected_columns_doctors = [
    'date_of_birth', 'location', 'language_1', 'language_2', 'sex', 'ethnicity', 
    'availability', 'wait_times', 'review', 'speciality', 'hospital_affiliation1', 
    'hospital_affiliation2', 'hospital_affiliation3', 'virtual_consultation', 
    'accessibility', 'cleanliness', 'staff_friendliness', 'cost_min', 'cost_max', 
    'preventive_care', 'Insurance_Aetna', 'Insurance_Anthem', 
    'Insurance_Blue Cross Blue Shield', 'Insurance_Centene', 'Insurance_Cigna', 
    'Insurance_Humana', 'Insurance_Kaiser Permanente', 'age'
]


In [9]:
# For patients data
patients_ohe_columns = [col for col in patients_df_encoded if col not in patients_df]
# For doctors data
doctors_ohe_columns = [col for col in doctors_df_encoded if col not in doctors_df]


In [10]:
# Now we select the necessary columns for the recommendation model
patients_model_columns = ['age', 'budget_max'] + patients_ohe_columns
doctors_model_columns = ['age', 'cost_min', 'cost_max'] + doctors_ohe_columns

# Creating the final dataframes for the model
patients_final_df = patients_df_encoded[patients_model_columns].fillna(0)
doctors_final_df = doctors_df_encoded[doctors_model_columns].fillna(0)

patients_final_df.head(), doctors_final_df.head()

(   age  budget_max  location_Back Bay  location_Beacon Hill  \
 0   21       300.0                  0                     0   
 1   82         0.0                  1                     0   
 2   19       500.0                  0                     0   
 3   72       200.0                  0                     0   
 4   18         0.0                  0                     0   
 
    location_Brighton  location_Brookline  location_Cambridge  \
 0                  0                   1                   0   
 1                  0                   0                   0   
 2                  0                   0                   1   
 3                  0                   0                   0   
 4                  0                   0                   0   
 
    location_Charlestown  location_Dorchester  location_Roxbury  ...  \
 0                     0                    0                 0  ...   
 1                     0                    0                 0  ...   
 2    

In [11]:
print(patients_final_df.info())
print(doctors_final_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 50 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   age                                           100 non-null    int64  
 1   budget_max                                    100 non-null    float64
 2   location_Back Bay                             100 non-null    uint8  
 3   location_Beacon Hill                          100 non-null    uint8  
 4   location_Brighton                             100 non-null    uint8  
 5   location_Brookline                            100 non-null    uint8  
 6   location_Cambridge                            100 non-null    uint8  
 7   location_Charlestown                          100 non-null    uint8  
 8   location_Dorchester                           100 non-null    uint8  
 9   location_Roxbury                              100 non-null    uint

In [17]:
doctors_final_df.to_csv('new_updated_doctors.csv', index=False)
patients_final_df.to_csv('new_updated_patients.csv', index=False)

In [47]:
# Compute user similarity matrix for patients
patient_features = patients_final_df.drop(['age', 'budget_max'], axis=1)  # Drop non-categorical columns for similarity
patient_similarity_matrix = cosine_similarity(patient_features)

In [48]:
def find_similar_patients(patient_index, similarity_matrix, top_n=5):
    patient_similarities = similarity_matrix[patient_index]
    similar_patients_indices = patient_similarities.argsort()[-top_n-1:-1][::-1]  # Exclude self, hence -1
    return similar_patients_indices, patient_similarities[similar_patients_indices]

In [49]:
def aggregate_preferences(similar_patient_indices, patient_df):
    # Simple aggregation could be mean of numerical features; more complex logic might be needed based on actual data
    aggregated_profile = patient_df.iloc[similar_patient_indices].mean(axis=0)
    return aggregated_profile


In [50]:
def recommend_doctors_for_patient(patient_index, patient_df, doctor_df, top_n=5):
    similar_patients_indices, _ = find_similar_patients(patient_index, patient_similarity_matrix, top_n)
    aggregated_preferences = aggregate_preferences(similar_patients_indices, patient_df)
    
    # Here, we would need to compare aggregated preferences to doctors' features.
    # This example assumes doctors' features are directly comparable to aggregated patient preferences.
    doctor_features = doctor_df.drop(['age', 'cost_min', 'cost_max'], axis=1)  # Assume these columns are comparable
    similarities_to_doctors = cosine_similarity([aggregated_preferences], doctor_features)[0]
    recommended_doctor_indices = similarities_to_doctors.argsort()[-top_n:][::-1]
    
    return doctor_df.iloc[recommended_doctor_indices]

In [51]:
recommended_doctors = recommend_doctors_for_patient(0, patients_final_df, doctors_final_df, top_n=5)
print(recommended_doctors)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 50 while Y.shape[1] == 61

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Since we don't have explicit ratings or interactions between patients and doctors, we will use the features to compute similarity.

# Compute cosine similarity matrix for patients
patients_similarity = cosine_similarity(patients_final_df)

# Create a DataFrame for the similarity matrix for ease of use
patients_similarity_df = pd.DataFrame(patients_similarity, index=patients_df['patient_id'], columns=patients_df['patient_id'])

# Function to find top N similar patients for a given patient_id
def get_top_similar_patients(patient_id, top_n):
    # Get the similarity scores for the given patient_id
    similarity_scores = patients_similarity_df[patient_id]
    # Sort the scores in descending order and select top_n
    top_similar_patients = similarity_scores.sort_values(ascending=False)[1:top_n+1].index.tolist()
    return top_similar_patients

# Example: Get top 5 similar patients for patient with ID 'p1'
top_similar_patients_example = get_top_similar_patients('p1', 5)
top_similar_patients_example


['p31', 'p34', 'p48', 'p6', 'p12']

In [13]:
# Function to get similar patients for all patients
def get_similar_patients_for_all(top_n):
    similar_patients = {}
    for patient_id in patients_similarity_df.index:
        similar_patients[patient_id] = get_top_similar_patients(patient_id, top_n)
    return similar_patients

In [15]:
all_similar_patients = get_similar_patients_for_all(5)
all_similar_patients

{'p1': ['p31', 'p34', 'p48', 'p6', 'p12'],
 'p2': ['p84', 'p19', 'p94', 'p75', 'p59'],
 'p3': ['p32', 'p12', 'p48', 'p34', 'p1'],
 'p4': ['p43', 'p100', 'p49', 'p24', 'p80'],
 'p5': ['p10', 'p85', 'p83', 'p2', 'p41'],
 'p6': ['p55', 'p81', 'p21', 'p95', 'p31'],
 'p7': ['p75', 'p41', 'p53', 'p65', 'p59'],
 'p8': ['p18', 'p63', 'p25', 'p93', 'p20'],
 'p9': ['p88', 'p54', 'p99', 'p90', 'p16'],
 'p10': ['p75', 'p53', 'p41', 'p84', 'p59'],
 'p11': ['p27', 'p20', 'p28', 'p25', 'p18'],
 'p12': ['p32', 'p48', 'p3', 'p34', 'p1'],
 'p13': ['p26', 'p39', 'p66', 'p30', 'p78'],
 'p14': ['p78', 'p26', 'p39', 'p68', 'p82'],
 'p15': ['p51', 'p91', 'p76', 'p58', 'p45'],
 'p16': ['p90', 'p99', 'p29', 'p98', 'p54'],
 'p17': ['p94', 'p65', 'p59', 'p41', 'p84'],
 'p18': ['p25', 'p20', 'p93', 'p11', 'p8'],
 'p19': ['p2', 'p59', 'p84', 'p75', 'p94'],
 'p20': ['p25', 'p18', 'p11', 'p27', 'p93'],
 'p21': ['p81', 'p6', 'p55', 'p95', 'p71'],
 'p22': ['p67', 'p98', 'p29', 'p38', 'p100'],
 'p23': ['p52', 'p35', 'p

In [24]:
# Compute cosine similarity between every patient and every doctor using the final dataframes
similarity_matrix_final = cosine_similarity(patients_final_df, doctors_final_df)

# Create a DataFrame for the similarity matrix using the final dataframes
# Here, we'll use the indices as stand-in identifiers for patients and doctors
similarity_final_df = pd.DataFrame(similarity_matrix_final, index=patients_final_df.index, columns=doctors_final_df.index)

# Adjust the function to get top similar doctors using the final dataframe
def get_top_similar_doctors_final(similarity_df, top_n=3):
    top_similar_doctors_final = {}
    for patient_idx in similarity_df.index:
        # Get the top N similar doctors for this patient using index positions
        top_indexes = similarity_df.loc[patient_idx].nlargest(top_n).index
        # Collecting the top doctor indexes as identifiers
        top_similar_doctors_final[patient_idx] = top_indexes.tolist()
    return top_similar_doctors_final

In [25]:
# Identify missing columns in each dataframe and fill them with 0s
missing_columns_in_patients = set(doctors_final_df.columns) - set(patients_final_df.columns)
missing_columns_in_doctors = set(patients_final_df.columns) - set(doctors_final_df.columns)

# Add the missing columns to each dataframe, filled with 0
for column in missing_columns_in_patients:
    patients_final_df[column] = 0

for column in missing_columns_in_doctors:
    doctors_final_df[column] = 0

# Ensure the columns are in the same order for both dataframes
patients_final_df = patients_final_df.reindex(columns=doctors_final_df.columns)

# Recompute cosine similarity with aligned features
similarity_matrix_aligned = cosine_similarity(patients_final_df, doctors_final_df)

# Recreate the similarity DataFrame with aligned features
similarity_aligned_df = pd.DataFrame(similarity_matrix_aligned, index=patients_final_df.index, columns=doctors_final_df.index)

# Regenerate the top similar doctors per patient with aligned features
top_similar_doctors_per_patient_aligned = get_top_similar_doctors_final(similarity_aligned_df)

top_similar_doctors_per_patient_aligned


{0: [17, 3, 9],
 1: [17, 3, 9],
 2: [17, 3, 9],
 3: [17, 3, 9],
 4: [17, 3, 9],
 5: [17, 3, 9],
 6: [17, 3, 9],
 7: [17, 3, 9],
 8: [17, 3, 9],
 9: [17, 3, 9],
 10: [17, 3, 9],
 11: [17, 3, 9],
 12: [17, 3, 9],
 13: [17, 3, 9],
 14: [17, 3, 9],
 15: [17, 3, 9],
 16: [17, 3, 9],
 17: [17, 3, 9],
 18: [17, 3, 9],
 19: [17, 3, 9],
 20: [17, 3, 9],
 21: [17, 3, 9],
 22: [17, 3, 9],
 23: [17, 3, 9],
 24: [17, 3, 9],
 25: [17, 3, 9],
 26: [17, 3, 9],
 27: [17, 3, 9],
 28: [17, 3, 9],
 29: [17, 3, 9],
 30: [17, 3, 9],
 31: [17, 3, 9],
 32: [17, 3, 9],
 33: [17, 3, 9],
 34: [17, 3, 9],
 35: [17, 3, 9],
 36: [17, 3, 9],
 37: [17, 3, 9],
 38: [17, 3, 9],
 39: [17, 3, 9],
 40: [17, 3, 9],
 41: [17, 3, 9],
 42: [17, 3, 9],
 43: [17, 3, 9],
 44: [17, 3, 9],
 45: [17, 3, 9],
 46: [17, 3, 9],
 47: [17, 3, 9],
 48: [17, 3, 9],
 49: [17, 3, 9],
 50: [17, 3, 9],
 51: [17, 3, 9],
 52: [17, 3, 9],
 53: [17, 3, 9],
 54: [17, 3, 9],
 55: [17, 3, 9],
 56: [17, 3, 9],
 57: [17, 3, 9],
 58: [17, 3, 9],
 59: [1

In [26]:
# Identify missing columns in each dataframe and fill them with 0s
missing_columns_in_patients = set(doctors_final_df.columns) - set(patients_final_df.columns)
missing_columns_in_doctors = set(patients_final_df.columns) - set(doctors_final_df.columns)

# Add the missing columns to each dataframe, filled with 0
for column in missing_columns_in_patients:
    patients_final_df[column] = 0

for column in missing_columns_in_doctors:
    doctors_final_df[column] = 0

# Ensure the columns are in the same order for both dataframes
patients_final_df = patients_final_df.reindex(columns=doctors_final_df.columns)

# Recompute cosine similarity with aligned features
similarity_matrix_aligned = cosine_similarity(patients_final_df, doctors_final_df)

# Recreate the similarity DataFrame with aligned features
similarity_aligned_df = pd.DataFrame(similarity_matrix_aligned, index=patients_final_df.index, columns=doctors_final_df.index)

# Regenerate the top similar doctors per patient with aligned features
top_similar_doctors_per_patient_aligned = get_top_similar_doctors_final(similarity_aligned_df)

top_similar_doctors_per_patient_aligned

{0: [17, 3, 9],
 1: [17, 3, 9],
 2: [17, 3, 9],
 3: [17, 3, 9],
 4: [17, 3, 9],
 5: [17, 3, 9],
 6: [17, 3, 9],
 7: [17, 3, 9],
 8: [17, 3, 9],
 9: [17, 3, 9],
 10: [17, 3, 9],
 11: [17, 3, 9],
 12: [17, 3, 9],
 13: [17, 3, 9],
 14: [17, 3, 9],
 15: [17, 3, 9],
 16: [17, 3, 9],
 17: [17, 3, 9],
 18: [17, 3, 9],
 19: [17, 3, 9],
 20: [17, 3, 9],
 21: [17, 3, 9],
 22: [17, 3, 9],
 23: [17, 3, 9],
 24: [17, 3, 9],
 25: [17, 3, 9],
 26: [17, 3, 9],
 27: [17, 3, 9],
 28: [17, 3, 9],
 29: [17, 3, 9],
 30: [17, 3, 9],
 31: [17, 3, 9],
 32: [17, 3, 9],
 33: [17, 3, 9],
 34: [17, 3, 9],
 35: [17, 3, 9],
 36: [17, 3, 9],
 37: [17, 3, 9],
 38: [17, 3, 9],
 39: [17, 3, 9],
 40: [17, 3, 9],
 41: [17, 3, 9],
 42: [17, 3, 9],
 43: [17, 3, 9],
 44: [17, 3, 9],
 45: [17, 3, 9],
 46: [17, 3, 9],
 47: [17, 3, 9],
 48: [17, 3, 9],
 49: [17, 3, 9],
 50: [17, 3, 9],
 51: [17, 3, 9],
 52: [17, 3, 9],
 53: [17, 3, 9],
 54: [17, 3, 9],
 55: [17, 3, 9],
 56: [17, 3, 9],
 57: [17, 3, 9],
 58: [17, 3, 9],
 59: [1

In [29]:
# Reload the pre-processed dataframes
patients_final_df = pd.read_csv('./new_updated_patients.csv')
doctors_final_df = pd.read_csv('./new_updated_doctors.csv')

# Identify common features between the two dataframes
common_features = list(set(patients_final_df.columns) & set(doctors_final_df.columns))

# Filter both dataframes to only include common features for similarity computation
patients_final_df_filtered = patients_final_df[common_features]
doctors_final_df_filtered = doctors_final_df[common_features]

# Verify the filtered dataframes have the same dimensionality
assert patients_final_df_filtered.shape[1] == doctors_final_df_filtered.shape[1], "The number of features does not match."



In [30]:
# Define a function to compute cosine similarity and generate top N recommendations
def get_top_n_recommendations(patient_features, doctor_features, top_n=3):
    # Compute the cosine similarity
    similarity_matrix = cosine_similarity(patient_features, doctor_features)
    
    # Initialize a dictionary to hold the recommendations
    recommendations = {}
    
    # Iterate over each patient to find the top N similar doctors
    for idx, patient_similarities in enumerate(similarity_matrix):
        top_doctors_indices = patient_similarities.argsort()[-top_n:][::-1]  # Get indices of top N scores
        recommendations[idx] = top_doctors_indices.tolist()
    
    return recommendations

# Compute the top N recommendations using the common features
top_recommendations = get_top_n_recommendations(patients_final_df_filtered, doctors_final_df_filtered, top_n=3)

top_recommendations

{0: [6, 9, 17],
 1: [10, 1, 14],
 2: [18, 3, 7],
 3: [9, 6, 17],
 4: [18, 17, 10],
 5: [14, 9, 10],
 6: [5, 15, 9],
 7: [7, 8, 3],
 8: [9, 6, 17],
 9: [17, 11, 6],
 10: [9, 17, 12],
 11: [13, 1, 2],
 12: [8, 3, 10],
 13: [4, 3, 12],
 14: [4, 3, 12],
 15: [18, 3, 17],
 16: [12, 11, 2],
 17: [9, 6, 17],
 18: [1, 10, 3],
 19: [3, 18, 10],
 20: [2, 4, 12],
 21: [4, 3, 12],
 22: [17, 6, 9],
 23: [18, 3, 7],
 24: [12, 11, 9],
 25: [7, 3, 10],
 26: [7, 3, 10],
 27: [18, 7, 3],
 28: [12, 11, 2],
 29: [4, 3, 12],
 30: [7, 3, 10],
 31: [6, 17, 10],
 32: [7, 3, 10],
 33: [14, 13, 10],
 34: [13, 14, 9],
 35: [4, 3, 12],
 36: [16, 12, 11],
 37: [3, 10, 7],
 38: [9, 12, 11],
 39: [5, 9, 6],
 40: [18, 4, 3],
 41: [4, 2, 12],
 42: [9, 6, 17],
 43: [9, 6, 17],
 44: [7, 3, 8],
 45: [4, 16, 19],
 46: [8, 7, 3],
 47: [2, 10, 12],
 48: [7, 10, 14],
 49: [2, 4, 12],
 50: [12, 11, 3],
 51: [17, 9, 6],
 52: [11, 12, 6],
 53: [17, 18, 9],
 54: [14, 9, 2],
 55: [9, 6, 17],
 56: [9, 2, 12],
 57: [9, 17, 12],
 58

In [44]:
# Assuming all_similar_patients uses 'pX' format and needs to be converted to match the numeric indices used in top_recommendations
def convert_patient_ids(similar_patients_dict):
    return {int(k[1:])-1: [int(p[1:])-1 for p in v] for k, v in similar_patients_dict.items()}

numeric_similar_patients = convert_patient_ids(all_similar_patients)
numeric_similar_patients

{0: [1, 2], 1: [0, 2], 2: [0, 1]}

In [42]:
def aggregate_recommendations(similar_patients, top_recommendations):
    aggregated_recommendations = {}
    for patient_idx, similar_idxs in similar_patients.items():
        all_recs = []
        for sim_idx in similar_idxs:
            all_recs += top_recommendations.get(sim_idx, [])
        # Select unique recommendations, possibly rank them by frequency
        aggregated_recommendations[patient_idx] = list(set(all_recs))
    return aggregated_recommendations

aggregated_recommendations = aggregate_recommendations(numeric_similar_patients, top_recommendations)

In [43]:
aggregated_recommendations

{0: [], 1: [], 2: []}

In [45]:
# Assuming this function has already been defined and executed as per your setup
def aggregate_recommendations(similar_patients, top_recommendations):
    aggregated_recommendations = {}
    for patient_idx, similar_idxs in similar_patients.items():
        all_recs = []
        for sim_idx in similar_idxs:
            all_recs += top_recommendations.get(sim_idx, [])
        # Deduplicate recommendations by converting to a set and back to a list
        aggregated_recommendations[patient_idx] = list(set(all_recs))
    return aggregated_recommendations

# Execute aggregation based on the numeric similarity mapping
aggregated_recommendations = aggregate_recommendations(numeric_similar_patients, top_recommendations)

# This should now reflect the correctly aggregated recommendations for each patient
print(aggregated_recommendations)

{0: [], 1: [], 2: []}


In [46]:
# Make sure this conversion aligns with how patients are indexed in top_recommendations
def convert_patient_ids(similar_patients_dict):
    # Adjust this logic if necessary to align with your indexing
    return {int(k[1:])-1: [int(p[1:])-1 for p in v] for k, v in similar_patients_dict.items()}

# Assuming all_similar_patients is correctly populated
numeric_similar_patients = convert_patient_ids(all_similar_patients)
numeric_similar_patients

{0: [1, 2], 1: [0, 2], 2: [0, 1]}

In [32]:
def aggregate_recommendations(similar_patients, top_recommendations):
    aggregated_recommendations = {}
    for patient, similar in similar_patients.items():
        # Aggregate recommendations for each similar patient group
        all_recommendations = []
        for sim_patient in similar:
            all_recommendations.extend(top_recommendations.get(sim_patient, []))
        # Select the most frequent recommendations
        most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
        aggregated_recommendations[patient] = most_common
    return aggregated_recommendations

# Assuming `all_similar_patients` holds the similar patient info and `top_recommendations` holds doctor recommendations
aggregated_recommendations = aggregate_recommendations(all_similar_patients, top_recommendations)
aggregated_recommendations


  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(all_recommendations).value_counts().index[:3].tolist()
  most_common = pd.Series(al

{'p1': [],
 'p2': [],
 'p3': [],
 'p4': [],
 'p5': [],
 'p6': [],
 'p7': [],
 'p8': [],
 'p9': [],
 'p10': [],
 'p11': [],
 'p12': [],
 'p13': [],
 'p14': [],
 'p15': [],
 'p16': [],
 'p17': [],
 'p18': [],
 'p19': [],
 'p20': [],
 'p21': [],
 'p22': [],
 'p23': [],
 'p24': [],
 'p25': [],
 'p26': [],
 'p27': [],
 'p28': [],
 'p29': [],
 'p30': [],
 'p31': [],
 'p32': [],
 'p33': [],
 'p34': [],
 'p35': [],
 'p36': [],
 'p37': [],
 'p38': [],
 'p39': [],
 'p40': [],
 'p41': [],
 'p42': [],
 'p43': [],
 'p44': [],
 'p45': [],
 'p46': [],
 'p47': [],
 'p48': [],
 'p49': [],
 'p50': [],
 'p51': [],
 'p52': [],
 'p53': [],
 'p54': [],
 'p55': [],
 'p56': [],
 'p57': [],
 'p58': [],
 'p59': [],
 'p60': [],
 'p61': [],
 'p62': [],
 'p63': [],
 'p64': [],
 'p65': [],
 'p66': [],
 'p67': [],
 'p68': [],
 'p69': [],
 'p70': [],
 'p71': [],
 'p72': [],
 'p73': [],
 'p74': [],
 'p75': [],
 'p76': [],
 'p77': [],
 'p78': [],
 'p79': [],
 'p80': [],
 'p81': [],
 'p82': [],
 'p83': [],
 'p84': [],
 

In [37]:
numeric_similar_patients = {int(pid[1:]) - 1: [int(sim[1:]) - 1 for sim in sims] for pid, sims in all_similar_patients.items()}


In [38]:
def aggregate_recommendations(similar_patients, top_recommendations):
    aggregated_recommendations = {}
    for patient_idx, similar_idxs in similar_patients.items():
        all_recs = []
        for sim_idx in similar_idxs:
            all_recs.extend(top_recommendations.get(sim_idx, []))
        # Here, you might want to aggregate differently, e.g., taking the most common recommendations.
        # For simplicity, we're just collecting all recommendations and then taking a set to remove duplicates.
        aggregated_recommendations[patient_idx] = list(set(all_recs))
    return aggregated_recommendations

aggregated_recommendations = aggregate_recommendations(numeric_similar_patients, top_recommendations)

aggregated_recommendations

{0: [], 1: [], 2: []}

In [39]:
# Correcting the assumption that 'p1' should map to index 0, 'p2' to 1, and so on.
numeric_similar_patients_corrected = {int(pid[1:])-1: [int(sim[1:])-1 for sim in sims] for pid, sims in all_similar_patients.items()}


In [40]:
# Simplified test case with known-good values
numeric_similar_patients_test = {0: [1, 2], 1: [0, 2], 2: [0, 1]}
top_recommendations_test = {0: [6, 9, 17], 1: [10, 1, 14], 2: [18, 3, 7]}

# Test the aggregation with this simplified data
aggregated_recommendations_test = aggregate_recommendations(numeric_similar_patients_test, top_recommendations_test)
print(aggregated_recommendations_test)


{0: [1, 3, 7, 10, 14, 18], 1: [3, 6, 7, 9, 17, 18], 2: [1, 6, 9, 10, 14, 17]}


In [33]:
def aggregate_recommendations(similar_patients, top_recommendations):
    aggregated_recommendations = {}
    for patient_id, similar_ids in similar_patients.items():
        # Initialize a list to collect all recommendations for similar patients
        all_recs = []
        for sim_id in similar_ids:
            # Ensure that similar patient IDs are in the format expected by top_recommendations
            # Add the recommendations for each similar patient to the list
            all_recs.extend(top_recommendations.get(sim_id, []))
        # Deduplicate and select the top N recommendations
        top_recs = list(set(all_recs))[:3]  # Adjust as needed for your logic
        aggregated_recommendations[patient_id] = top_recs
    return aggregated_recommendations

# Example usage
aggregated_recommendations = aggregate_recommendations(all_similar_patients, top_recommendations)
aggregated_recommendations

{'p1': [],
 'p2': [],
 'p3': [],
 'p4': [],
 'p5': [],
 'p6': [],
 'p7': [],
 'p8': [],
 'p9': [],
 'p10': [],
 'p11': [],
 'p12': [],
 'p13': [],
 'p14': [],
 'p15': [],
 'p16': [],
 'p17': [],
 'p18': [],
 'p19': [],
 'p20': [],
 'p21': [],
 'p22': [],
 'p23': [],
 'p24': [],
 'p25': [],
 'p26': [],
 'p27': [],
 'p28': [],
 'p29': [],
 'p30': [],
 'p31': [],
 'p32': [],
 'p33': [],
 'p34': [],
 'p35': [],
 'p36': [],
 'p37': [],
 'p38': [],
 'p39': [],
 'p40': [],
 'p41': [],
 'p42': [],
 'p43': [],
 'p44': [],
 'p45': [],
 'p46': [],
 'p47': [],
 'p48': [],
 'p49': [],
 'p50': [],
 'p51': [],
 'p52': [],
 'p53': [],
 'p54': [],
 'p55': [],
 'p56': [],
 'p57': [],
 'p58': [],
 'p59': [],
 'p60': [],
 'p61': [],
 'p62': [],
 'p63': [],
 'p64': [],
 'p65': [],
 'p66': [],
 'p67': [],
 'p68': [],
 'p69': [],
 'p70': [],
 'p71': [],
 'p72': [],
 'p73': [],
 'p74': [],
 'p75': [],
 'p76': [],
 'p77': [],
 'p78': [],
 'p79': [],
 'p80': [],
 'p81': [],
 'p82': [],
 'p83': [],
 'p84': [],
 

In [35]:
all_similar_patients

{'p1': ['p31', 'p34', 'p48', 'p6', 'p12'],
 'p2': ['p84', 'p19', 'p94', 'p75', 'p59'],
 'p3': ['p32', 'p12', 'p48', 'p34', 'p1'],
 'p4': ['p43', 'p100', 'p49', 'p24', 'p80'],
 'p5': ['p10', 'p85', 'p83', 'p2', 'p41'],
 'p6': ['p55', 'p81', 'p21', 'p95', 'p31'],
 'p7': ['p75', 'p41', 'p53', 'p65', 'p59'],
 'p8': ['p18', 'p63', 'p25', 'p93', 'p20'],
 'p9': ['p88', 'p54', 'p99', 'p90', 'p16'],
 'p10': ['p75', 'p53', 'p41', 'p84', 'p59'],
 'p11': ['p27', 'p20', 'p28', 'p25', 'p18'],
 'p12': ['p32', 'p48', 'p3', 'p34', 'p1'],
 'p13': ['p26', 'p39', 'p66', 'p30', 'p78'],
 'p14': ['p78', 'p26', 'p39', 'p68', 'p82'],
 'p15': ['p51', 'p91', 'p76', 'p58', 'p45'],
 'p16': ['p90', 'p99', 'p29', 'p98', 'p54'],
 'p17': ['p94', 'p65', 'p59', 'p41', 'p84'],
 'p18': ['p25', 'p20', 'p93', 'p11', 'p8'],
 'p19': ['p2', 'p59', 'p84', 'p75', 'p94'],
 'p20': ['p25', 'p18', 'p11', 'p27', 'p93'],
 'p21': ['p81', 'p6', 'p55', 'p95', 'p71'],
 'p22': ['p67', 'p98', 'p29', 'p38', 'p100'],
 'p23': ['p52', 'p35', 'p

In [36]:
# Mock data for testing
all_similar_patients = {
    'p1': ['p2', 'p3'],
    'p2': ['p1', 'p3'],
    'p3': ['p1', 'p2'],
}

top_recommendations = {
    'p1': [101, 102, 103],
    'p2': [102, 103, 104],
    'p3': [101, 104, 105],
}

# Use the aggregate_recommendations function with this mock data
aggregated_recommendations_test = aggregate_recommendations(all_similar_patients, top_recommendations)
print(aggregated_recommendations_test)


{'p1': [101, 102, 103], 'p2': [101, 102, 103], 'p3': [104, 101, 102]}
