In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
patients_df = pd.read_csv('../data/patients_data (1).csv')
doctors_df = pd.read_csv('../data/doctors-data.csv')

In [15]:
def preprocess_align_features(patients_df, doctors_df):
    # Combine languages for doctors and handle NaNs
    doctors_df['language_combined'] = doctors_df['language_1'].fillna('') + ',' + doctors_df['language_2'].fillna('')
    
    # Initialize OneHotEncoder
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
    
    # Encode 'sex' and 'location' using LabelEncoder for simplicity
    for column in ['sex', 'location']:
        le = LabelEncoder()
        combined = pd.concat([patients_df[column].fillna('Unknown'), doctors_df[column].fillna('Unknown')])
        le.fit(combined)
        patients_df[column + '_encoded'] = le.transform(patients_df[column].fillna('Unknown'))
        doctors_df[column + '_encoded'] = le.transform(doctors_df[column].fillna('Unknown'))
    
    # Convert 'budget_max' and 'cost_max' to numeric, ensuring columns exist and are not null before attempting string operations
    if 'budget_max' in patients_df.columns and patients_df['budget_max'].dtype == object:
        patients_df['budget_max'] = pd.to_numeric(patients_df['budget_max'].str.replace('[\$,]', '', regex=True), errors='coerce').fillna(0)
    
    if 'cost_max' in doctors_df.columns and doctors_df['cost_max'].dtype == object:
        doctors_df['cost_max'] = pd.to_numeric(doctors_df['cost_max'].str.replace('[\$,]', '', regex=True), errors='coerce').fillna(0)
    
    # Handle insurance by creating a simplified matching column for each patient's insurance in the doctors DataFrame
    unique_insurances = patients_df['Insurance_plan'].dropna().unique()
    for insurance in unique_insurances:
        insurance_column = 'accepts_' + insurance.replace(' ', '_').replace('/', '_')
        doctors_df[insurance_column] = doctors_df.apply(lambda x: 1 if insurance in x.values else 0, axis=1)
    
    return patients_df, doctors_df

patients_df_aligned, doctors_df_aligned = preprocess_align_features(patients_df, doctors_df)


In [16]:
patients_df_aligned


Unnamed: 0,patient_id,first_name,last_name,date_of_birth,location,language_preference,Insurance_plan,sex,ethnicity,doctor's_gender_preference,location_radious_prefernce,consultation_location,problem,budget_max,communication_preference,sex_encoded,location_encoded
0,p1,Mark,Kirby,2/3/2003,Brookline,,Humana,F,Hispanic,No Preference,30 miles,virtual,,300.0,less conversational,0,3
1,p2,Lauren,Smith,8/19/1941,Back Bay,,MassHealth,M,,F,30 miles,No Preference,fever,,more conversational,1,0
2,p3,Eric,Reynolds,12/1/2004,Cambridge,Mandarin,Humana,M,Hispanic,No Preference,50 miles,No Preference,headache,500.0,less conversational,1,4
3,p4,Mark,Payne,5/28/1951,Roxbury,Russian,Blue Cross Blue Shield,F,Latino,F,30 miles,virtual,cough,200.0,,0,8
4,p5,Bobby,Martinez,10/18/2005,Charlestown,Russian,MassHealth,M,White,M,No Preference,In-person,fatigue,,more conversational,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,p96,Elizabeth,Martinez,6/25/1989,Brighton,Mandarin,Cigna,M,White,M,No Preference,In-person,back pain,100.0,less conversational,1,2
96,p97,Daniel,Flores,5/10/1989,South End,Arabic,MassHealth,Other,Arab,F,50 miles,No Preference,,,more conversational,2,10
97,p98,Katherine,Phillips,12/25/1976,Charlestown,French,Humana,Other,Arab,M,50 miles,In-person,back pain,100.0,,2,5
98,p99,Brandi,Herrera,5/28/1959,Roxbury,Mandarin,Blue Cross Blue Shield,F,Hispanic,M,No Preference,In-person,skin rash,100.0,more conversational,0,8


In [17]:
doctors_df_aligned 

Unnamed: 0,doctor_id,first_name,last_name,date_of_birth,location,language_1,language_2,sex,ethnicity,availability,...,language,language_combined,sex_encoded,location_encoded,accepts_Humana,accepts_MassHealth,accepts_Blue_Cross_Blue_Shield,accepts_Aetna,accepts_Cigna,accepts_UnitedHealthcare
0,d1,Crystal,Anthony,9/4/1983,Dorchester,English,Russian,F,Middle Eastern,evening,...,"English, Russian","English,Russian",0,6,0,0,0,0,0,0
1,d2,Johnny,Gray,1/7/1975,South End,English,Spanish,M,,evening,...,"English, Spanish","English,Spanish",1,10,0,0,0,0,0,0
2,d3,Mark,Short,1/15/1987,West End,English,French,Other,Asian,afternoon,...,"English, French","English,French",2,11,0,0,0,0,0,0
3,d4,Roy,Burgess,10/17/1966,Dorchester,English,French,M,Hispanic,evening,...,"English, French","English,French",1,6,0,0,0,0,0,0
4,d5,Richard,Watson,2/25/1976,South End,English,French,Other,Hispanic,afternoon,...,"English, French","English,French",2,10,0,0,0,0,0,0
5,d6,Jacob,Smith,1/13/1973,Dorchester,English,Spanish,F,Asian,morning,...,"English, Spanish","English,Spanish",0,6,0,0,0,0,0,0
6,d7,Amy,Miller,3/31/1973,Fenway,English,French,F,White,evening,...,"English, French","English,French",0,7,0,0,0,0,0,0
7,d8,Christopher,Woodard,10/22/1986,Fenway,English,Mandarin,M,Black,afternoon,...,"English, Mandarin","English,Mandarin",1,7,0,0,0,0,0,0
8,d9,Emily,Williamson,9/11/1990,Roxbury,English,Portuguese,M,Black,morning,...,"English, Portuguese","English,Portuguese",1,8,0,0,0,0,0,0
9,d10,Jessica,Knox,5/26/1969,Fenway,English,Portuguese,F,Middle Eastern,afternoon,...,"English, Portuguese","English,Portuguese",0,7,0,0,0,0,0,0


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [30]:
scaler = MinMaxScaler()

In [31]:
patients_df['budget_max_scaled'] = scaler.fit_transform(patients_df[['budget_max']].fillna(0).to_numpy().reshape(-1, 1))

In [32]:
patient_features = np.hstack([
    patients_df[['sex_encoded', 'location_encoded']].to_numpy(),
    patients_df[['budget_max_scaled']].to_numpy()  # Now this column exists
])

In [33]:
patient_similarity = cosine_similarity(patient_features)

In [34]:
def recommend_doctors(patient_index, top_n=5):
    # Find top N similar patients
    similarity_scores = patient_similarity[patient_index]
    top_patient_indices = np.argsort(similarity_scores)[-top_n-1:-1][::-1]  # Exclude self

    # Aggregate preferences (e.g., average budget) from similar patients
    avg_budget = np.mean(patients_df.loc[top_patient_indices, 'budget_max'])
    
    # Filter doctors based on aggregated preferences and compute similarity
    suitable_doctors = doctors_df[doctors_df['cost_max'] <= avg_budget]
    
    # Create feature vectors for suitable doctors
    doctor_features = suitable_doctors[['sex_encoded', 'location_encoded', 'cost_max_scaled']].to_numpy()
    
    # Compute cosine similarity between aggregated patient preferences and suitable doctors
    doctor_similarity = cosine_similarity([patient_features[patient_index]], doctor_features)
    
    # Recommend top N doctors based on similarity scores
    top_doctor_indices = np.argsort(doctor_similarity[0])[-top_n:][::-1]
    return suitable_doctors.iloc[top_doctor_indices]

In [35]:
patient_index = 0  # Assuming you want recommendations for the first patient
recommended_doctors = recommend_doctors(patient_index, top_n=3)
print(recommended_doctors)

KeyError: "['cost_max_scaled'] not in index"

In [36]:
# Scale the 'cost_max' column in doctors_df
doctors_df['cost_max_scaled'] = scaler.fit_transform(doctors_df[['cost_max']].fillna(0).to_numpy().reshape(-1, 1))


In [37]:
def recommend_doctors(patient_index, top_n=5):
    # Assuming the similarity computation and aggregation of similar patients' preferences have been done correctly
    
    # For simplicity, let's consider an average budget based on similar patients (implementation details depend on your similarity computation)
    avg_budget = patients_df_aligned.loc[patient_index, 'budget_max_scaled']
    
    # Filter doctors based on the average budget (scaled) of similar patients
    suitable_doctors = doctors_df_aligned[doctors_df_aligned['cost_max_scaled'] <= avg_budget]
    
    # Assuming we have already computed doctor_features correctly with scaled cost_max
    # Compute similarity between the patient preferences and doctors
    doctor_similarity = cosine_similarity(patient_features[patient_index].reshape(1, -1), suitable_doctors[['sex_encoded', 'location_encoded', 'cost_max_scaled']].to_numpy())
    
    # Find top N indices of doctors based on similarity scores
    top_doctor_indices = np.argsort(-doctor_similarity[0])[:top_n]
    recommended_doctors = suitable_doctors.iloc[top_doctor_indices]
    
    return recommended_doctors

# Now, you can call this function to get recommendations for a given patient index
patient_index = 0  # Assuming you want recommendations for the first patient
recommended_doctors = recommend_doctors(patient_index, top_n=3)
print(recommended_doctors[['first_name', 'last_name', 'speciality', 'location', 'cost_max']])


   first_name last_name   speciality     location  cost_max
15      Brian      Kirk   cardiology     Brighton     300.0
9     Jessica      Knox    neurology       Fenway     200.0
17     Nicole   Johnson  dermatology  Charlestown     200.0


In [25]:
# If 'budget_max' or 'cost_max' still contain NaNs, fill them with 0 or mean value
patients_df['budget_max'].fillna(0, inplace=True)
doctors_df['cost_max'].fillna(doctors_df['cost_max'].mean(), inplace=True)

In [26]:
patient_features = patients_df_aligned[['sex_encoded', 'location_encoded', 'budget_max']].to_numpy()
doctor_features = doctors_df_aligned[['sex_encoded', 'location_encoded', 'cost_max']].to_numpy()


In [27]:
scaler = MinMaxScaler()
patient_features[:, -1] = scaler.fit_transform(patient_features[:, -1].reshape(-1, 1)).flatten()  # Budget
doctor_features[:, -1] = scaler.transform(doctor_features[:, -1].reshape(-1, 1)).flatten()  # Cost


In [29]:
patient_features = np.hstack([
    patients_df[['sex_encoded', 'location_encoded']].to_numpy(),
    patients_df[['budget_max_scaled']].to_numpy()
])

patient_similarity = cosine_similarity(patient_features)

KeyError: "None of [Index(['budget_max_scaled'], dtype='object')] are in the [columns]"

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Align and preprocess features
def align_features(patients_df, doctors_df):

    # Correctly handle 'NaN' values for language concatenation
    doctors_df['language'] = doctors_df.apply(lambda x: (str(x['language_1']) if not pd.isna(x['language_1']) else '') + ', ' + (str(x['language_2']) if not pd.isna(x['language_2']) else ''), axis=1)
    
    # One-hot encode 'language_preference' for patients and 'language' for doctors
    ohe_patients = OneHotEncoder(handle_unknown='ignore')
    ohe_doctors = OneHotEncoder(handle_unknown='ignore')
    
    patients_language_encoded = ohe_patients.fit_transform(patients_df[['language_preference']].fillna('Unknown')).toarray()
    doctors_language_encoded = ohe_doctors.fit_transform(doctors_df[['language']]).toarray()
    
    # Ensure insurance columns align
    insurance_columns = [col for col in doctors_df.columns if col.startswith('Insurance_')]
    patients_df['insurance_encoded'] = patients_df['Insurance_plan'].apply(lambda x: ','.join([col for col in insurance_columns if doctors_df[col].any() and col.endswith(x)]))
    
    # Encode other comparable features
    encoder_sex = LabelEncoder().fit(pd.concat([patients_df['sex'], doctors_df['sex']], axis=0))
    patients_df['sex_encoded'] = encoder_sex.transform(patients_df['sex'])
    doctors_df['sex_encoded'] = encoder_sex.transform(doctors_df['sex'])
    
    encoder_location = LabelEncoder().fit(pd.concat([patients_df['location'], doctors_df['location']], axis=0))
    patients_df['location_encoded'] = encoder_location.transform(patients_df['location'])
    doctors_df['location_encoded'] = encoder_location.transform(doctors_df['location'])
    
    encoder_ethnicity = LabelEncoder().fit(pd.concat([patients_df['ethnicity'].fillna('Unknown'), doctors_df['ethnicity'].fillna('Unknown')], axis=0))
    patients_df['ethnicity_encoded'] = encoder_ethnicity.transform(patients_df['ethnicity'].fillna('Unknown'))
    doctors_df['ethnicity_encoded'] = encoder_ethnicity.transform(doctors_df['ethnicity'].fillna('Unknown'))
    
    # Convert budget_max and cost_max to numeric values
    patients_df['budget_max'] = pd.to_numeric(patients_df['budget_max'].str.replace('[\$,]', '', regex=True), errors='coerce')
    doctors_df['cost_max'] = pd.to_numeric(doctors_df['cost_max'].str.replace('[\$,]', '', regex=True), errors='coerce')
    
    return patients_df, doctors_df, patients_language_encoded, doctors_language_encoded, ohe_patients, ohe_doctors

patients_df_aligned, doctors_df_aligned, patients_language_encoded, doctors_language_encoded, ohe_patients, ohe_doctors = align_features(patients_df, doctors_df)

# Function to compute similarities and recommend doctors will be adjusted to use these aligned and encoded features.


TypeError: endswith first arg must be str or a tuple of str, not float

In [4]:
doctors_df['hospital_affiliation'] = doctors_df[['hospital_affiliation1', 'hospital_affiliation2', 'hospital_affiliation3']].apply(lambda x: x.dropna().iloc[0] if x.dropna().any() else None, axis=1)

In [5]:
# Function for preprocessing and encoding features
def preprocess_and_encode(df, categorical_features):
    le_dict = {}
    for feature in categorical_features:
        le = LabelEncoder()
        df[feature] = le.fit_transform(df[feature].fillna('Unknown'))
        le_dict[feature] = le
    return df, le_dict

In [7]:
# Define categorical features for encoding
patient_categorical_features = ['language_preference', 'sex', 'ethnicity', 'insurance', 'location', 'problem', 'communication_preference']
doctor_categorical_features = ['language_2', 'sex', 'ethnicity', 'speciality', 'hospital_affiliation', 'location', 'insurance']

patients_df, patient_le_dict = preprocess_and_encode(patients_df, patient_categorical_features)
doctors_df, doctor_le_dict = preprocess_and_encode(doctors_df, doctor_categorical_features)


KeyError: 'insurance'

In [8]:

# Preprocessing: Align features and encode categorical variables
def preprocess_data(patients_df, doctors_df):
    common_features = ['language', 'sex', 'ethnicity', 'location', 'insurance']
    
    # Handle Insurance as a special case since it's directly comparable but needs encoding
    for feature in common_features:
        if feature != 'insurance':  # Insurance will be binary encoded later
            le = LabelEncoder()
            combined_data = pd.concat([patients_df[feature], doctors_df[feature]], ignore_index=True, axis=0).fillna('Unknown')
            le.fit(combined_data)
            patients_df[f'{feature}_encoded'] = le.transform(patients_df[feature].fillna('Unknown'))
            doctors_df[f'{feature}_encoded'] = le.transform(doctors_df[feature].fillna('Unknown'))
    
    # Binary encode 'insurance' to indicate if a doctor accepts a patient's insurance
    patients_insurance = patients_df['insurance'].unique()
    for insurance in patients_insurance:
        doctors_df[f'accepts_{insurance}'] = doctors_df['insurance'].apply(lambda x: insurance in x)
    
    # Simplify and encode other features
    patients_df['budget_max'] = pd.to_numeric(patients_df['budget_max'].str.replace('[\$,]', '', regex=True))
    doctors_df['cost_max'] = pd.to_numeric(doctors_df['cost_max'].str.replace('[\$,]', '', regex=True))

    return patients_df, doctors_df

patients_df, doctors_df = preprocess_data(patients_df, doctors_df)

KeyError: 'language'