In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans


patients_df = pd.read_csv('../data/patients_data.csv')
doctors_df = pd.read_csv('../data/diverse_doctors_data.csv')


In [18]:
doctors_df['cost_min'] = doctors_df['cost_min'].replace('[\$,]', '', regex=True).astype(float)
doctors_df['cost_max'] = doctors_df['cost_max'].replace('[\$,]', '', regex=True).astype(float)
patients_df['budget_max'] = patients_df['budget_max'].replace('[\$,]', '', regex=True).astype(float)


In [19]:
# Example: Fill missing numeric values with the median
patients_df['budget_max'].fillna(patients_df['budget_max'].median(), inplace=True)

# Example: Fill missing categorical values with a placeholder
patients_df['language_preference'].fillna('Unknown', inplace=True)


In [20]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Label Encoding for binary or ordinal categorical variables
patients_df['consultation_location'] = le.fit_transform(patients_df['consultation_location'].astype(str))

# One-Hot Encoding for nominal categorical variables
patients_df = pd.get_dummies(patients_df, columns=['Insurance_plan'], prefix='Insurance')


In [21]:
patients_df['date_of_birth'] = pd.to_datetime(patients_df['date_of_birth'])
doctors_df['date_of_birth'] = pd.to_datetime(doctors_df['date_of_birth'])

# Example: Calculate age (assuming the dataset is current as of 2023)
patients_df['age'] = 2023 - patients_df['date_of_birth'].dt.year


In [22]:
print(patients_df.info())
print(doctors_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   patient_id                        100 non-null    object        
 1   first_name                        100 non-null    object        
 2   last_name                         100 non-null    object        
 3   date_of_birth                     100 non-null    datetime64[ns]
 4   location                          100 non-null    object        
 5   language_preference               100 non-null    object        
 6   sex                               100 non-null    object        
 7   ethnicity                         91 non-null     object        
 8   doctor's_gender_preference        100 non-null    object        
 9   location_radious_prefernce        100 non-null    object        
 10  consultation_location             100 non-null    i

In [23]:
patients = patients_df  # Use the variable from previous steps
doctors = doctors_df    # Use the variable from previous steps

In [24]:
le = LabelEncoder()
for column in patients.columns:
    if patients[column].dtype == 'object':
        patients[column] = le.fit_transform(patients[column].astype(str))
for column in doctors.columns:
    if doctors[column].dtype == 'object':
        doctors[column] = le.fit_transform(doctors[column].astype(str))

In [25]:
scaler = StandardScaler()

# Select numerical features for scaling
patient_numerical_features = patients.select_dtypes(include=['float64', 'int64'])
doctor_numerical_features = doctors.select_dtypes(include=['float64', 'int64'])

# Fit on patient data and transform both patient and doctor data
patients_scaled = scaler.fit_transform(patient_numerical_features)
doctors_scaled = scaler.transform(doctor_numerical_features)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- cost_max
- cost_min
- review
Feature names seen at fit time, yet now missing:
- age
- budget_max


In [8]:
patient_similarity = cosine_similarity(patients_scaled)


In [9]:
# Choose an appropriate number of clusters
num_clusters = 5  # Example value; adjust based on your analysis
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(patient_similarity)

# Assign cluster labels to patients
patients['cluster'] = kmeans.labels_


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- cost_max
- cost_min
- review
Feature names seen at fit time, yet now missing:
- age
- budget_max


In [None]:
def recommend_doctors_for_cluster(cluster_id):
    # Find the most common location within the patient cluster
    common_location = patients[patients['cluster'] == cluster_id]['location'].mode().iloc[0]
    
    # Filter doctors based on this location
    recommended_doctors = doctors[doctors['location'] == common_location]
    
    # Further processing could include matching other features and ranking doctors
    return recommended_doctors

# Example: Get recommendations for the first cluster
recommendations_for_first_cluster = recommend_doctors_for_cluster(0)
print(recommendations_for_first_cluster.head())
