In [1]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.2
    Uninstalling scikit-learn-1.4.2:
      Successfully uninstalled scikit-learn-1.4.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sktime 0.26.0 requires scikit-learn<1.5.0,>=0.24, but you have scikit-learn 1.6.1 which is incompatible.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.6.1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.6.1


In [2]:
!pip install pythainlp



In [3]:
!pip install pycaret

Collecting scikit-learn>1.4.0 (from pycaret)
  Using cached scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Using cached scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.4.2


In [4]:
import pandas as pd
import numpy as np
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus.common import thai_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pycaret.classification import *

In [5]:
Symptoms = pd.read_csv('/content/drive/MyDrive/[CONFIDENTIAL] AI symptom picker data (Agnos candidate assignment) - ai_symptom_picker.csv')

In [6]:
Symptoms.head()

Unnamed: 0,gender,age,summary,search_term
0,male,28,"{""diseases"": [], ""procedures"": [], ""no_symptom...","มีเสมหะ, ไอ"
1,male,27,"{""diseases"": [], ""procedures"": [], ""no_symptom...","ไอ, น้ำมูกไหล"
2,female,26,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ปวดท้อง
3,male,42,"{""diseases"": [], ""procedures"": [], ""no_symptom...",น้ำมูกไหล
4,female,40,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ตาแห้ง


# **Extract Text**

In [7]:
def extract_answers(summary):
    """
    Extracts the 'answers' values from the 'yes_symptoms' list in a JSON-like string.
    """
    try:
        data = eval(summary)  # Evaluate the string as a Python dictionary
        if isinstance(data, dict) and 'yes_symptoms' in data:
            answers = []
            for symptom in data['yes_symptoms']:
                if 'answers' in symptom:
                    answers.extend(symptom['answers'])
            return ', '.join(answers)  # Join the answers with commas
        else:
            return None  # Return None if the structure is not as expected
    except (SyntaxError, TypeError, KeyError):
        return None  # Return None if there's an error parsing or accessing the data

In [8]:
def process_dataframe(df, summary_col='summary', symptoms_note_col='symptoms_note'):
    """
    Processes a DataFrame to extract 'answers' and create a new 'symptoms_note' column.
    """
    df[symptoms_note_col] = df[summary_col].apply(extract_answers)
    return df

In [9]:
# Process the DataFrame
Symptoms = process_dataframe(Symptoms)

In [10]:
Symptoms.head()

Unnamed: 0,gender,age,summary,search_term,symptoms_note
0,male,28,"{""diseases"": [], ""procedures"": [], ""no_symptom...","มีเสมหะ, ไอ","ลักษณะ เสมหะเปลี่ยนสีเหลือง/เขียว, ระยะเวลา ไม..."
1,male,27,"{""diseases"": [], ""procedures"": [], ""no_symptom...","ไอ, น้ำมูกไหล","ระยะเวลา 1-3 สัปดาห์, ลักษณะ ไอไม่มีเสมหะ ไอแห..."
2,female,26,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ปวดท้อง,"บริเวณ รอบๆสะดือ, ระยะเวลา ตั้งแต่ 1 วัน ถึง 1..."
3,male,42,"{""diseases"": [], ""procedures"": [], ""no_symptom...",น้ำมูกไหล,"ระยะเวลา น้อยกว่า 10 วัน, ประวัติ ATK ยังไม่ได..."
4,female,40,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ตาแห้ง,การรักษาก่อนหน้า ไม่เคย


In [11]:
Symptoms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   gender         1000 non-null   object
 1   age            1000 non-null   int64 
 2   summary        1000 non-null   object
 3   search_term    1000 non-null   object
 4   symptoms_note  1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


# **Hybrid Recommendation** consists of Content based Filtering Recommender System and Knowledge Filtering

# **Feature Selection**

In [12]:
# Select input features
Symptoms['search_term']   = Symptoms['search_term'].astype(str)
Symptoms['symptoms_note'] = Symptoms['symptoms_note'].astype(str)

In [13]:
# Tokenize and remove stopwords
thai_stopwords = list(thai_stopwords())

In [14]:
def preprocess_text(text):
    words = word_tokenize(text, engine="newmm")  # Tokenize using newmm engine
    words = [word for word in words if word not in thai_stopwords]  # Remove stopwords
    return " ".join(words)

In [15]:
Symptoms['symptoms_note'] = Symptoms['symptoms_note'].apply(preprocess_text)

In [16]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, max_features=5000)
tfidf_matrix = vectorizer.fit_transform(Symptoms['symptoms_note'])

In [17]:
# Convert TF-IDF to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=Symptoms.index)

# **Content-Based Filtering**

In [18]:
# Compute similarity
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to get symptom recommendations based on search term
def get_similar_symptoms(search_index, top_n=5):
    sim_scores = list(enumerate(cosine_sim[search_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    symptom_indices = [i[0] for i in sim_scores]
    return Symptoms.iloc[symptom_indices]['search_term'].values

# Example usage
search_index = 0  # Change based on actual search index
recommended_symptoms = get_similar_symptoms(search_index)
print("Recommended Symptoms:", recommended_symptoms)

Recommended Symptoms: ['มีเสมหะ, ไอ' 'มีเสมหะ, ไอ' 'มีเสมหะ, เสียงแหบ'
 'กลืนเจ็บ, คันคอมีเสมหะ, เจ็บคอ, เสมหะไหลลงคอ, ไอ, ไอกลางคืน, '
 'เสมหะไหลลงคอ, มีเสมหะ']


# **Merge Rare Symptoms into an "Other" Category**

In [19]:
# Count occurrences of each search_term
term_counts = Symptoms['search_term'].value_counts()

# Define a threshold (e.g., classes with ≤2 samples)
threshold = 2

# Replace rare search_term values with "Other"
Symptoms['search_term'] = Symptoms['search_term'].apply(lambda x: x if term_counts[x] > threshold else "Other")

# Check updated value counts
print(Symptoms['search_term'].value_counts())

# Proceed with PyCaret setup
df_pycaret = Symptoms[['symptoms_note', 'search_term']]
clf = setup(df_pycaret, target='search_term', session_id=1234, text_features=['symptoms_note'])

search_term
Other                     364
ไอ                         62
เจ็บคอ                     51
ไข้                        40
ปวดท้อง                    23
                         ... 
ไอ, น้ำมูกไหลมีเสมหะ,       3
กระแทก                      3
ไข้, ปวดหัว                 3
กลืนลำบาก                   3
มีเสมหะ, ไอน้ำมูกไหล,       3
Name: count, Length: 70, dtype: int64


Unnamed: 0,Description,Value
0,Session id,1234
1,Target,search_term
2,Target type,Multiclass
3,Target mapping,"Fever: 0, Other: 1, Sore throat: 2, cough: 3, skin rash: 4, กระแทก: 5, กลืนลำบาก: 6, ก้อนที่ผิวหนัง: 7, คัดจมูก: 8, คัดจมูก, น้ำมูกไหลมีเสมหะ, : 9, คัน: 10, คัน, ผื่น: 11, จุกแน่นท้อง: 12, จุกแน่นท้อง, ปวดท้อง: 13, ชา: 14, ตัวร้อน: 15, ตัวร้อน, ไข้: 16, ตาบวม: 17, ตาแดง: 18, ถ่ายเหลว, ท้องเสีย: 19, ท้องเสีย: 20, ท้องเสีย, ปวดท้อง: 21, น้ำมูกไหล: 22, น้ำมูกไหล, มีเสมหะ: 23, น้ำมูกไหล, มีเสมหะไอ, : 24, น้ำมูกไหล, ไอ: 25, น้ำมูกไหล, ไอมีเสมหะ, : 26, บ้านหมุน: 27, บ้านหมุน, มึนศีรษะ: 28, ปวดกระดูก: 29, ปวดข้อ: 30, ปวดข้อนิ้วเท้า: 31, ปวดข้อมือ: 32, ปวดข้อเท้า: 33, ปวดคอ: 34, ปวดตา: 35, ปวดท้อง: 36, ปวดหลัง: 37, ปวดหัว: 38, ปวดหัวไหล่: 39, ปวดหู: 40, ปวดเข่า: 41, ปวดเท้า: 42, ปวดเมื่อยกล้ามเนื้อทั่วๆ: 43, ปวดแขน: 44, ผื่น: 45, ผื่น, คัน: 46, มีเสมหะ: 47, มีเสมหะ, น้ำมูกไหล: 48, มีเสมหะ, เจ็บคอ: 49, มีเสมหะ, ไอ: 50, มีเสมหะ, ไอน้ำมูกไหล, : 51, มึนศีรษะ: 52, หูอื้อ: 53, เคืองตา: 54, เจ็บคอ: 55, เจ็บคอ, มีเสมหะ: 56, เสียงดังในหู: 57, ไข้: 58, ไข้, ตัวร้อน: 59, ไข้, น้ำมูกไหล: 60, ไข้, ปวดหัว: 61, ไข้, เจ็บคอ: 62, ไข้, ไอ: 63, ไอ: 64, ไอ, น้ำมูกไหล: 65, ไอ, น้ำมูกไหลมีเสมหะ, : 66, ไอ, มีเสมหะ: 67, ไอ, มีเสมหะน้ำมูกไหล, : 68, ไอ, เจ็บคอ: 69"
4,Original data shape,"(1000, 2)"
5,Transformed data shape,"(1000, 335)"
6,Transformed train set shape,"(700, 335)"
7,Transformed test set shape,"(300, 335)"
8,Text features,1
9,Preprocess,True


# **Knowledge-Based Filtering with PyCaret**

In [20]:
# PyCaret Classification Model Setup
df_pycaret = Symptoms[['symptoms_note', 'search_term']]
clf = setup(df_pycaret, target='search_term', session_id=1234, text_features=['symptoms_note'])

# Train the best model
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,search_term
2,Target type,Multiclass
3,Target mapping,"Fever: 0, Other: 1, Sore throat: 2, cough: 3, skin rash: 4, กระแทก: 5, กลืนลำบาก: 6, ก้อนที่ผิวหนัง: 7, คัดจมูก: 8, คัดจมูก, น้ำมูกไหลมีเสมหะ, : 9, คัน: 10, คัน, ผื่น: 11, จุกแน่นท้อง: 12, จุกแน่นท้อง, ปวดท้อง: 13, ชา: 14, ตัวร้อน: 15, ตัวร้อน, ไข้: 16, ตาบวม: 17, ตาแดง: 18, ถ่ายเหลว, ท้องเสีย: 19, ท้องเสีย: 20, ท้องเสีย, ปวดท้อง: 21, น้ำมูกไหล: 22, น้ำมูกไหล, มีเสมหะ: 23, น้ำมูกไหล, มีเสมหะไอ, : 24, น้ำมูกไหล, ไอ: 25, น้ำมูกไหล, ไอมีเสมหะ, : 26, บ้านหมุน: 27, บ้านหมุน, มึนศีรษะ: 28, ปวดกระดูก: 29, ปวดข้อ: 30, ปวดข้อนิ้วเท้า: 31, ปวดข้อมือ: 32, ปวดข้อเท้า: 33, ปวดคอ: 34, ปวดตา: 35, ปวดท้อง: 36, ปวดหลัง: 37, ปวดหัว: 38, ปวดหัวไหล่: 39, ปวดหู: 40, ปวดเข่า: 41, ปวดเท้า: 42, ปวดเมื่อยกล้ามเนื้อทั่วๆ: 43, ปวดแขน: 44, ผื่น: 45, ผื่น, คัน: 46, มีเสมหะ: 47, มีเสมหะ, น้ำมูกไหล: 48, มีเสมหะ, เจ็บคอ: 49, มีเสมหะ, ไอ: 50, มีเสมหะ, ไอน้ำมูกไหล, : 51, มึนศีรษะ: 52, หูอื้อ: 53, เคืองตา: 54, เจ็บคอ: 55, เจ็บคอ, มีเสมหะ: 56, เสียงดังในหู: 57, ไข้: 58, ไข้, ตัวร้อน: 59, ไข้, น้ำมูกไหล: 60, ไข้, ปวดหัว: 61, ไข้, เจ็บคอ: 62, ไข้, ไอ: 63, ไอ: 64, ไอ, น้ำมูกไหล: 65, ไอ, น้ำมูกไหลมีเสมหะ, : 66, ไอ, มีเสมหะ: 67, ไอ, มีเสมหะน้ำมูกไหล, : 68, ไอ, เจ็บคอ: 69"
4,Original data shape,"(1000, 2)"
5,Transformed data shape,"(1000, 335)"
6,Transformed train set shape,"(700, 335)"
7,Transformed test set shape,"(300, 335)"
8,Text features,1
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6,0.0,0.6,0.5234,0.5471,0.5192,0.5243,0.364
rf,Random Forest Classifier,0.5986,0.0,0.5986,0.5307,0.5499,0.5211,0.5253,0.331
xgboost,Extreme Gradient Boosting,0.5943,0.0,0.5943,0.5229,0.5459,0.5134,0.5183,3.254
gbc,Gradient Boosting Classifier,0.5843,0.0,0.5843,0.5292,0.5436,0.5103,0.514,25.123
dt,Decision Tree Classifier,0.58,0.0,0.58,0.5267,0.5393,0.5047,0.5087,0.109
lightgbm,Light Gradient Boosting Machine,0.58,0.0,0.58,0.5138,0.5326,0.4988,0.5028,7.385
svm,SVM - Linear Kernel,0.5686,0.0,0.5686,0.5027,0.5128,0.4816,0.4941,0.277
ridge,Ridge Classifier,0.5286,0.0,0.5286,0.3969,0.4422,0.4199,0.4286,0.053
lr,Logistic Regression,0.4986,0.0,0.4986,0.3214,0.3779,0.3134,0.357,2.169
knn,K Neighbors Classifier,0.4829,0.0,0.4829,0.4215,0.4322,0.3722,0.3777,0.054


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

# **Hybrid Recommendation System**

In [21]:
def hybrid_recommendation(search_index, top_n=5):
    # Content-Based Filtering
    content_recommendations = get_similar_symptoms(search_index, top_n)

    # Knowledge-Based Filtering (PyCaret Model Prediction)
    test_data = pd.DataFrame({'symptoms_note': [Symptoms.iloc[search_index]['symptoms_note']]})
    prediction = predict_model(best_model, data=test_data)

    # Print the prediction DataFrame to debug column names
    print(prediction.head())

    # Extract the correct prediction column
    predicted_column = 'Label' if 'Label' in prediction.columns else 'prediction_label'
    knowledge_recommendation = prediction[predicted_column].values[0]

    # Combine Results
    final_recommendations = list(set(content_recommendations) | {knowledge_recommendation})
    return final_recommendations

# Example Usage
search_index = 0
final_recommendations = hybrid_recommendation(search_index)
print("Hybrid Recommended Symptoms:", final_recommendations)

                                       symptoms_note prediction_label  \
0  ลักษณะ   เสมหะ สีเหลือง / เขียว ,   ระยะเวลา  ...      มีเสมหะ, ไอ   

   prediction_score  
0               1.0  
Hybrid Recommended Symptoms: ['มีเสมหะ, ไอ', 'Other']
