In [1]:
!pip install --upgrade scikit-learn



In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score,mean_squared_error, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
Symptoms = pd.read_csv('/content/drive/MyDrive/[CONFIDENTIAL] AI symptom picker data (Agnos candidate assignment) - ai_symptom_picker.csv')

In [4]:
Symptoms.head()

Unnamed: 0,gender,age,summary,search_term
0,male,28,"{""diseases"": [], ""procedures"": [], ""no_symptom...","มีเสมหะ, ไอ"
1,male,27,"{""diseases"": [], ""procedures"": [], ""no_symptom...","ไอ, น้ำมูกไหล"
2,female,26,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ปวดท้อง
3,male,42,"{""diseases"": [], ""procedures"": [], ""no_symptom...",น้ำมูกไหล
4,female,40,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ตาแห้ง


# **Extract Text**

In [5]:
def extract_answers(summary):
    """
    Extracts the 'answers' values from the 'yes_symptoms' list in a JSON-like string.
    """
    try:
        data = eval(summary)  # Evaluate the string as a Python dictionary
        if isinstance(data, dict) and 'yes_symptoms' in data:
            answers = []
            for symptom in data['yes_symptoms']:
                if 'answers' in symptom:
                    answers.extend(symptom['answers'])
            return ', '.join(answers)  # Join the answers with commas
        else:
            return None  # Return None if the structure is not as expected
    except (SyntaxError, TypeError, KeyError):
        return None  # Return None if there's an error parsing or accessing the data

In [6]:
def process_dataframe(df, summary_col='summary', symptoms_note_col='symptoms_note'):
    """
    Processes a DataFrame to extract 'answers' and create a new 'symptoms_note' column.
    """
    df[symptoms_note_col] = df[summary_col].apply(extract_answers)
    return df

In [7]:
# Process the DataFrame
Symptoms = process_dataframe(Symptoms)

In [8]:
Symptoms.head()

Unnamed: 0,gender,age,summary,search_term,symptoms_note
0,male,28,"{""diseases"": [], ""procedures"": [], ""no_symptom...","มีเสมหะ, ไอ","ลักษณะ เสมหะเปลี่ยนสีเหลือง/เขียว, ระยะเวลา ไม..."
1,male,27,"{""diseases"": [], ""procedures"": [], ""no_symptom...","ไอ, น้ำมูกไหล","ระยะเวลา 1-3 สัปดาห์, ลักษณะ ไอไม่มีเสมหะ ไอแห..."
2,female,26,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ปวดท้อง,"บริเวณ รอบๆสะดือ, ระยะเวลา ตั้งแต่ 1 วัน ถึง 1..."
3,male,42,"{""diseases"": [], ""procedures"": [], ""no_symptom...",น้ำมูกไหล,"ระยะเวลา น้อยกว่า 10 วัน, ประวัติ ATK ยังไม่ได..."
4,female,40,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ตาแห้ง,การรักษาก่อนหน้า ไม่เคย


In [9]:
Symptoms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   gender         1000 non-null   object
 1   age            1000 non-null   int64 
 2   summary        1000 non-null   object
 3   search_term    1000 non-null   object
 4   symptoms_note  1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


# **Hybrid Recommendation** consists of Content based Filtering Recommender System and Knowledge Filtering

In [10]:
def hybrid_recommendation(Symptoms, symptoms_note_col='symptoms_note', search_term_col='search_term'):
    """
    Hybrid Recommendation System using Content-based and Knowledge-based filtering.

    Args:
        df (pd.DataFrame): Input DataFrame with 'symptoms_note' and 'search_term' columns.
        symptoms_note_col (str): Name of the column containing symptom notes.
        search_term_col (str): Name of the column containing search terms (disease symptoms).

    Returns:
        tuple: Predicted search terms (disease symptoms) and evaluation metrics.
    """

    # 1. Content-Based Filtering (using symptoms_note)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(Symptoms[symptoms_note_col].fillna(''))  # Handle NaN values

    # For Content-based, we'll use a regressor to predict relevance scores
    # Create target variable: For simplicity, let's use the length of search_term as a proxy for relevance.
    # You might need a more sophisticated relevance score based on your data.
    target_relevance = Symptoms[search_term_col].fillna('').apply(len)

    X_train_content, X_test_content, y_train_content, y_test_content = train_test_split(tfidf_matrix, target_relevance, test_size=0.2, random_state=42)

    content_model = RandomForestRegressor(n_estimators=100, random_state=42)
    content_model.fit(X_train_content, y_train_content)
    content_predictions = content_model.predict(X_test_content)

    # 2. Knowledge-Based Filtering (using search_term categories as labels)
    # Assuming search_term is categorical (you might need to preprocess it if it's not)
    # For Knowledge-based, we'll use a classifier to predict the search_term category.
    # If search_term is not categorical, you'll need to transform it into categories.

    # Example: If search_term is already categorical, you can use it directly.
    # If not, you might need to create categories based on keywords or other criteria.

    X_train_knowledge, X_test_knowledge, y_train_knowledge, y_test_knowledge = train_test_split(Symptoms[symptoms_note_col].fillna(''), Symptoms[search_term_col].fillna(''), test_size=0.2, random_state=42)

    knowledge_vectorizer = TfidfVectorizer()
    X_train_knowledge_vectorized = knowledge_vectorizer.fit_transform(X_train_knowledge)
    X_test_knowledge_vectorized = knowledge_vectorizer.transform(X_test_knowledge)

    knowledge_model = RandomForestClassifier(n_estimators=100, random_state=42)
    knowledge_model.fit(X_train_knowledge_vectorized, y_train_knowledge)
    knowledge_predictions = knowledge_model.predict(X_test_knowledge_vectorized)

    # 3. Hybridization (Combine Content-based and Knowledge-based results)
    # Simple averaging (You can use more sophisticated methods like weighted averaging)
    hybrid_predictions = []
    for i in range(len(X_test_knowledge)):
        if content_predictions[i] > 2:  # Example threshold based on relevance
            hybrid_predictions.append(knowledge_predictions[i])
        else:
            hybrid_predictions.append(y_test_knowledge.iloc[i])  # Use actual search_term if content relevance is low

    # Evaluation Metrics (for Knowledge-based, since it's classification)
    accuracy = accuracy_score(y_test_knowledge, knowledge_predictions)
    precision = precision_score(y_test_knowledge, knowledge_predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test_knowledge, knowledge_predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test_knowledge, knowledge_predictions, average='weighted', zero_division=0)

    # ROC Curve and AUC (for multi-class classification, use one-vs-rest)
    y_test_binarized = pd.get_dummies(y_test_knowledge)
    knowledge_probabilities = knowledge_model.predict_proba(X_test_knowledge_vectorized)

    fpr = dict()
    tpr = dict()

    # RMSE (for Content-based, since it's regression)
    rmse = mean_squared_error(y_test_content, content_predictions) ** 0.5 # Manual RMSE calculation

    return hybrid_predictions, {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'rmse': rmse
    }

In [11]:
predicted_symptoms, metrics = hybrid_recommendation(Symptoms)

In [12]:
print("Predicted Symptoms:", predicted_symptoms)

Predicted Symptoms: ['ไอ', 'ปวดเข่า', 'ปวดข้อ', 'Sore throat, Dry throat', 'ตัวร้อน', 'ผื่น', 'ปวดท้อง', 'ปวดคอ', 'แสบท้อง, ปวดท้อง', 'เจ็บคอ', 'Fever', 'คันจมูก, จมูกตัน', 'ไอ, เจ็บคอ', 'คัน', 'ไอ', 'Dizzy, Eye irritation', 'ท้องเสีย', 'ผื่น', 'skin rash', 'ผื่น, ไข้', 'ไอ, มีเสมหะ', 'ไข้', 'ท้องเสีย', 'ปวดตา', 'ปวดท้อง', 'ตาแดง', 'น้ำมูกไหล', 'ร้อนวูบวาบ, ไข้', 'ปวดเข่า', 'ไอ', 'แสบท้อง, ปวดท้อง', 'ผื่น', 'Eye discharge', 'เจ็บคอ', 'ปวดข้อ', 'Sore throat, Dry throat', 'ปวดท้อง', 'เจ็บคอ', 'หายใจไม่ออก, แน่นจมูกเลือดกำเดาไหล, ', 'ไอ', 'ไข้', 'ปวดตา, ตาบวม', 'มีเสมหะ, เจ็บคอ', 'มีเสมหะ, ไอ', 'คัน', 'เจ็บคอ', 'เจ็บคอ', 'น้ำมูกไหล', 'น้ำมูกไหล', 'ปวดท้อง', 'armpit lump, skin lumpAcne, ', 'ตาแดง', 'ผื่น', 'เจ็บคอ', 'ไอ, หายใจหอบเหนื่อยเหนื่อย, ', 'ไอ', 'ก้อนที่ผิวหนัง', 'ตาแดง', 'ไอ, น้ำมูกไหล', 'ไอ, น้ำมูกไหลมีเสมหะ, ', 'ปวดหลัง', 'ไอ, น้ำมูกไหล', 'เจ็บคอ', 'ตาแดง', 'คันจมูก, จมูกตัน', 'ไอ', 'คันคอ, ไอ', 'คัน, ผื่น', 'ปวดท้อง, คลื่นไส้อาเจียน, ', 'ไข้, มึนศีรษะ', 'ไข้', 'ตาแดง', 'Fever',

In [13]:
print("Evaluation Metrics:", metrics)

Evaluation Metrics: {'accuracy': 0.41, 'precision': 0.34213484432234437, 'recall': 0.41, 'f1': 0.3584266557197591, 'rmse': 8.490742803631553}
