In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-clean/diseases_details_clean.csv


In [14]:
import chardet

# Read only the first 100,000 bytes of the file and let chardet guess the encoding from that sample.
with open("/kaggle/input/dataset-clean/diseases_details_clean.csv", 'rb') as raw_file:
    sample = raw_file.read(100000)

# The returned dict contains the guessed encoding together with a confidence score.
result = chardet.detect(sample)
print(result)


{'encoding': 'Windows-1252', 'confidence': 0.7277909389590178, 'language': ''}


In [16]:
import pandas as pd

# Load the dataset as Windows-1252; bytes that cannot be decoded are silently dropped
# (encoding_errors='ignore').
# NOTE(review): the chardet cell above reported only ~0.73 confidence for Windows-1252 —
# confirm the encoding, since a wrong guess combined with 'ignore' would corrupt text unnoticed.
df = pd.read_csv(
    "/kaggle/input/dataset-clean/diseases_details_clean.csv",
    encoding="Windows-1252",
    encoding_errors='ignore',
)

# Quick sanity check: (rows, columns) followed by the column labels.
for summary in (df.shape, df.columns):
    print(summary)

(10143, 6)
Index(['name', 'description', 'signs_and_symptoms', 'diagnostic_test',
       'treatment', 'url'],
      dtype='object')


In [17]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10143 entries, 0 to 10142
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   name                10143 non-null  object
 1   description         9782 non-null   object
 2   signs_and_symptoms  10143 non-null  object
 3   diagnostic_test     9813 non-null   object
 4   treatment           9685 non-null   object
 5   url                 10143 non-null  object
dtypes: object(6)
memory usage: 475.6+ KB


In [18]:

# Share of missing values in each column, highest first
null_mask = df.isna()
missing_ratio = null_mask.mean().sort_values(ascending=False)
print("Missing value ratio:\n", missing_ratio)

# Number of distinct values per column (nunique ignores NaN by default)
print(df.nunique())

Missing value ratio:
 treatment             0.045154
description           0.035591
diagnostic_test       0.032535
name                  0.000000
signs_and_symptoms    0.000000
url                   0.000000
dtype: float64
name                  10141
description            9779
signs_and_symptoms    10139
diagnostic_test        9810
treatment              9680
url                   10143
dtype: int64


In [21]:
# Treat empty / whitespace-only strings as missing values
df_temp = df.replace(r'^\s*$', pd.NA, regex=True)

# Every column except the first and the last one
columns_to_check = df_temp.columns[1:-1]

# Rows in which all of those columns are missing (NaN)
all_missing = df_temp[columns_to_check].isna().all(axis=1)
empty_rows = df_temp.loc[all_missing, columns_to_check]

# Report how many such rows exist and where they are
print("Số dòng trống toàn bộ (trừ cột đầu và cuối):", len(empty_rows))
print("Chỉ số các dòng:", empty_rows.index.tolist())


Số dòng trống toàn bộ (trừ cột đầu và cuối): 0
Chỉ số các dòng: []


In [22]:
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer

# Download the required NLTK resource (the "wordnet" package)
nltk.download("wordnet")

# Module-level lemmatizer instance; clean_text below calls lemmatizer.lemmatize on every word
lemmatizer = WordNetLemmatizer()

# Built once instead of on every clean_text call (the function runs once per cell of the frame).
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
_URL_RE = re.compile(r"http\S+|www\S+|https\S+")
_TAG_RE = re.compile(r"<.*?>")
_DIGITS_RE = re.compile(r"\d+")
_SPACES_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalise one text value for downstream NLP use.

    Steps, in order: lower-case, remove URL-like tokens, remove ``<...>``
    tags, delete ASCII punctuation, delete digit runs, collapse whitespace,
    then lemmatize every remaining word with the module-level ``lemmatizer``.

    Args:
        text: The value to clean. Null values (``None``, ``NaN``, ``pd.NA``)
            yield an empty string; any other non-string value is converted
            with ``str()`` first instead of failing on ``.lower()``.

    Returns:
        str: The cleaned words joined by single spaces ("" if nothing is left).
    """
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    text = _URL_RE.sub("", text)
    text = _TAG_RE.sub("", text)
    # Punctuation is deleted, not replaced by a space, so "well-known" becomes "wellknown".
    text = text.translate(_PUNCT_TABLE)
    text = _DIGITS_RE.sub("", text)
    text = _SPACES_RE.sub(" ", text).strip()
    return " ".join(lemmatizer.lemmatize(word) for word in text.split())

# === Select the text columns to normalise ===
# 'url' is skipped, and so are columns that already end in "_clean": without that filter,
# re-running this cell would pick up its own output and create "name_clean_clean", etc.
columns_to_clean = [
    col
    for col in df.select_dtypes(include='object').columns
    if col != "url" and not str(col).endswith("_clean")
]

# === Apply clean_text to every selected column ===
for col in columns_to_clean:
    # fillna("") must come before astype(str): astype(str) alone turns NaN into the literal
    # text "nan", which bypasses the null check in clean_text and ends up in the output.
    df[col + "_clean"] = df[col].fillna("").astype(str).apply(clean_text)

# === Save the result to a new file ===
df.to_csv("cleaned_disease_dataset.csv", index=False)


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
import csv
import json
import re

# Source CSV under the Kaggle input directory and destination JSON under /kaggle/working/.
input_csv_path = "/kaggle/input/dataset-clean/diseases_details_clean.csv"
output_json_path = "/kaggle/working/diseases_cleaned_with_symptom_list.json"

def normalize_text(text):
    """Flatten a raw field value into a single tidy line of text.

    Falsy or whitespace-only input yields "". Otherwise the value is
    converted to ``str``, bracketed numeric markers such as ``[12]`` are
    removed, escaped ``\\n`` sequences become real newlines, carriage
    returns are deleted, and every run of whitespace is collapsed to one
    space before the result is trimmed.
    """
    raw = "" if not text else str(text)
    if not raw.strip():
        return ""

    without_markers = re.sub(r'\[\d+\]', '', raw)
    unescaped = without_markers.replace("\\n", "\n").replace("\r", "")
    return re.sub(r'\s+', ' ', unescaped).strip()

def extract_symptoms_safe(text):
    """Extract a de-duplicated list of symptom names from free text.

    Two sources are combined, in this order:

    1. For every line containing ":", the part before the first colon is
       kept when it is short (at most 6 words, more than 2 characters).
    2. If the text contains "Additional Symptoms", every non-empty line
       after that marker shorter than 100 characters is kept, with leading
       and trailing bullet characters, colons and blanks removed.

    Duplicates are dropped case-insensitively; the first spelling wins and
    the original order is preserved.

    Args:
        text: Raw symptom text; any value is accepted and converted with
            ``str()``.

    Returns:
        list[str]: Unique symptom names in order of first appearance.
    """
    header = "Additional Symptoms"
    # normalize_text treats an escaped "\n" as a line break; do the same here so that
    # such text is still split into lines.
    text = str(text).replace("\\n", "\n")

    symptoms = []
    seen = set()  # lower-cased keys already emitted: O(1) duplicate checks

    def _add(candidate):
        key = candidate.lower()
        if key not in seen:
            seen.add(key)
            symptoms.append(candidate.strip())

    for line in text.splitlines():
        if ":" not in line:
            continue
        left = line.split(":", 1)[0].strip()
        if left.lower() == header.lower():
            # "Additional Symptoms:" is a section heading, not a symptom.
            continue
        if len(left.split()) <= 6 and len(left) > 2:
            _add(left)

    if header in text:
        after_additional = text.split(header, 1)[1]
        for line in after_additional.splitlines():
            line = line.strip("-•* :\n\t")
            if line and len(line) < 100:
                _add(line)

    return symptoms

# === Read the CSV one row at a time and write JSON incrementally to save memory ===
# The files are opened via input_csv_path / output_json_path, the variables assigned at the
# top of this cell. The names used before (input_path / output_path) are not defined in the
# notebook as shown, so a fresh "Restart & Run All" failed here with NameError.
with open(input_csv_path, mode='r', encoding='Windows-1252', errors='ignore') as infile, \
     open(output_json_path, mode='w', encoding='utf-8') as outfile:

    reader = csv.DictReader(infile)
    outfile.write("[\n")

    for row_index, row in enumerate(reader):
        cleaned = {}
        for k, v in row.items():
            k_clean = k.strip()
            if k_clean.lower() == "url":
                # URLs are only trimmed, never passed through normalize_text
                cleaned[k_clean] = v.strip() if v else ""
            else:
                cleaned[k_clean] = normalize_text(v)

        # Extract the symptom list from the raw (un-normalised) field, where line breaks
        # have not yet been collapsed into spaces
        cleaned["symptoms_list"] = extract_symptoms_safe(row.get("signs_and_symptoms") or "")

        # Comma between records, none before the first one
        if row_index:
            outfile.write(",\n")

        json.dump(cleaned, outfile, ensure_ascii=False, indent=2)

    outfile.write("\n]")

print("✅ Xử lý thành công. File kết quả:", output_json_path)


✅ Xử lý thành công. File kết quả: /kaggle/working/diseases_cleaned_with_symptom_list.json
