<a href="https://colab.research.google.com/github/lili0706/final_project/blob/main/DL_finalPJ_using_description.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, hamming_loss
from keras.metrics import binary_accuracy
from tensorflow.keras.metrics import BinaryAccuracy

In [None]:
# Step 1: Load the three source tables (diseases, drugs, disease-drug mapping).
# NOTE(review): these are signed storage URLs carrying X-Goog-Date=20241206 and
# X-Goog-Expires=259200; presumably they stop working once that window has
# passed -- regenerate them (or point to a local copy) if the download fails.
diseases_url = "https://storage.googleapis.com/kagglesdsdata/datasets/5609522/9269505/diseasesInfo.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241206%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241206T060642Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=145d74bf3aab302ba3bd874fd62c49e4dcb3bb17649c8ec3f5d63670e271d0c2063be5dc42ed4f10c0bbcb9976fb51d858eadb21ad786072289dededcbf92afd8c04cdc8916b5dd2e327dc705fb334fdaa422a9807a33efd0a55fa008d6e4e9a86650ba350a85b0efebb97035e4e12ea3d721c6f085c3c7dba82bbbda8a45361fe4c7fbebc7fb0d74e9f4e03fef85bdaf1198b2132c4840f7fef997e561882623ba9eaad06ef531d2b4f63f49e97a89c2a946917158ddf53fa366619cff52ad525d299ad03a6eea91b3c1cf555a9f45c213dac0685beab4c8b5bd310168c2084a8284792636d38ed730d09ab6343a8bca35318f1b67b19a6121a008743862d39"
drugs_url = "https://storage.googleapis.com/kagglesdsdata/datasets/5609522/9269505/drugsInfo.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241206%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241206T060940Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=0d27b819a1cd8d35819c83d3928085eae33c195d7929bed3eabeb37a508ba3d7d9d6c42941743137c45f22946efc74299fd87cecb5f347cb22bb744bd9a9e47aa71be72574c05515470a669862af2978c6a7b77bd6c10162e4880f0880ac7d5b89586161c68e05f8c1b1e4abbb6f88753c1ec4a7a1d819c2000892b710ce18e2a6d185b0865249c7536382c6a539e522a0b1d0e4dcbaa3d9df2a1254d03a20f7c419c729a06a8c4214ba5ce3d1919fbe79c1ff0187c16dec7c875d42ee9e4f020c51a80fef86f1375503848f25fa42c944500487b1c7d3196524af415fbd13364395efdfdf273018132482c360632498512bb9ca249832cfcd5b4750feaf91d3"
mapping_url = "https://storage.googleapis.com/kagglesdsdata/datasets/5609522/9269505/mapping.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241206%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241206T061003Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=17fa4180c51b257256c5b80f029aaa5851103abc3f709b296235f3520a89cf7b67e864aa32ad7403d4c302d9bd4a2c5d72c2b616f90c4fb5c7230fbff1e577c349203717567e08b66f03a8ecc0da86c7134ac739a2c35810bcf74ec13d6d6696d636427572552e5e604dcf7a6d6e34124783f28c2c463cac482ce4db59e1b49dc29d157a427b2fbcf8fb2e1a50073e77f9ad97535778d074561646d9e67399060952b0e0705edeb02a5335db6d65677310ef6d201f55f7565a50bda028522b7b56526a94339a5b810c36bcb5777344cceed1a59befd6f71b5a8646a690b4914cec73e332e2b2b57bb7887d2ba62923564a0d15613fdb2a285e2349085b73df99"
diseases_info, drugs_info, mapping = (
    pd.read_csv(url) for url in (diseases_url, drugs_url, mapping_url)
)

# Step 2: Keep the diseases whose 'SlimMapping' text contains 'Cancer'.
# Rows with a missing 'SlimMapping' are treated as non-matching (na=False).
is_cancer = diseases_info['SlimMapping'].str.contains('Cancer', na=False)
cancer_diseases = diseases_info[is_cancer]
cancer_disease_ids = cancer_diseases['DiseaseID'].unique()

# Step 3: Keep the mapping rows for those diseases and collect the distinct
# drug IDs they reference.
maps_to_cancer = mapping['DiseaseID'].isin(cancer_disease_ids)
cancer_drug_mapping = mapping[maps_to_cancer]
cancer_drug_ids = cancer_drug_mapping['DrugID'].unique()


In [None]:
# Step 4: Prepare label matrix
# `labels` maps each disease ID to the list of drug IDs mapped to it, visiting
# drugs in the order they appear in `cancer_drug_ids`.
labels = {}
for drug_id in cancer_drug_ids:
    rows_for_drug = cancer_drug_mapping['DrugID'] == drug_id
    for disease_id in cancer_drug_mapping.loc[rows_for_drug, 'DiseaseID']:
        labels.setdefault(disease_id, []).append(drug_id)

# Multi-label target: one row per cancer disease, one column per drug;
# a cell is 1 when the mapping table links that disease to that drug.
num_diseases, num_drugs = len(cancer_disease_ids), len(cancer_drug_ids)

# Position of every ID along its axis of the matrix.
disease_to_idx = {disease_id: row for row, disease_id in enumerate(cancer_disease_ids)}
drug_to_idx = {drug_id: col for col, drug_id in enumerate(cancer_drug_ids)}

label_matrix = np.zeros((num_diseases, num_drugs))
for disease_id, drug_ids in labels.items():
    row = disease_to_idx[disease_id]
    cols = [drug_to_idx[drug_id] for drug_id in drug_ids]
    # Set all of this disease's drug columns in one assignment.
    label_matrix[row, cols] = 1

In [None]:
# Step 5: Prepare input data (disease descriptions to numerical features)
# Missing descriptions are replaced with empty strings before vectorizing,
# because TfidfVectorizer raises on NaN documents.
vectorizer = TfidfVectorizer(max_features=1000)
input_disease_data = vectorizer.fit_transform(
    cancer_diseases['DiseaseDescription'].fillna('')
).toarray()

# Step 6: Vectorize drug descriptions
# NOTE(review): `input_drug_data` is not fed into the model input below; it is
# kept because cells outside this view may use it -- confirm before removing.
drug_vectorizer = TfidfVectorizer(max_features=1000)
input_drug_data = drug_vectorizer.fit_transform(
    drugs_info['DrugDescription'].fillna('')
).toarray()

# Build the model input from the disease text features only.
#
# The previous version appended, for each disease, a 0/1 indicator per drug
# copied straight from `label_matrix`. Those columns are identical to the
# training targets, so the network could read the answer from its own input
# (target leakage) and the reported metrics did not reflect real predictive
# ability. They are intentionally left out here, which makes
# `input_combined_data` narrower by `num_drugs` columns.
#
# As before, only the first `num_diseases` rows are used.
# NOTE(review): this assumes the rows of `cancer_diseases` are in the same
# order as `cancer_disease_ids` (i.e. 'DiseaseID' has no duplicates) -- verify.
input_combined_data = input_disease_data[:num_diseases].copy()

In [None]:
# Step 7: Train/Test split
# 20% of the rows are held out for testing; the seed is fixed so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    input_combined_data,
    label_matrix,
    test_size=0.2,
    random_state=42,
)

# Step 8: Build the model
# Feed-forward network: two ReLU hidden layers (128 then 64 units), each
# followed by 50% dropout, and one sigmoid output per drug so every drug is
# scored independently (multi-label).
input_combined = layers.Input(shape=(X_train.shape[1],))
x = input_combined
for hidden_units in (128, 64):
    x = layers.Dense(hidden_units, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
output_drugs = layers.Dense(num_drugs, activation='sigmoid')(x)

model = models.Model(inputs=input_combined, outputs=output_drugs)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

# Step 9: Train the model
# A further 20% of the training rows is set aside for validation each epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
)



Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - binary_accuracy: 0.5284 - loss: 0.6921 - val_binary_accuracy: 0.7106 - val_loss: 0.6845
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - binary_accuracy: 0.6701 - loss: 0.6809 - val_binary_accuracy: 0.8439 - val_loss: 0.6676
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - binary_accuracy: 0.7728 - loss: 0.6624 - val_binary_accuracy: 0.9196 - val_loss: 0.6326
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - binary_accuracy: 0.8528 - loss: 0.6247 - val_binary_accuracy: 0.9533 - val_loss: 0.5694
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - binary_accuracy: 0.9175 - loss: 0.5597 - val_binary_accuracy: 0.9608 - val_loss: 0.4789
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - binary_accuracy: 0.9508 - loss: 0.4798 - val_binar

In [None]:
# Step 10: Evaluate the model on the test set
y_pred = model.predict(X_test)

# Uncomment to inspect the distribution of the predicted scores
# print("Prediction distribution:")
# print(f"Min: {y_pred.min()}, Max: {y_pred.max()}, Mean: {y_pred.mean()}")

# Binarize the scores with a fixed threshold of 0.1
threshold = 0.1
y_pred_bin = (y_pred > threshold).astype(int)

# Sample-averaged multi-label metrics, computed in the order F1, precision, recall
f1, precision, recall = (
    scorer(y_test, y_pred_bin, average='samples')
    for scorer in (f1_score, precision_score, recall_score)
)
hamming = hamming_loss(y_test, y_pred_bin)

# Accuracy via the Keras BinaryAccuracy metric, applied to the binarized predictions
binary_acc = BinaryAccuracy()
binary_acc.update_state(y_test, y_pred_bin)
accuracy = binary_acc.result().numpy()

print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Final result: sample-averaged ROC AUC on the raw (un-thresholded) scores
auc = roc_auc_score(y_test, y_pred, average='samples')
print(f"AUC: {auc:.4f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
F1 Score: 0.2306
Precision: 0.3002
Recall: 0.3470
Hamming Loss: 0.0684
Accuracy: 0.9316
AUC: 0.8125
