In [2]:
# Attach Google Drive to the Colab runtime; the trained model is written to
# (and later reloaded from) a path underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Install Libraries

In [None]:
!pip install datasets

In [None]:
!pip install torch

# Import Libraries

In [42]:
from collections import Counter

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import torch
from datasets import load_dataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from torch.utils.data import random_split

# Load Dataset

In [16]:
# Load the train split of the medical-questions dataset from the Hugging Face Hub.
# The second positional argument of load_dataset is the *configuration name*,
# not the split, so the split is requested explicitly by keyword. This returns
# the Dataset directly instead of a DatasetDict that then has to be indexed.
dataset = load_dataset("fhirfly/medicalquestions", split='train')

In [17]:
# Peek at the first ten records, then show what kind of container holds them.
print(dataset[:10], type(dataset), sep="\n")

{'label': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'text': ['What is (are) Anal Cancer ?', 'How to prevent Anal Cancer ?', 'What is (are) Adult Central Nervous System Tumors ?', 'What is (are) Childhood Brain and Spinal Cord Tumors ?', 'What is (are) Childhood Astrocytomas ?', 'What is (are) Childhood Central Nervous System Atypical Teratoid/Rhabdoid Tumor ?', 'What is (are) Childhood Brain Stem Glioma ?', 'What is (are) Childhood Central Nervous System Embryonal Tumors ?', 'What is (are) Childhood Central Nervous System Germ Cell Tumors ?', 'What is (are) Childhood Craniopharyngioma ?']}
<class 'datasets.arrow_dataset.Dataset'>


# Train Test Split Dataset

In [18]:
# Sizes of the two subsets: 80% of the examples for training, the remainder held out.
training_size = int(0.8 * len(dataset))
testing_size = len(dataset) - training_size

# Split with a seeded generator so the same examples land in each subset on
# every run; without it the split (and every metric below) changes per run.
SPLIT_SEED = 42
training_dataset, testing_dataset = random_split(
    dataset,
    [training_size, testing_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print(f"Jumlah data pelatihan: {len(training_dataset)}")
print(f"Jumlah data validasi: {len(testing_dataset)}")


Jumlah data pelatihan: 20008
Jumlah data validasi: 5003


In [22]:
# Show the first ten training records and the wrapper type produced by the split.
print(training_dataset[:10], type(training_dataset), sep="\n")

{'label': [1, 1, 1, 0, 0, 1, 1, 1, 1, 1], 'text': ['What is (are) Ainhum ?', 'What is (are) Mantle cell lymphoma ?', 'What are the symptoms of Impairment of oral perception ?', 'what hospitals are in sydney?', 'wojciech todorow is from which nation?', 'What is (are) the active ingredient in the drug iothalamic acid?', 'What is (are) Marshall-Smith syndrome ?', 'What is (are) Primrose syndrome ?', 'How do I handle nosebleeds in children?', 'What is (are) CADASIL ?']}
<class 'torch.utils.data.dataset.Subset'>


In [35]:
# Materialise each subset once, then derive the label and text columns from the
# in-memory rows. Indexing the subset again for every column would re-read every
# record through the split wrapper three times per subset for the same values.
listTrainingDataset = list(training_dataset)
labelDatatraining = [row['label'] for row in listTrainingDataset]
textDatatraining = [row['text'] for row in listTrainingDataset]

listTestingDataset = list(testing_dataset)
labelDatatesting = [row['label'] for row in listTestingDataset]
textDatatesting = [row['text'] for row in listTestingDataset]

In [36]:
# Class-balance check: the distinct labels and the per-label counts of each subset.
uniqueDataTraining, uniqueTotalTraining = set(labelDatatraining), Counter(labelDatatraining)
print(uniqueDataTraining, uniqueTotalTraining, sep="\n")

uniqueDataTesting, uniqueTotalTesting = set(labelDatatesting), Counter(labelDatatesting)
print(uniqueDataTesting, uniqueTotalTesting, sep="\n")

{0, 1}
Counter({0: 10173, 1: 9835})
{0, 1}
Counter({0: 2581, 1: 2422})


In [37]:
# Fresh, independent containers for the text and label columns of each subset.
training_sentences, training_labels = [], []
testing_sentences, testing_labels = [], []

In [38]:
# Collect the text and label of every example in a single pass per subset.
# The lists are rebuilt from scratch here so that re-running this cell never
# appends duplicates, and so it still works after a later cell has rebound
# training_labels / testing_labels to tensors (which have no .append).
training_sentences, training_labels = [], []
for example in training_dataset:
    training_sentences.append(example.get('text'))
    training_labels.append(example.get('label'))

testing_sentences, testing_labels = [], []
for example in testing_dataset:
    testing_sentences.append(example.get('text'))
    testing_labels.append(example.get('label'))

# Preprocessing and model hyperparameters used by the cells below.
vocab_size = 10000     # Tokenizer num_words and Embedding input dimension
embedding_dim = 16     # width of each learned word vector
max_length = 120       # every sequence is padded/truncated to this many tokens
trunc_type = 'post'    # over-long sequences lose tokens from the end
oov_tok = "<OOV>"      # placeholder token for out-of-vocabulary words

In [40]:
# Sanity check on the container type obtained by slicing the training labels.
first_labels = training_labels[:10]
print(type(first_labels))

<class 'list'>


In [None]:
# Fit the tokenizer on the training text only, so the vocabulary is learned
# without looking at the held-out examples.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# Convert sentences to sequences of integer token ids
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Pad/truncate every sequence to max_length tokens
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    truncating=trunc_type
)
training_padded = tf.convert_to_tensor(training_padded)

# Use the same truncation side as the training data. Leaving it out falls back
# to pad_sequences' default ('pre'), so over-long test sentences would be cut
# from the opposite end to the training sentences.
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    truncating=trunc_type
)
testing_padded = tf.convert_to_tensor(testing_padded)

# Convert labels to numpy arrays
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

# Convert labels to tensors
training_labels = tf.convert_to_tensor(training_labels)
testing_labels = tf.convert_to_tensor(testing_labels)

# Create the model: learned word embeddings, flattened, feeding one sigmoid
# unit. The output is therefore already a probability for class 1.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# The held-out split is passed as validation data, so it is scored after each epoch.
model.fit(
    training_padded,
    training_labels,
    epochs=50,
    verbose=2,
    validation_data=(testing_padded, testing_labels)
)

# The code below saves the model as a .h5 file
# in the shared Google Drive folder given in the path.
if __name__ == '__main__':
    # DO NOT CHANGE THIS CODE
    model.save("/content/drive/Shareddrives/NLP/Model Output/model_50-epochs.h5")


In [44]:
# Score the model on the held-out split; evaluate() returns the loss followed
# by the single metric the model was compiled with.
loss, accuracy = model.evaluate(testing_padded, testing_labels)
print(f"Loss: {loss}", f"Accuracy: {accuracy}", sep="\n")


Loss: 0.03614393249154091
Accuracy: 0.9940035939216614


In [92]:
# Reload the saved model from Drive. Replace this with the correct model path if it differs.
SAVED_MODEL_PATH = '/content/drive/Shareddrives/NLP/Model Output/model_50-epochs.h5'
model = tf.keras.models.load_model(SAVED_MODEL_PATH)

In [107]:
# Preprocess an unseen question exactly like the training data: same fitted
# tokenizer, same target length and same truncation side. Omitting `truncating`
# would fall back to pad_sequences' default ('pre') and cut over-long inputs
# from the opposite end to the one used during training.
new_data = ["How does the process of heart attack?"]
new_sequences = tokenizer.texts_to_sequences(new_data)
new_padded = pad_sequences(new_sequences, maxlen=max_length, truncating=trunc_type)

In [108]:
# Run inference, then binarise the scores: anything at or above the cut-off
# counts as class 1, everything below it as class 0.
class_1_cutoff = 0.6
predictions = model.predict(new_padded)
predicted_class = (predictions >= class_1_cutoff).astype("int32")
print(predicted_class)

[[0]]


In [109]:
# Menampilkan probabilitas untuk setiap kelas
probabilities = tf.nn.sigmoid(predictions)
prob_class_1 = probabilities[0][0].numpy()  # Probabilitas kelas 1
prob_class_0 = 1 - prob_class_1  # Probabilitas kelas 0 (karena hanya ada dua kelas)

# Menampilkan hasil
print(f"Probabilitas Kelas 0: {prob_class_0}")
print(f"Probabilitas Kelas 1: {prob_class_1}")

Probabilitas Kelas 0: 0.5
Probabilitas Kelas 1: 0.5
