In [None]:
import pandas as pd
import jieba

import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus if it is not already cached locally.
# quiet=True suppresses the download log, which otherwise writes the absolute
# local nltk_data directory (including the OS user name) into the saved output.
nltk.download('stopwords', quiet=True)
# List the languages for which the corpus ships a stop-word list.
print(stopwords.fileids())

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the raw case table (path is relative to the notebook's working directory).
df = pd.read_csv('dataset/medical_cases.csv')

# Replace missing values in every column with the placeholder token '未知' ("unknown").
# This is one frame-level, non-inplace call on purpose: `df[column].fillna(..., inplace=True)`
# acts on a temporary Series, which emits a chained-assignment FutureWarning in
# pandas 2.x and leaves `df` unmodified once Copy-on-Write is enabled.
df = df.fillna('未知')

# Free-text symptom / examination columns selected for tokenization.
columns_to_tokenize = [
    '睡眠', '大便', '小便', '胃口', '渴', '手足',
    '頭身', '汗', '月經', '其他', '脈診', '望診',
    '舌診', '眼診', '特殊診斷', '耳診', '診斷',
]


#### BERT

In [None]:
from transformers import BertModel, BertTokenizer
import torch
import string
import numpy as np

In [None]:
# Disabled cell: the BERT ('bert-base-chinese') embedding-extraction code below is
# wrapped in a bare triple-quoted string, so none of it is executed; the cell only
# evaluates to (and echoes) that string.
# NOTE(review): presumably this is the code that produced 'df_embed.pkl', which a
# later cell loads -- confirm before deleting it.
"""
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# Initialize a list to store embeddings
embeddings_dict = {col: [] for col in columns_to_tokenize}

batch_size = 100

for column in columns_to_tokenize:

    texts = df[column].tolist()
    embeddings_list = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_no_punct = [text.translate(str.maketrans('', '', string.punctuation)) for text in batch_texts]
        stop_words = set(nltk.corpus.stopwords.words('chinese'))
        filtered_texts = [' '.join([word for word in nltk.word_tokenize(text) if word not in stop_words]) for text in batch_no_punct]
         # Tokenize the filtered text using BERT tokenizer
        encoded_input = tokenizer.batch_encode_plus(filtered_texts, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
        # Obtain the embeddings
        with torch.no_grad():
            outputs = model(**encoded_input)

        # Get the embeddings of the [CLS] token (the first token)
        text_embedding = outputs.last_hidden_state[:, 0, :].numpy()
        # Append the embeddings to the list
        embeddings_list.append(text_embedding)

    embeddings_dict[column] = np.concatenate(embeddings_list, axis=0)

# Create df from embedding
df_embeddings = pd.DataFrame.from_dict({key: np.array(value).tolist() for key, value in embeddings_dict.items()})
"""

"\n# Load tokenizer and model\ntokenizer = BertTokenizer.from_pretrained('bert-base-chinese')\nmodel = BertModel.from_pretrained('bert-base-chinese')\n\n# Initialize a list to store embeddings\nembeddings_dict = {col: [] for col in columns_to_tokenize}\n\nbatch_size = 100\n\nfor column in columns_to_tokenize:\n    \n    texts = df[column].tolist()\n    embeddings_list = []\n    \n    for i in range(0, len(texts), batch_size):\n        batch_texts = texts[i:i+batch_size]\n        batch_no_punct = [text.translate(str.maketrans('', '', string.punctuation)) for text in batch_texts]\n        stop_words = set(nltk.corpus.stopwords.words('chinese'))\n        filtered_texts = [' '.join([word for word in nltk.word_tokenize(text) if word not in stop_words]) for text in batch_no_punct]\n         # Tokenize the filtered text using BERT tokenizer\n        encoded_input = tokenizer.batch_encode_plus(filtered_texts, max_length=512, padding='max_length', truncation=True, return_tensors='pt')\n        

In [None]:
# Load the pickled DataFrame stored in 'df_embed.pkl' (path is relative to the
# notebook's working directory).
# NOTE(review): presumably this file holds precomputed embeddings, one vector per
# cell -- confirm where it was generated.
# Security: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
df_pkl = pd.read_pickle('df_embed.pkl')


In [None]:
def average_pooling(vector):
    """Average an embedding along its first axis.

    Parameters
    ----------
    vector : array-like or scalar
        A 1-D vector (reduced to a single scalar), an N-D array (reduced along
        axis 0), or an already-pooled scalar.

    Returns
    -------
    numpy scalar or numpy.ndarray
        The mean over axis 0. A scalar input is returned unchanged in value.
    """
    # np.atleast_1d promotes a 0-d input to shape (1,) so that axis 0 exists.
    # Without it np.mean(scalar, axis=0) raises AxisError, which made the pooling
    # step crash when it was re-applied to values that were already pooled.
    return np.mean(np.atleast_1d(vector), axis=0)

# Pool every cell of df_pkl with average_pooling, one column at a time, and
# write the pooled column back into the same frame.
for col in df_pkl.columns:
    pooled_column = df_pkl[col].apply(average_pooling)
    df_pkl[col] = pooled_column

# Preview the first rows of the pooled frame.
df_pkl.head()

Unnamed: 0,睡眠,大便,小便,胃口,渴,手足,頭身,汗,月經,其他,脈診,望診,舌診,眼診,特殊診斷,耳診,診斷
0,-0.005378,-0.005835,-0.005781,-0.004479,-0.005303,-0.005857,-0.006673,-0.006673,-0.006673,-0.004752,-0.005821,-0.006673,-0.0065,-0.005653,-0.006673,-0.004679,-0.005885
1,-0.005988,-0.005155,-0.004955,-0.006673,-0.006673,-0.00562,-0.006673,-0.006673,-0.006673,-0.005058,-0.006673,-0.006673,-0.004583,-0.006749,-0.006673,-0.006673,-0.005185
2,-0.007593,-0.005061,-0.005891,-0.005214,-0.004871,-0.006159,-0.006673,-0.005673,-0.006673,-0.00568,-0.00321,-0.006673,-0.004583,-0.006673,-0.006673,-0.006673,-0.004606
3,-0.006411,-0.006021,-0.005456,-0.006673,-0.005069,-0.004503,-0.006673,-0.006673,-0.006673,-0.006537,-0.004993,-0.006673,-0.007379,-0.006673,-0.006673,-0.006673,-0.00635
4,-0.005533,-0.005675,-0.004985,-0.004843,-0.00449,-0.006673,-0.005614,-0.004302,-0.006673,-0.004772,-0.004229,-0.006673,-0.006337,-0.005798,-0.006673,-0.006673,-0.00661


Min-max scaling using sklearn

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-max scaling (not z-score): each feature column of df_pkl is rescaled
# independently to MinMaxScaler's default [0, 1] range. StandardScaler is
# imported but not used in this cell.
# NOTE(review): the scaler is fitted on all rows before the train/test split made
# in later cells, so test-set minima/maxima influence the training features; fit
# on the training rows only if an unbiased evaluation is required.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df_pkl)

# Wrap the scaled values back into a DataFrame with the original column names
# (no index is passed, so the new frame gets a default RangeIndex).
df_scaled = pd.DataFrame(scaled_data, columns=df_pkl.columns)

df_scaled.head()

Unnamed: 0,睡眠,大便,小便,胃口,渴,手足,頭身,汗,月經,其他,脈診,望診,舌診,眼診,特殊診斷,耳診,診斷
0,0.516212,0.534519,0.336267,0.766479,0.516675,0.545134,0.273016,0.318367,0.18294,0.63924,0.351006,0.032707,0.347733,0.535022,0.141022,0.963181,0.408464
1,0.403403,0.674728,0.533405,0.124094,0.186676,0.590124,0.273016,0.318367,0.18294,0.588007,0.181971,0.032707,0.771303,0.317358,0.141022,0.30832,0.515476
2,0.106451,0.693951,0.309842,0.551085,0.620606,0.487817,0.273016,0.553165,0.18294,0.483565,0.869128,0.032707,0.771303,0.332448,0.141022,0.30832,0.603997
3,0.325122,0.496165,0.413697,0.124094,0.572864,0.80185,0.273016,0.318367,0.18294,0.339882,0.515369,0.032707,0.153719,0.332448,0.141022,0.30832,0.337405
4,0.48748,0.567533,0.526346,0.65981,0.712527,0.390382,0.483618,0.875251,0.18294,0.635907,0.666906,0.032707,0.383889,0.506147,0.141022,0.30832,0.29757


Z-score normalization using scipy.stats (currently commented out)

In [None]:
# from scipy.stats import zscore

# df_pkl_z = pd.DataFrame()
# for col in df_pkl.columns:
#     df_pkl_z[col] = zscore(df_pkl[col])

# df_pkl_z.head()

### Naive Bayes

In [None]:
# Disabled cell: this Gaussian Naive Bayes variant is wrapped in a bare
# triple-quoted string, so none of it is executed; the cell only evaluates to
# (and echoes) that string. It trains on `df_pkl_z`, which in the visible notebook
# is only created inside commented-out code, and reports 'weighted'-averaged
# precision / recall / F1.
"""
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
df = pd.read_csv('dataset/medical_cases.csv')
y_label = df['白芍']
# Assuming y_label is your target labels for classification
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_pkl_z, y_label, test_size=0.2, random_state=42)

# Initialize and fit the Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)


# Predict on the test data
predictions = nb_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
conf_matrix = confusion_matrix(y_test, predictions)

# Print or use the evaluation metrics as needed
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix: {conf_matrix}")
"""

'\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\ndf = pd.read_csv(\'dataset/medical_cases.csv\')\ny_label = df[\'白芍\']\n# Assuming y_label is your target labels for classification\n# Split the data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(df_pkl_z, y_label, test_size=0.2, random_state=42)\n\n# Initialize and fit the Naive Bayes classifier\nnb_classifier = GaussianNB()\nnb_classifier.fit(X_train, y_train)\n\n\n# Predict on the test data\npredictions = nb_classifier.predict(X_test)\n\n# Evaluate the classifier\naccuracy = accuracy_score(y_test, predictions)\nprecision = precision_score(y_test, predictions, average=\'weighted\')\nrecall = recall_score(y_test, predictions, average=\'weighted\')\nf1 = f1_score(y_test, predictions, average=\'weighted\')\nconf_matrix = confusion_matrix(y_test, pred

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Re-read the raw case table and use its '白芍' column as the classification target.
df = pd.read_csv('dataset/medical_cases.csv')
y_label = df['白芍']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled,
    y_label,
    test_size=0.2,
    random_state=42,
)

# Fit Gaussian Naive Bayes on the training rows, then label the held-out rows.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
predictions = nb_classifier.predict(X_test)

# Score the held-out predictions; precision, recall and F1 use scikit-learn's
# default averaging.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

# Report every metric on its own line.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("Confusion Matrix", conf_matrix),
):
    print(f"{metric_label}: {metric_value}")

Accuracy: 0.6
Precision: 0.5975609756097561
Recall: 0.6125
F1 Score: 0.6049382716049384
Confusion Matrix: [[47 33]
 [31 49]]


### Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Hand Keras plain NumPy arrays instead of pandas objects: with validation_split,
# Keras slices its inputs positionally (t[start:end]), which triggers a pandas
# warning when applied to an integer-indexed Series/DataFrame.
X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()
X_test_np, y_test_np = X_test.to_numpy(), y_test.to_numpy()

# Fully connected binary classifier: 128 -> 64 -> 32 ReLU units with dropout
# after the first two hidden layers, and a single sigmoid output unit.
model = Sequential()
model.add(Flatten(input_shape=(X_train_np.shape[1],)))  # Input layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Probability of the positive class

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; the validation_split fraction of the training rows is held
# out for validation.
history = model.fit(X_train_np, y_train_np, epochs=20, batch_size=32, validation_split=0.1)

# Evaluate the model on test data with Keras' own accuracy metric
loss, accuracy = model.evaluate(X_test_np, y_test_np)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Cross-check with scikit-learn: threshold the predicted probabilities at 0.5.
y_pred = model.predict(X_test_np)
y_pred_classes = (y_pred > 0.5).astype("int32")  # Convert probabilities to classes
# model.predict returns shape (n, 1); flatten it to match the 1-D y_test.
accuracy = accuracy_score(y_test_np, y_pred_classes.ravel())

print(f"Test Accuracy: {accuracy}")




Epoch 1/20


  return t[start:end]




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 51.25%
Test Accuracy: 0.5125


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Features are the min-max scaled frame df_scaled and the target is y_label,
# both created in earlier cells.

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df_scaled, y_label, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators and other hyperparameters

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predict on the test data
predictions = rf_classifier.predict(X_test)

# Calculate metrics. Precision, recall and F1 use scikit-learn's default binary
# averaging (positive class only), the same setting as the Gaussian Naive Bayes
# evaluation, so the two models' numbers are directly comparable. With
# average='weighted', recall is always identical to accuracy and adds nothing.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

# Print or use the evaluation metrics as needed
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix: {conf_matrix}")


Accuracy: 0.58125
Precision: 0.591694586492682
Recall: 0.58125
F1 Score: 0.5689759157251416
Confusion Matrix: [[33 47]
 [20 60]]
