In [13]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import sklearn
import joblib
import time
from string import punctuation

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score, plot_roc_curve, make_scorer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight
import lightgbm as lgb
import nltk
from nltk.corpus import stopwords

tqdm.pandas()

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt

In [15]:
def model_knn():
    """
    Function for initiating Logistic Regression Model
    """

    base_model = KNeighborsClassifier()

    
    return base_model

def model_dt():
    """
    Function for initiating Random Forest Model
    """
    
    base_model = DecisionTreeClassifier(random_state=42)
    
    return base_model

def model_mlp():
    """
    Function for initiating LightGBM Model
    """

    base_model = MLPClassifier(random_state=42, max_iter=300)
    
    return base_model

In [16]:

def select_model(train_log_dict):
    max_score = max(train_log_dict['model_score'])
    max_index = train_log_dict['model_score'].index(max_score)
    best_model = train_log_dict['model_fit'][max_index]
    name = train_log_dict['model_name'][max_index]

    return best_model, name


In [23]:
def fit(X_train, Y_train, model):

    model_fitted = model.fit(X_train, Y_train)
    return model_fitted

def validate(X_test, Y_test, model_fitted, ):

    auc_score = roc_auc_score(Y_test, model_fitted.predict_proba(X_test))


    return auc_score

In [24]:
def main(X_train, X_test, Y_train,  Y_test):
    

    # Initiate models
    knn = model_knn
    dt = model_dt
    mlp = model_mlp
    
    # Initiate logs
    train_log_dict = {'model': [knn, dt, mlp],
                      'model_name': [],
                      'model_score': []}


    # Try Each models
    for model in train_log_dict['model']:
        base_model = model()
        train_log_dict['model_name'].append(base_model.__class__.__name__)
        print(f'Fitting {base_model.__class__.__name__}')

        # Train
        fitted_model = fit(X_train, Y_train, base_model)

        # Validate
        score = validate(X_test, Y_test, fitted_model)
        train_log_dict['model_score'].append(score)

    best_model, best_report, best_threshold, name = select_model(train_log_dict)
    
    print(
        f"Model: {name}, Score: {best_report['f1-score']['macro avg']}")
    joblib.dump(best_model, 'E:\\projects\\plds_latihan\\pipeline\\mantab_model.pkl')
    joblib.dump(best_threshold, 'E:\\projects\\plds_latihan\\pipeline\\threshold.pkl')
    joblib.dump(train_log_dict, 'E:\\projects\\plds_latihan\\pipeline\\train_log.pkl')
    print(f'\n {best_report}')
    
    return best_model


In [25]:
def load_dataset():

    X_train = joblib.load("E:\\projects\\plds_teamc\\data\\X_train.pkl")
    X_test = joblib.load("E:\\projects\\plds_teamc\\data\\X_test.pkl")
    Y_train = joblib.load("E:\\projects\\plds_teamc\\data\\Y_train.pkl")
    Y_test = joblib.load("E:\\projects\\plds_teamc\\data\\Y_test.pkl")

    return X_train, X_test, Y_train,  Y_test

In [194]:
X_train, X_test, Y_train,  Y_test= load_dataset()


In [70]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [195]:
# Build VT-CNN2 Neural Net model using Keras primitives -- 
#  - Reshape [N,2,128] to [N,1,2,128] on input
#  - Pass through 2 2DConv/ReLu layers
#  - Pass through 2 Dense layers (ReLu and Softmax)
#  - Perform categorical cross entropy optimization
import tensorflow.keras.models as models
import tensorflow.keras.callbacks
from tensorflow.keras.layers import BatchNormalization, Dropout

from tensorflow.keras.layers import PReLU
from tensorflow.keras.layers import MaxPooling2D 
from tensorflow.keras.initializers import Constant

model = models.Sequential()
#model.add(Reshape([1]+in_shp, input_shape=in_shp))
model.add(Dense(32, name="dense1", kernel_initializer="he_normal"))
model.add(Dense(64, name="dense2", kernel_initializer="he_normal"))
model.add(Dropout(0.1))
model.add(Dense(128, name="dense3", kernel_initializer="he_normal"))
model.add(Dense(256, name="dense4", kernel_initializer="he_normal"))
model.add(BatchNormalization())
model.add(Dense(512, name="dense5", kernel_initializer="he_normal"))
model.add(Dropout(0.2))
model.add(Dense(256, name="dense6", kernel_initializer="he_normal"))
model.add(Dense(128, name="dense7", kernel_initializer="he_normal"))
model.add(Dropout(0.1))
model.add(Dense(64, name="dense8", kernel_initializer="he_normal"))
model.add(Dense(32, name="dense9", kernel_initializer="he_normal"))
model.add(Dense(1, name="dense10", kernel_initializer="he_normal", activation = 'sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])




In [196]:
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
Y_train = np.asarray(Y_train).astype(np.float32)
Y_test = np.asarray(Y_test).astype(np.float32)

In [197]:
nb_epoch = 50     # number of epochs to train on
batch_size = 512 # training batch size
filepath = 'E:\\projects\\plds_teamc\\data\\HARRY.h5'

history = model.fit(X_train,
    Y_train,
    batch_size=batch_size,
    epochs=nb_epoch,
    verbose=2,
    validation_data=(X_test,Y_test),
    shuffle=True)


# convert the history.history dict to a pandas DataFrame:   
# #https://stackoverflow.com/questions/41061457/keras-how-to-save-the-training-history-attribute-of-the-history-object  
hist_df = pd.DataFrame(history.history) 
hist_csv_file = 'E:\\projects\\plds_teamc\\data\\history.csv'
with open(hist_csv_file, mode='w') as f:
    hist_df.to_csv(f)

Epoch 1/50
456/456 - 9s - loss: 0.6545 - accuracy: 0.7230 - val_loss: 0.5147 - val_accuracy: 0.7402
Epoch 2/50
456/456 - 9s - loss: 0.5175 - accuracy: 0.7394 - val_loss: 0.5109 - val_accuracy: 0.7460
Epoch 3/50
456/456 - 9s - loss: 0.5154 - accuracy: 0.7401 - val_loss: 0.5096 - val_accuracy: 0.7440
Epoch 4/50
456/456 - 9s - loss: 0.5148 - accuracy: 0.7402 - val_loss: 0.5094 - val_accuracy: 0.7388
Epoch 5/50
456/456 - 8s - loss: 0.5138 - accuracy: 0.7410 - val_loss: 0.5105 - val_accuracy: 0.7464
Epoch 6/50
456/456 - 8s - loss: 0.5134 - accuracy: 0.7409 - val_loss: 0.5105 - val_accuracy: 0.7461
Epoch 7/50
456/456 - 8s - loss: 0.5133 - accuracy: 0.7409 - val_loss: 0.5111 - val_accuracy: 0.7470
Epoch 8/50
456/456 - 9s - loss: 0.5128 - accuracy: 0.7414 - val_loss: 0.5104 - val_accuracy: 0.7453
Epoch 9/50
456/456 - 9s - loss: 0.5126 - accuracy: 0.7415 - val_loss: 0.5088 - val_accuracy: 0.7449
Epoch 10/50
456/456 - 9s - loss: 0.5125 - accuracy: 0.7412 - val_loss: 0.5097 - val_accuracy: 0.7369

In [201]:
acc_threshold = hist_df["accuracy"].mean()
joblib.dump(acc_threshold, 'E:\\projects\\plds_teamc\\data\\acc_threshold.pkl')

['E:\\projects\\plds_teamc\\data\\acc_threshold.pkl']

In [65]:
base_model = KNeighborsClassifier(n_neighbors=50)
#base_model = DecisionTreeClassifier(random_state=42)

#base_model = MLPClassifier(hidden_layer_sizes=(100,300,150), random_state=42, max_iter=300)


model_fitted = base_model.fit(X_train, Y_train)

#auc_score = roc_auc_score(Y_test, model_fitted.predict(X_test))
auc_score = roc_auc_score(Y_test, model_fitted.predict(X_test))



In [186]:
X_train, X_test, Y_train,  Y_test= load_dataset()
text=X_train.iloc[6786]
text

Patient Age at Treatment                                         2
Total Number of Previous cycles, Both IVF and DI                 2
Total number of IVF pregnancies                                  0
Total number of live births - conceived through IVF              0
Type of Infertility - Female Primary                             0
Type of Infertility - Female Secondary                           0
Type of Infertility - Male Primary                               0
Type of Infertility - Male Secondary                             0
Type of Infertility -Couple Primary                              0
Type of Infertility -Couple Secondary                            0
Cause  of Infertility - Tubal disease                            0
Cause of Infertility - Ovulatory Disorder                        0
Cause of Infertility - Male Factor                               0
Cause of Infertility - Patient Unexplained                       0
Cause of Infertility - Endometriosis                          

In [202]:
#joblib.dump(model, 'E:\\projects\\plds_teamc\\data\\mantab_model.pkl')
model.save('E:\\projects\\plds_teamc\\data\\')

INFO:tensorflow:Assets written to: E:\projects\plds_teamc\data\assets


In [208]:
def main_predict(text, model, threshold):
    text = text[[text.columns[1]]]
    text = np.asarray(text).astype(np.float32).T
    text = text.reshape(-1, 30)
    code2rel = {0: 'Tidak berhasil', 1: 'Berhasil'}
    
    proba = model.predict(text)
    predict = 1 if proba > threshold else 0
    print(f"{code2rel[predict]}, dengan akurasi {proba}")

    return code2rel[predict], proba


    

In [204]:
model = keras.models.load_model('E:\\projects\\plds_teamc\\data\\')
threshold = joblib.load('E:\\projects\\plds_teamc\\data\\acc_threshold.pkl')

In [207]:
text = pd.read_csv("E:\\projects\\plds_teamc\\data\\sample.txt") 
#text = np.asarray(text).astype(np.float32)


In [209]:

proba = main_predict(text, model, threshold)

Tidak berhasil, dengan akurasi [[0.24795133]]
