In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
import pickle

# Load the cleaned dataset
data = pd.read_excel(r'Cleaned_MODEL_DATA.xlsx', sheet_name='Sheet1')

# Define phenotype and genotype columns
antibiotic_columns = [
    'Amikacin_I', 'Amoxycillin clavulanate_I', 'Ampicillin_I', 'Cefepime_I', 'Ceftazidime_I', 'Ceftriaxone_I',
    'Imipenem_I', 'Levofloxacin_I', 'Meropenem_I', 'Minocycline_I', 'Piperacillin tazobactam_I', 'Tigecycline_I',
    'Ampicillin sulbactam_I', 'Aztreonam_I', 'Ceftaroline_I', 'Ceftazidime avibactam_I', 'Ciprofloxacin_I',
    'Colistin_I', 'Doripenem_I', 'Ertapenem_I', 'Gentamicin_I', 'Trimethoprim sulfa_I', 'Ceftolozane tazobactam_I',
    'Meropenem vaborbactam_I'
]
phenotype_columns = ['Source', 'Family', 'Species'] + antibiotic_columns
genotype_columns = [
    'AMPC', 'SHV', 'TEM', 'CTXM1', 'CTXM2', 'CTXM825', 'CTXM9', 'VEB', 'PER', 'GES', 'ACC', 'CMY1MOX', 'CMY11',
    'DHA', 'FOX', 'ACTMIR', 'KPC', 'OXA', 'NDM', 'IMP'
]

# Probabilistically fill missing values in antibiotic columns
for col in antibiotic_columns:
    probabilities = data[col].value_counts(normalize=True)
    if len(probabilities) > 0:
        data[col] = data[col].apply(lambda x: np.random.choice(probabilities.index, p=probabilities.values) if pd.isna(x) else x)

# Encode categorical features (Source, Family, Species)
label_encoders = {}
categorical_columns = ['Source', 'Family', 'Species']
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Save the label encoders to disk
for col, encoder in label_encoders.items():
    with open(f"{col}_encoder.pkl", 'wb') as f:
        pickle.dump(encoder, f)

# Encode antibiotic susceptibility status (Susceptible, Intermediate, Resistant)
antibiotic_mapping = {'Susceptible': 0, 'Intermediate': 1, 'Resistant': 2}
for col in antibiotic_columns:
    data[col] = data[col].map(antibiotic_mapping)

# Fill missing values in genotype columns probabilistically
for col in genotype_columns:
    probabilities = data[col].value_counts(normalize=True)
    if len(probabilities) > 0:
        data[col] = data[col].apply(lambda x: np.random.choice(probabilities.index, p=probabilities.values) if pd.isna(x) else x)

# Process genotype columns: replace gene names with 'Positive' and 'NEG' with 'Negative'
for col in genotype_columns:
    data[col] = data[col].apply(lambda x: 'Positive' if x != 'NEG' else 'Negative')

# Encode genotype columns
genotype_mapping = {'Negative': 0, 'Positive': 1}
for col in genotype_columns:
    data[col] = data[col].map(genotype_mapping)

# Split the data into features and target
X = data[phenotype_columns]
y = data[genotype_columns]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the StandardScaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Save the fitted scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Build the model
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(y_train.shape[1], activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model
X_test_scaled = scaler.transform(X_test)
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Model Loss: {loss}')
print(f'Model Accuracy: {accuracy}')

# Save the model
model.save('model6.h5')

# Function for predicting genotype
def predict_genotype(source, family, species, antibiotics):
    # Create a DataFrame for input data
    input_data = pd.DataFrame({
        'Source': [source],
        'Family': [family],
        'Species': [species]
    })

    # Add antibiotic columns
    for i, antibiotic in enumerate(antibiotic_columns):
        input_data[antibiotic] = [antibiotics[i]]

    # Apply label encoding
    for col in categorical_columns:
        input_data[col] = label_encoders[col].transform(input_data[col])

    # Apply the scaler to the input data
    input_data_scaled = scaler.transform(input_data)

    # Predict using the trained model
    prediction = model.predict(input_data_scaled)

    return np.round(prediction).astype(int)

# Example usage
source = 'Urine'
family = 'Enterobacteriaceae'
species = 'Klebsiella pneumoniae'
antibiotics = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
predicted_genotype = predict_genotype(source, family, species, antibiotics)
print(f'Predicted Genotype: {predicted_genotype}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.2267 - loss: 0.2895 - val_accuracy: 0.3199 - val_loss: 0.1953
Epoch 2/100
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.3353 - loss: 0.2087 - val_accuracy: 0.3631 - val_loss: 0.1835
Epoch 3/100
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.3868 - loss: 0.1979 - val_accuracy: 0.4720 - val_loss: 0.1768
Epoch 4/100
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4434 - loss: 0.1905 - val_accuracy: 0.5222 - val_loss: 0.1739
Epoch 5/100
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4686 - loss: 0.1865 - val_accuracy: 0.5269 - val_loss: 0.1726
Epoch 6/100
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4670 - loss: 0.1826 - val_accuracy: 0.5491 - val_loss: 0.1705
Epoch 7/10



Model Loss: 0.16672861576080322
Model Accuracy: 0.44679298996925354
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Predicted Genotype: [[0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]
