In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import pickle  # Import pickle for saving objects

# Load the cleaned dataset
data = pd.read_excel(r'Cleaned_MODEL_DATA.xlsx', sheet_name='Sheet1')

# Display the first few rows of the DataFrame to get an overview
print("First few rows of the DataFrame:")
print(data.head())

# Display the column names and data types
print("\nColumn names and data types:")
print(data.dtypes)

# Get the shape of the DataFrame (number of rows and columns)
print("\nShape of the DataFrame (rows, columns):")
print(data.shape)

# Get a summary of the DataFrame
print("\nSummary statistics of the DataFrame:")
print(data.describe(include='all'))

# Check for missing values
print("\nMissing values in each column:")
print(data.isnull().sum())

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout



# Load the cleaned dataset
data = pd.read_excel(r'Cleaned_MODEL_DATA.xlsx', sheet_name='Sheet1')


# Define phenotype and genotype columns
antibiotic_columns = [
    'Amikacin_I', 'Amoxycillin clavulanate_I', 'Ampicillin_I', 'Cefepime_I', 'Ceftazidime_I', 'Ceftriaxone_I',
    'Imipenem_I', 'Levofloxacin_I', 'Meropenem_I', 'Minocycline_I', 'Piperacillin tazobactam_I', 'Tigecycline_I',
    'Ampicillin sulbactam_I', 'Aztreonam_I', 'Ceftaroline_I', 'Ceftazidime avibactam_I', 'Ciprofloxacin_I',
    'Colistin_I', 'Doripenem_I', 'Ertapenem_I', 'Gentamicin_I', 'Trimethoprim sulfa_I', 'Ceftolozane tazobactam_I',
    'Meropenem vaborbactam_I'
]
phenotype_columns = ['Source', 'Family', 'Species'] + antibiotic_columns
genotype_columns = [
    'AMPC', 'SHV', 'TEM', 'CTXM1', 'CTXM2', 'CTXM825', 'CTXM9', 'VEB', 'PER', 'GES', 'ACC', 'CMY1MOX', 'CMY11',
    'DHA', 'FOX', 'ACTMIR', 'KPC', 'OXA', 'NDM', 'IMP'
]


# Probabilistically fill missing values in antibiotic columns
for col in antibiotic_columns:
    probabilities = data[col].value_counts(normalize=True)
    if len(probabilities) > 0:
        data[col] = data[col].apply(lambda x: np.random.choice(probabilities.index, p=probabilities.values) if pd.isna(x) else x)



# Encode categorical features (Source, Family, Species)
label_encoders = {}
categorical_columns = ['Source', 'Family', 'Species']
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Save the label encoders to disk
for col, encoder in label_encoders.items():
    with open(f"{col}_encoder.pkl", 'wb') as f:
        pickle.dump(encoder, f)


# Encode antibiotic susceptibility status (Susceptible, Intermediate, Resistant)
antibiotic_mapping = {'Susceptible': 0, 'Intermediate': 1, 'Resistant': 2}
for col in antibiotic_columns:
    data[col] = data[col].map(antibiotic_mapping)

# Fill missing values in genotype columns probabilistically
for col in genotype_columns:
    probabilities = data[col].value_counts(normalize=True)
    if len(probabilities) > 0:
        data[col] = data[col].apply(lambda x: np.random.choice(probabilities.index, p=probabilities.values) if pd.isna(x) else x)


# Process genotype columns: replace gene names with 'Positive' and 'NEG' with 'Negative'
for col in genotype_columns:
    data[col] = data[col].apply(lambda x: 'Positive' if x != 'NEG' else 'Negative')

# Encode genotype columns
genotype_mapping = {'Negative': 0, 'Positive': 1}
for col in genotype_columns:
    data[col] = data[col].map(genotype_mapping)


# Split the data into features and target
X = data[phenotype_columns]
y = data[genotype_columns]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the StandardScaler on the training data
scaler = StandardScaler()
scaler.fit(X_train)  # Fit the scaler on the training data

# Save the fitted scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


# Save the feature names for later use
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(X_train.columns.tolist(), f)

# Build the model
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(y_train.shape[1], activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Model Loss: {loss}')
print(f'Model Accuracy: {accuracy}')

# Save the model
model.save('model6.h5')



def predict_genotype(source, family, species, antibiotics):
    # Create a dictionary for the input data to ensure proper alignment
    input_data = pd.DataFrame({
        'Source': [source],
        'Family': [family],
        'Species': [species]
    })
    # Add antibiotic columns
    for i, antibiotic in enumerate(antibiotic_columns):
        input_data[antibiotic] = [antibiotics[i]]

    for col in categorical_columns:
        input_data[col] = label_encoders[col].transform(input_data[col])
    prediction = model.predict(input_data)
    return np.round(prediction).astype(int)

# Example usage
source = 'Urine'
family = 'Enterobacteriaceae'
species = 'Klebsiella pneumoniae'
antibiotics = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
predicted_genotype = predict_genotype(source, family, species, antibiotics)
print(f'Predicted Genotype: {predicted_genotype}')




First few rows of the DataFrame:
   Isolate Id Source              Family                Species   Amikacin_I  \
0     1000012  Urine  Enterobacteriaceae       Escherichia coli  Susceptible   
1     1000077  Urine  Enterobacteriaceae  Klebsiella pneumoniae    Resistant   
2     1000312  Urine  Enterobacteriaceae       Escherichia coli  Susceptible   
3     1000315  Blood  Enterobacteriaceae       Escherichia coli  Susceptible   
4     1000316  Urine  Enterobacteriaceae       Escherichia coli  Susceptible   

  Amoxycillin clavulanate_I Ampicillin_I    Cefepime_I Ceftazidime_I  \
0               Susceptible    Resistant     Resistant   Susceptible   
1              Intermediate    Resistant     Resistant     Resistant   
2              Intermediate    Resistant     Resistant     Resistant   
3                 Resistant    Resistant  Intermediate   Susceptible   
4              Intermediate    Resistant     Resistant     Resistant   

  Ceftriaxone_I  ...  ACC CMY1MOX CMY11  DHA  FOX ACT

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1372/1372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.1915 - loss: 0.4199 - val_accuracy: 0.2869 - val_loss: 0.2250
Epoch 2/100
[1m1372/1372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.2731 - loss: 0.2363 - val_accuracy: 0.3014 - val_loss: 0.2098
Epoch 3/100
[1m1372/1372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.3332 - loss: 0.2193 - val_accuracy: 0.3140 - val_loss: 0.1977
Epoch 4/100
[1m1372/1372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.3702 - loss: 0.2088 - val_accuracy: 0.3600 - val_loss: 0.1881
Epoch 5/100
[1m1372/1372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.3982 - loss: 0.2015 - val_accuracy: 0.3443 - val_loss: 0.1850
Epoch 6/100
[1m1372/1372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4097 - loss: 0.1971 - val_accuracy: 0.3826 - val_loss: 0.1824
Epoch 7/10



Model Loss: 0.16992084681987762
Model Accuracy: 0.47631195187568665
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
Predicted Genotype: [[0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]
