In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPooling1D
from sklearn.preprocessing import LabelEncoder
import joblib
import tensorflow as tf

# Set random seeds for reproducibility.
# np.random.seed alone only fixes NumPy's global generator (used for the
# synthetic data below). The Keras model draws its initial weights and shuffles
# batches from the Python / TensorFlow generators, so those are seeded as well;
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call.
np.random.seed(42)
tf.keras.utils.set_random_seed(42)

# Read the genetic / clinical spreadsheet into a DataFrame
data = pd.read_excel("C:\\Users\\91934\\Desktop\\Aug Hackfest\\Genetic and clinical data.xlsx")

# Integer-encode every categorical column in turn. One encoder instance is
# refit for each column, so afterwards it only holds the classes of the last one.
label_encoder = LabelEncoder()
for column in ('genetic_changes_brca', 'genetic_changes_rad51', 'sub_type', 'stage', 'family_history'):
    data[column] = label_encoder.fit_transform(data[column])

# Feature engineering: row-wise mean and standard deviation across the numeric
# columns. The column list is captured first, so both statistics are computed
# over the same columns and neither includes the other.
numeric_columns = data.select_dtypes(include=np.number).columns
numeric_values = data[numeric_columns]
data['mean_feature'] = numeric_values.mean(axis=1)
data['std_feature'] = numeric_values.std(axis=1)

# Feature matrix (encoded categoricals + the two summary features) and binary target
feature_columns = [
    'genetic_changes_brca',
    'genetic_changes_rad51',
    'sub_type',
    'stage',
    'family_history',
    'mean_feature',
    'std_feature',
]
X = data[feature_columns]
y = data['outlier'].replace({"Yes": 1, "No": 0})

# Hold out 30% of the rows as a test set (fixed seed for a stable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Size of the synthetic two-class data set
n_samples = 1000
n_features = 7

# Each class receives half of the samples
samples_per_class = n_samples // 2

# Class 0: every feature drawn uniformly from [0, 0.5)
X_random_0 = 0.5 * np.random.rand(samples_per_class, n_features)
y_random_0 = np.zeros(samples_per_class)

# Class 1: every feature drawn uniformly from [0.25, 0.75), overlapping class 0
X_random_1 = 0.25 + 0.5 * np.random.rand(samples_per_class, n_features)
y_random_1 = np.ones(samples_per_class)

# Stack class 0 on top of class 1 to form the full feature matrix and label vector
X_random = np.vstack((X_random_0, X_random_1))
y_random = np.hstack((y_random_0, y_random_1))

# Hold out 30% of the synthetic samples for testing (fixed seed for a stable split)
X_train_random, X_test_random, y_train_random, y_test_random = train_test_split(
    X_random, y_random, test_size=0.3, random_state=42
)

# Decision tree tuning: 3-fold grid search over tree depth and minimum split size
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=dt_param_grid,
    cv=3,
)
dt_grid_search.fit(X_train_random, y_train_random)

# Keep the best estimator found by the search
dt_model_random = dt_grid_search.best_estimator_

# Train a small 1-D CNN on the synthetic data. The architecture and training
# settings are fixed; no hyperparameter search is performed for the network.

# Conv1D consumes 3-D input, so add a trailing axis of size 1 to each sample:
# (samples, n_features) -> (samples, n_features, 1)
X_train_random_reshaped = X_train_random.reshape((X_train_random.shape[0], X_train_random.shape[1], 1))
X_test_random_reshaped = X_test_random.reshape((X_test_random.shape[0], X_test_random.shape[1], 1))

# The input shape is declared with an explicit Input object rather than the
# `input_shape=` layer argument, which recent Keras versions warn about.
cnn_model_random = Sequential([
    tf.keras.Input(shape=(X_train_random.shape[1], 1)),
    Conv1D(32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(1, activation='sigmoid'),  # single sigmoid unit -> probability of class 1
])

cnn_model_random.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE: the test split doubles as the validation set here, so the val_* metrics
# printed during training are measured on the same samples evaluated afterwards.
cnn_model_random.fit(
    X_train_random_reshaped,
    y_train_random,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_random_reshaped, y_test_random),
)

# Score the tuned decision tree on the held-out synthetic samples
dt_accuracy_random = dt_model_random.score(X_test_random, y_test_random)

# Score the CNN on the same samples; the result unpacks into loss and accuracy
cnn_metrics_random = cnn_model_random.evaluate(X_test_random_reshaped, y_test_random)
cnn_loss_random, cnn_accuracy_random = cnn_metrics_random

# Persist both fitted models to the working directory
joblib.dump(dt_model_random, 'dt_model_random.joblib')
cnn_model_random.save('cnn_model_random.h5')

# Integration of decision tree and CNN for prediction
def predict_outlier_ensemble(input_data):
    """Predict the outlier label (0 or 1) for one sample with the DT + CNN ensemble.

    Both models are loaded from 'dt_model_random.joblib' and
    'cnn_model_random.h5' on every call. Their class-1 probabilities are
    averaged (soft voting) and the sample is labelled 1 when the average
    reaches 0.5.

    Averaging the two hard 0/1 votes and truncating with int() would turn
    every disagreement (mean 0.5) into 0, so probabilities are combined
    instead of votes.

    Args:
        input_data: array-like holding a single sample, shaped
            (1, n_features) or (n_features,).

    Returns:
        int: 1 if the averaged probability of class 1 is >= 0.5, otherwise 0.

    Raises:
        ValueError: if input_data does not contain exactly one sample.
    """
    sample = np.asarray(input_data)
    if sample.ndim == 1:
        # Accept a bare feature vector as a single-row batch.
        sample = sample.reshape(1, -1)
    if sample.ndim != 2 or sample.shape[0] != 1:
        raise ValueError(
            f"input_data must contain exactly one sample, got shape {sample.shape}"
        )

    dt_model = joblib.load('dt_model_random.joblib')
    cnn_model = tf.keras.models.load_model('cnn_model_random.h5')

    # Probability the tree assigns to class 1. predict_proba columns follow
    # classes_, so look the column up; a tree that never saw class 1 gives 0.
    # NOTE: a tree with pure leaves returns exactly 0 or 1 and then dominates
    # the average.
    class_probabilities = dt_model.predict_proba(sample)[0]
    positive_index = np.flatnonzero(dt_model.classes_ == 1)
    dt_probability = float(class_probabilities[positive_index[0]]) if positive_index.size else 0.0

    # The CNN expects (batch, n_features, 1); its output is read as P(class 1).
    cnn_input = sample.reshape(1, sample.shape[1], 1)
    cnn_probability = float(cnn_model.predict(cnn_input)[0][0])

    ensemble_probability = (dt_probability + cnn_probability) / 2
    return int(ensemble_probability >= 0.5)

# Example prediction using ensemble
# Draw one random sample whose features all lie in the range [0.25, 0.75)
input_data_example = 0.25 + 0.5 * np.random.rand(1, n_features)
ensemble_prediction_example = predict_outlier_ensemble(input_data_example)

# Print the result in bold using ANSI escape codes
print(f"\033[1mEnsemble Prediction among Decision Tree and CNN: {ensemble_prediction_example}\033[0m")


Epoch 1/10


  super().__init__(


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5206 - loss: 0.7051 - val_accuracy: 0.5600 - val_loss: 0.6888
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5535 - loss: 0.6812 - val_accuracy: 0.5000 - val_loss: 0.6714
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5176 - loss: 0.6620 - val_accuracy: 0.5200 - val_loss: 0.6540
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5565 - loss: 0.6461 - val_accuracy: 0.5833 - val_loss: 0.6328
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6287 - loss: 0.6224 - val_accuracy: 0.6867 - val_loss: 0.6070
Epoch 6/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7196 - loss: 0.5991 - val_accuracy: 0.7667 - val_loss: 0.5767
Epoch 7/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1mEnsemble Prediction among Decision Tree and CNN: 1[0m
