In [6]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from rdkit.Chem import AllChem

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.model_selection import RandomizedSearchCV

import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
#from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import accuracy_score
 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler


## Retrieving Data

In [7]:
def search_target(protein_name: str) -> pd.DataFrame:
    """
    Search for target proteins in the ChEMBL database by protein name.

    Parameters:
    - protein_name (str): The name of the protein target to search for (e.g., 'coronavirus').

    Returns:
    - pd.DataFrame: The matching ChEMBL targets whose 'organism' is 'Homo sapiens',
      re-indexed from 0. An empty DataFrame is returned when the search yields
      no usable rows.
    """
    # Query the ChEMBL target endpoint and tabulate the hits
    target_query = new_client.target.search(protein_name)
    targets = pd.DataFrame.from_dict(target_query)

    # A search without hits produces a frame with no columns, so filtering on
    # 'organism' would raise KeyError; report "nothing found" instead.
    if 'organism' not in targets.columns:
        return pd.DataFrame()

    # Keep human proteins only. The index is rebuilt so that row labels and
    # row positions agree for callers that pick a target by number.
    human_targets = targets[targets['organism'] == 'Homo sapiens'].reset_index(drop=True)

    return human_targets

# Example usage:
#protein_name = input("Enter the protein name to search for (e.g., 'coronavirus'): ")
#targets = search_target(protein_name)
#targets

import pandas as pd
from chembl_webresource_client.new_client import new_client

def get_bioactivity_data_for_target(targets: pd.DataFrame, target_index: int, bioactivity_type: str = "IC50") -> pd.DataFrame:
    """
    Retrieves bioactivity data for a selected target protein from ChEMBL.

    Parameters:
    - targets (pd.DataFrame): The DataFrame containing the list of target proteins from ChEMBL.
      Must have a 'target_chembl_id' column.
    - target_index (int): Zero-based row position of the selected target within `targets`.
    - bioactivity_type (str): The type of bioactivity to filter for (e.g., "IC50").

    Returns:
    - df (pd.DataFrame): The activity records that have a standard value and are
      reported in nM. At most the first 400 records returned by ChEMBL are considered.
      Returns None (after printing a message) when the index is out of range or
      ChEMBL returns no usable records.

    Side effects:
    - Writes the cleaned records to 'bioactivity_data.csv' in the working directory.
    """
    # Validate the requested row position
    if target_index >= len(targets) or target_index < 0:
        print(f"Invalid index! Please select an index between 0 and {len(targets) - 1}.")
        return None

    # The bounds check above is positional, so the lookup must be positional
    # too; label-based access fails on frames whose index is not 0..n-1
    # (for example after filtering).
    selected_target = targets['target_chembl_id'].iloc[target_index]

    # Retrieve the bioactivity records for the selected target
    activity = new_client.activity
    res = activity.filter(target_chembl_id=selected_target).filter(standard_type=bioactivity_type)[:400]

    # Convert results to DataFrame
    df = pd.DataFrame.from_dict(res)

    # With no records the frame has no columns; bail out the same way as for
    # an invalid index instead of failing on a missing column.
    if 'standard_value' not in df.columns or 'standard_units' not in df.columns:
        print(f"No {bioactivity_type} records found for target {selected_target}.")
        return None

    # Keep rows that have a measured value and are expressed in nM
    cleaned = df[df['standard_value'].notna() & (df['standard_units'] == 'nM')]

    # Save data to CSV for further analysis
    cleaned.to_csv('bioactivity_data.csv', index=False)

    return cleaned

# Example usage:
#target_index = int(input("Enter the index number of the target you want to work with: "))
#bioactivity_df = get_bioactivity_data_for_target(targets, target_index)
#bioactivity_df

## Bioactivity Data Preprocessing

def preprocess_bioactivity_data(bioactivity_df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocesses bioactivity data to classify the bioactivity and select necessary columns.

    The caller's DataFrame is left untouched; all work happens on a copy.

    Parameters:
    - bioactivity_df (pd.DataFrame): A DataFrame containing bioactivity data (e.g., from ChEMBL).
      Must have 'molecule_chembl_id', 'canonical_smiles' and 'standard_value' columns.

    Returns:
    - pd.DataFrame: A DataFrame re-indexed from 0 with the three selected columns
      ('standard_value' coerced to numeric) plus 'bioactivity_class'.

    Side effects:
    - Writes the result to 'bioactivity_preprocessed_data.csv'.
    """
    def classify_bioactivity(value):
        # Thresholds are applied to the raw standard_value. NaN fails both
        # comparisons and therefore falls through to "intermediate".
        if value >= 10000:
            return "inactive"
        elif value <= 1000:
            return "active"
        else:
            return "intermediate"

    # Selecting with a list yields a new frame, so the numeric coercion below
    # no longer overwrites the column in the caller's DataFrame.
    selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
    preprocessed = bioactivity_df[selection].reset_index(drop=True)

    # Non-numeric entries become NaN rather than raising
    preprocessed['standard_value'] = pd.to_numeric(preprocessed['standard_value'], errors='coerce')

    # Label every row; the column shares the frame's index, so no re-alignment is needed
    preprocessed['bioactivity_class'] = preprocessed['standard_value'].apply(classify_bioactivity)

    # Save the preprocessed data to a CSV file
    preprocessed.to_csv('bioactivity_preprocessed_data.csv', index=False)

    return preprocessed

# Example usage:
# bioactivity_df = pd.read_csv('path_to_bioactivity_data.csv')  # or use the output from get_bioactivity_data_for_target()
#preprocessed_df = preprocess_bioactivity_data(bioactivity_df)
#preprocessed_df.tail()



import numpy as np
import pandas as pd

def normalize_ic50_values(input: pd.DataFrame) -> pd.DataFrame:
    """
    Normalizes the IC50 values to avoid extremely large values and handles missing data.

    The caller's DataFrame is not modified.

    Parameters:
    - input (pd.DataFrame): The DataFrame containing bioactivity data with IC50 values
      in a 'standard_value' column.

    Returns:
    - pd.DataFrame: A new DataFrame in which 'standard_value' is replaced by
      'standard_value_norm' (values capped at 100,000,000; missing values filled
      with the median of the capped column).
    """
    # Cap values to avoid extremely large IC50 values
    clipped = input['standard_value'].clip(upper=100000000)

    # drop() returns a new frame, so adding the column here leaves `input` as it was
    normalized = input.drop('standard_value', axis=1)
    normalized['standard_value_norm'] = clipped.fillna(clipped.median())  # Handle missing values
    return normalized

def convert_to_pIC50(input: pd.DataFrame) -> pd.DataFrame:
    """
    Converts the IC50 values to pIC50 values.

    Each value x in 'standard_value_norm' is mapped to -log10(x * 10**-9);
    zero or negative values map to NaN. The caller's DataFrame is not modified.

    Parameters:
    - input (pd.DataFrame): The DataFrame containing normalized IC50 values in a
      'standard_value_norm' column.

    Returns:
    - pd.DataFrame: A new DataFrame in which 'standard_value_norm' is replaced by 'pIC50'.
    """
    # Avoid calculating log for zero or negative values
    pic50 = input['standard_value_norm'].apply(lambda x: -np.log10(x * 10**-9) if x > 0 else np.nan)

    # drop() returns a new frame, so adding the column here leaves `input` as it was
    converted = input.drop('standard_value_norm', axis=1)
    converted['pIC50'] = pic50
    return converted

def process_ic50_to_pIC50(input: pd.DataFrame) -> pd.DataFrame:
    """
    Runs the two IC50 steps back to back: normalization, then pIC50 conversion.

    Parameters:
    - input (pd.DataFrame): Bioactivity data with IC50 values.

    Returns:
    - pd.DataFrame: The result of convert_to_pIC50 applied to the output of
      normalize_ic50_values.
    """
    return convert_to_pIC50(normalize_ic50_values(input))

# Example usage:
#df3 = preprocessed_df  # Or the output from the previous steps
#df_pIC50 = process_ic50_to_pIC50(df3)
#df_pIC50


def lipinski_and_fingerprints_from_df(df, smiles_column='canonical_smiles'):
    """
    Function to calculate Lipinski descriptors and Morgan fingerprints for molecules in a DataFrame.

    Rows whose SMILES cannot be parsed or processed are reported and left out of
    the result, so every returned row carries the descriptors of its own molecule.

    Parameters:
    - df (pd.DataFrame): A DataFrame containing a column of SMILES strings.
    - smiles_column (str): The column name in the DataFrame that contains SMILES strings. Default is 'canonical_smiles'.

    Returns:
    - pd.DataFrame: The successfully processed rows of `df` (re-indexed from 0),
      followed by the columns "MW", "LogP", "NumHDonors", "NumHAcceptors" and the
      fingerprint bit columns (integer column names).
    """
    kept_positions = []        # row positions in df that were processed successfully
    mol_data = []
    morgan_fingerprints = []

    # Loop over each SMILES in the DataFrame, remembering its row position
    for position, smiles in enumerate(df[smiles_column]):
        try:
            # Ensure SMILES is valid
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                print(f"Warning: Invalid SMILES string skipped: {smiles}")
                continue

            # Lipinski Descriptors
            row = np.array([
                Descriptors.MolWt(mol),
                Descriptors.MolLogP(mol),
                Lipinski.NumHDonors(mol),
                Lipinski.NumHAcceptors(mol),
            ])

            # Morgan Fingerprints (ECFP)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
            fingerprint = np.array(fp)

        except Exception as e:
            # Print the error message and skip the invalid molecule
            print(f"Error processing SMILES {smiles}: {e}")
            continue

        # Record all three pieces together, only after everything succeeded,
        # so the lists can never get out of step with each other.
        kept_positions.append(position)
        mol_data.append(row)
        morgan_fingerprints.append(fingerprint)

    # Convert lists to DataFrames (both get a fresh 0..n-1 index)
    descriptors_df = pd.DataFrame(mol_data, columns=["MW", "LogP", "NumHDonors", "NumHAcceptors"])
    fingerprints_df = pd.DataFrame(morgan_fingerprints)

    # concat(axis=1) aligns on index labels. Take exactly the processed rows and
    # give them the same 0..n-1 index; otherwise skipped molecules or a
    # non-contiguous input index would pair rows with the wrong descriptors.
    kept_rows = df.iloc[kept_positions].reset_index(drop=True)

    # Combine the descriptors and fingerprints into one DataFrame
    final_df = pd.concat([kept_rows, descriptors_df, fingerprints_df], axis=1)

    return final_df

# Example usage

# Assuming df_input is the preprocessed DataFrame with the 'canonical_smiles' column
#df_input = pd.read_csv('bioactivity_preprocessed_data.csv')  # Replace with your actual data loading step
#df_input = df_input.dropna()  # Ensure no missing data in the SMILES column

# Generate the descriptors and fingerprints for each molecule in the DataFrame
#final_df = lipinski_and_fingerprints_from_df(df_input)

# Show the final DataFrame (first few rows for inspection)
#final_df

def filter_bioactivity_class(df: pd.DataFrame, class_column: str = 'bioactivity_class') -> pd.DataFrame:
    """
    Removes rows labelled 'intermediate', and rows with no label, from a DataFrame.

    Parameters:
    - df (pd.DataFrame): The input DataFrame to filter.
    - class_column (str): The name of the column containing bioactivity class (default is 'bioactivity_class').

    Returns:
    - pd.DataFrame: The rows of `df` whose class is present and is not 'intermediate'.
      Original index labels are kept.
    """
    not_intermediate = df[class_column] != 'intermediate'
    return df[not_intermediate].dropna(subset=[class_column])
#df_filtered= filter_bioactivity_class(final_df)
#df_filtered


def prepare_training_data(df: pd.DataFrame, drop_columns: list = ['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class'], output_file: str = 'bioactivity_class_data_for_model.csv') -> pd.DataFrame:
    """
    Prepares the DataFrame for training by dropping specified columns and saving the cleaned DataFrame to a CSV file.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing the data.
    - drop_columns (list): A list of column names to drop from the DataFrame. Default is ['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class'].
      The list is only read, never modified.
    - output_file (str): The path to save the cleaned DataFrame as a CSV file. Default is 'bioactivity_class_data_for_model.csv'.

    Returns:
    - pd.DataFrame: The cleaned DataFrame ready for training.
    """
    # Drop the specified columns
    df_only_input = df.drop(columns=drop_columns)

    # Save once, to the requested location only. A second, hard-coded write to
    # the default file name would ignore `output_file` and create or overwrite
    # a file the caller did not ask for.
    df_only_input.to_csv(output_file, index=False)

    # Return the cleaned DataFrame for further use
    return df_only_input
#df_only_input=prepare_training_data(df_filtered)
#df_only_input


import joblib
# Function to calculate custom accuracy (within 10% of true values)
def calculate_custom_accuracy(y_true, y_pred, threshold=0.10):
    """
    Fraction of predictions whose relative error is within `threshold`.

    A prediction counts as correct when |y_true - y_pred| <= threshold * |y_true|.
    Written this way, negative targets are judged by their magnitude and a target
    of exactly 0 never triggers a division by zero (it is matched only by an
    exact prediction).

    Parameters:
    - y_true: True values (list, NumPy array or pandas Series).
    - y_pred: Predicted values, same length as y_true.
    - threshold (float): Allowed relative error. Default is 0.10 (10%).

    Returns:
    - float: Share of predictions within the threshold, or NaN for empty input.

    Raises:
    - ValueError: If y_true and y_pred differ in length.
    """
    # Plain arrays compare element by element; two pandas Series would instead
    # be aligned on their index labels.
    true_values = np.asarray(y_true, dtype=float).ravel()
    predicted_values = np.asarray(y_pred, dtype=float).ravel()

    if true_values.shape != predicted_values.shape:
        raise ValueError("y_true and y_pred must have the same length")

    # No samples: the ratio is undefined
    if true_values.size == 0:
        return float("nan")

    within_threshold = np.abs(true_values - predicted_values) <= threshold * np.abs(true_values)

    return float(np.mean(within_threshold))

def train_and_save_model(data_path):
    """
    Train a random-forest pIC50 regressor from a CSV file and persist it.

    Parameters:
    - data_path: Path of a CSV file with a 'pIC50' column; all other columns
      are used as features.

    Returns:
    - tuple: (model, scaler, performance_df), where performance_df is a one-row
      DataFrame of test-set metrics. On any error the message is printed and
      (None, None, None) is returned.

    Side effects:
    - Writes 'bioactivity_predictor_model.pkl', 'scaler.pkl' and
      'selected_columns.pkl' to the working directory.
    """
    try:
        dataset = pd.read_csv(data_path)

        # Everything except the target column is a feature
        features = dataset.drop('pIC50', axis=1)
        target = dataset['pIC50']

        # 80/20 split with a fixed seed
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # The scaler is fitted on the training part only and reused for the test part
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train_scaled, y_train)

        # Persist the model, the scaler and the feature column names
        artifacts = (
            (model, 'bioactivity_predictor_model.pkl'),
            (scaler, 'scaler.pkl'),
            (features.columns.tolist(), 'selected_columns.pkl'),
        )
        for artifact, filename in artifacts:
            joblib.dump(artifact, filename)

        # Score the held-out part
        y_pred = model.predict(X_test_scaled)
        performance_df = pd.DataFrame({
            "R²": [r2_score(y_test, y_pred)],
            "Mean Absolute Error (MAE)": [mean_absolute_error(y_test, y_pred)],
            "Mean Squared Error (MSE)": [mean_squared_error(y_test, y_pred)],
            "Custom Accuracy (within 20% threshold)": [
                calculate_custom_accuracy(y_test, y_pred, threshold=0.20)
            ],
        })

        return model, scaler, performance_df

    except Exception as e:
        print(f"Error occurred: {e}")
        return None, None, None


# Example usage:
# Call the function with the path to the preprocessed data CSV
#model, scaler, performance_df = train_and_save_model('bioactivity_class_data_for_model.csv')

# If the model is trained successfully
#if model and scaler:
#    print("Model trained and saved!")

import pandas as pd
import joblib
import numpy as np

def predict_new_data_from_csv(input_file_path):
    """
    Predict pIC50 and an Active/Inactive label for the molecules in a CSV file.

    Parameters:
    - input_file_path: Path of a CSV file with a 'canonical_smiles' column.

    Returns:
    - pd.DataFrame: Columns 'Canonical SMILES', 'Predicted pIC50' and
      'Predicted Activity'. On any error the message is printed and None is returned.

    Side effects:
    - Reads 'bioactivity_predictor_model.pkl', 'scaler.pkl' and
      'selected_columns.pkl' from the working directory.
    - Writes the predictions to 'result.csv'.
    """
    try:
        # Read the molecules and compute descriptors and fingerprints for them
        featurized = lipinski_and_fingerprints_from_df(pd.read_csv(input_file_path))

        # Keep the SMILES for the report; they are not model features
        smiles_values = featurized['canonical_smiles']
        features = featurized.drop(columns=['canonical_smiles'])

        # Artifacts written by the training step
        model = joblib.load('bioactivity_predictor_model.pkl')
        scaler = joblib.load('scaler.pkl')
        selected_columns = joblib.load('selected_columns.pkl')

        # Column names are turned into strings before selecting the training
        # columns, in training order
        features.columns = features.columns.astype(str)
        features = features[selected_columns]

        if features.isnull().any().any():
            print("Warning: Missing values found in input data. You may need to handle them before prediction.")

        # Apply the training-time scaling, then predict pIC50
        pIC50_predictions = model.predict(scaler.transform(features))

        # pIC50 >= 5 is reported as 'Active'
        activity_threshold = 5
        activity_predictions = np.where(pIC50_predictions >= activity_threshold, 'Active', 'Inactive')

        result_df = pd.DataFrame({
            'Canonical SMILES': smiles_values,
            'Predicted pIC50': pIC50_predictions,
            'Predicted Activity': activity_predictions
        })
        result_df.to_csv('result.csv', index=False)

        return result_df

    except Exception as e:
        print(f"Error occurred: {e}")
        return None

# Example usage:
# Call the function with the path to your test data CSV file
#result_df = predict_new_data_from_csv('Test_data_3-1.csv')

# Display the results if available
#result_df


from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdmolfiles
from rdkit.Chem import Draw
from IPython.display import Image, display
import os

def smiles_to_3D_and_2D(smiles):
    """
    Convert a SMILES string into both 3D and 2D molecular structures.
    - Save the 3D structure as a PDB file (the file name is asked for interactively).
    - Display the 2D structure in the output cell.

    Parameters:
    - smiles (str): The SMILES string representing the molecule.

    Raises:
    - ValueError: If the SMILES string cannot be parsed, or no 3D coordinates
      could be generated for the molecule.
    """
    # Step 1: Convert the SMILES string to a molecule
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Invalid SMILES string")

    # Step 2: Generate 3D coordinates using RDKit's EmbedMolecule function
    mol_3d = Chem.AddHs(mol)  # Add hydrogen atoms for 3D embedding

    # EmbedMolecule signals failure by returning -1 instead of raising, which
    # would leave the molecule without a conformer for the steps below.
    # Retry once from random starting coordinates before giving up.
    if AllChem.EmbedMolecule(mol_3d, randomSeed=42) == -1:
        if AllChem.EmbedMolecule(mol_3d, randomSeed=42, useRandomCoords=True) == -1:
            raise ValueError("Could not generate 3D coordinates for this molecule")

    # Step 3: Minimize the energy of the structure (optional but recommended).
    # A return value of -1 means the MMFF force field could not be set up;
    # the embedded geometry is then kept as it is.
    if AllChem.MMFFOptimizeMolecule(mol_3d) == -1:
        print("Warning: MMFF optimization could not be set up; saving the unoptimized 3D structure.")

    # Step 4: Ask the user for the file name to save the PDB file
    pdb_filename = input("Enter the name for the PDB file (e.g., 'molecule_3D.pdb'): ").strip()

    # An empty answer would otherwise produce a file literally named '.pdb'
    if not pdb_filename:
        pdb_filename = 'molecule_3D.pdb'

    # Ensure the file name ends with '.pdb'
    if not pdb_filename.endswith('.pdb'):
        pdb_filename += '.pdb'

    # Step 5: Save the molecule to a PDB file
    rdmolfiles.MolToPDBFile(mol_3d, pdb_filename)
    print(f"3D structure saved as PDB: {pdb_filename}")

    # Step 6: Generate 2D image using RDKit's Draw module
    img = Draw.MolToImage(mol, size=(300, 300))

    # Step 7: Display the 2D image inline (for Jupyter Notebook or IPython environment)
    print('2D image of the molecule')
    display(img)

# Example Usage
#smiles = input("Enter the SMILES string of the molecule: ")  # Get SMILES input from the user
#smiles_to_3D_and_2D(smiles)


def main_function():
    """
    Run the whole training pipeline on the previously saved bioactivity data.

    Reads 'bioactivity_data.csv', preprocesses it, converts IC50 to pIC50,
    computes descriptors and fingerprints, removes the intermediate class,
    writes the training table and trains the model from it.

    Returns:
    - tuple: (model, scaler, performance_df) exactly as returned by
      train_and_save_model.
    """
    raw_activity = pd.read_csv('bioactivity_data.csv')

    # Classify the records and keep only the columns the pipeline needs
    classified = preprocess_bioactivity_data(raw_activity)

    # Convert IC50 to pIC50, then discard rows that still have missing values
    with_pic50 = process_ic50_to_pIC50(classified).dropna()

    # Descriptors and fingerprints for every molecule
    featurized = lipinski_and_fingerprints_from_df(with_pic50)

    # Remove the intermediate class
    without_intermediate = filter_bioactivity_class(featurized)

    # Called for its CSV output, which the training step reads back
    prepare_training_data(without_intermediate)

    model, scaler, performance_df = train_and_save_model('bioactivity_class_data_for_model.csv')

    if model and scaler:
        print("Model trained and saved!")
    return model, scaler, performance_df
    

#main_function()





## GUI

In [8]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import pandas as pd
import joblib
import numpy as np
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import AllChem, rdmolfiles, Draw
from PIL import Image, ImageTk
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import os
import threading
import time



class BioactivityModelingApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Smiler")
        self.root.geometry("800x600")  # Set window size
        self.bg_color = "#F0F0F0"  # Light background color

        # Initially display the front page
        self.front_page()

    def front_page(self):
        """Build the front page: a title, a short description and the start button."""
        page_bg = self.bg_color

        # Container for the page; main_app() destroys it later
        self.front_frame = tk.Frame(self.root, bg=page_bg)
        self.front_frame.pack(expand=True, fill="both")

        # Title, with 250 px of padding above it
        tk.Label(
            self.front_frame,
            text="Welcome to Smiler \n◝(ᵔᵕᵔ)◜",
            font=("Helvetica", 24, "bold"),
            bg=page_bg,
            fg="Black",
        ).pack(pady=(250, 20))

        # Description, wrapped at 750 px and centred
        tk.Label(
            self.front_frame,
            text="Smiler is a tool designed bring smiles to the face of users and help them identify bioactivity of unknown smiles for analysis!",
            font=("Helvetica", 14),
            bg=page_bg,
            fg="#008080",
            wraplength=750,
            justify="center",
        ).pack(pady=10)

        # Button that switches to the main application
        tk.Button(
            self.front_frame,
            text="Let's Start",
            font=("Helvetica", 16, "bold"),
            bg="#03C0C1",
            fg="white",
            relief="raised",
            bd=4,
            command=self.main_app,
        ).pack(pady=20)

    def main_app(self):
        # Remove the front page and load the main application
        self.front_frame.destroy()
        self.create_tabs()

    def create_tabs(self):
        """Create the notebook widget, style its tabs, and build the five tab pages."""
        self.tab_control = ttk.Notebook(self.root)

        # Notebook and tab appearance
        style = ttk.Style(self.root)
        style.configure("TNotebook", background="#E0F7FA")
        style.configure(
            "TNotebook.Tab",
            background="#008080",
            foreground="black",
            padding=[5, 5],
            borderwidth=2,
            relief="solid",
        )
        # Appearance of the selected tab
        style.map(
            "TNotebook.Tab",
            background=[("selected", "#800080")],
            foreground=[("selected", "black")],
            relief=[("selected", "sunken")],
        )

        # One frame per tab
        self.tab_search = ttk.Frame(self.tab_control)
        self.tab_bioactivity = ttk.Frame(self.tab_control)
        self.tab_model = ttk.Frame(self.tab_control)
        self.tab_prediction = ttk.Frame(self.tab_control)
        self.tab_3d_structure = ttk.Frame(self.tab_control)

        # Register the frames with the notebook, in display order
        titled_tabs = (
            (self.tab_search, "Search Protein"),
            (self.tab_bioactivity, "Bioactivity Data"),
            (self.tab_model, "Model Training"),
            (self.tab_prediction, "Prediction"),
            (self.tab_3d_structure, "3D Structure Prediction"),
        )
        for tab_frame, tab_title in titled_tabs:
            self.tab_control.add(tab_frame, text=tab_title)

        self.tab_control.pack(expand=1, fill="both")

        # Fill each tab with its widgets
        builders = (
            self.create_search_tab,
            self.create_bioactivity_tab,
            self.create_model_tab,
            self.create_prediction_tab,
            self.create_3d_structure_tab,
        )
        for build_tab in builders:
            build_tab()
        
    def create_search_tab(self):
        """Build the search tab: a name entry, a search button and the results table."""
        tab = self.tab_search
        bold_font = ("Helvetica", 12, "bold")

        # Protein name input
        tk.Label(tab, text="Enter Protein Name:", font=bold_font, bg="#F0F8FF").pack(pady=5)
        self.protein_entry = tk.Entry(tab, width=50, font=("Helvetica", 12), relief="solid", bd=2)
        self.protein_entry.pack(pady=5)

        # Search trigger
        self.search_button = tk.Button(
            tab,
            text="Search Protein",
            command=self.search_protein,
            font=bold_font,
            bg="#4CAF50",
            fg="white",
            relief="raised",
            bd=4,
        )
        self.search_button.pack(pady=10)

        # Results table with centred headings and cells
        column_names = ("Index", "Protein", "Organism")
        self.result_treeview = ttk.Treeview(tab, columns=column_names, show="headings", height=10)
        for column_name in column_names:
            self.result_treeview.heading(column_name, text=column_name, anchor="center")
        for column_name in column_names:
            self.result_treeview.column(column_name, anchor="center")
        self.result_treeview.pack(pady=10)


    def search_protein(self):
        """
        Search ChEMBL for the protein typed into the entry and list the hits.

        Stores the entered name in self.protein_name and the hits in
        self.targets_df, then fills the results table with rows numbered from 1.
        A failed search is reported with its real cause, and the table and
        self.targets_df are left as they were.
        """
        self.protein_name = self.protein_entry.get().strip()

        if not self.protein_name:
            messagebox.showwarning("Input Error", "Please enter a protein name.")
            return

        try:
            # Call search_target function to get the data
            targets = search_target(self.protein_name)
        except KeyError:
            # A result set without the expected columns (e.g. no hits at all)
            # surfaces as KeyError; treat it as "nothing found", not as a failure.
            targets = pd.DataFrame()
        except Exception as e:
            # Report the actual problem (network, API, ...) instead of
            # claiming that no protein exists.
            messagebox.showerror("Error", f"Protein search failed: {e}")
            return

        self.targets_df = targets

        # Clear the previous results before showing the outcome, so the table
        # always matches self.targets_df, including when nothing was found.
        for row in self.result_treeview.get_children():
            self.result_treeview.delete(row)

        if targets.empty:
            messagebox.showinfo("No Results", "No protein found.")
            return

        # Populate Treeview with results and add indexing
        for idx, (_, row) in enumerate(targets.iterrows(), start=1):
            self.result_treeview.insert("", "end", values=(idx, row["pref_name"], row["organism"]))

    def create_bioactivity_tab(self):
        """Build the bioactivity tab: an index entry, a fetch button, a status label and the data table."""
        tab = self.tab_bioactivity
        bold_font = ("Helvetica", 12, "bold")
        label_bg = "#F0F8FF"

        # Target index input
        tk.Label(tab, text="Select Target Protein Index:", font=bold_font, bg=label_bg).pack(pady=10)
        self.target_index_entry = tk.Entry(tab, width=50, font=("Helvetica", 12), relief="solid", bd=2)
        self.target_index_entry.pack(pady=5)

        # Fetch trigger
        self.fetch_button = tk.Button(
            tab,
            text="Fetch Bioactivity Data",
            command=self.fetch_bioactivity_data,
            font=bold_font,
            bg="#4CAF50",
            fg="white",
            relief="raised",
            bd=4,
        )
        self.fetch_button.pack(pady=10)

        # Status label, updated by fetch_bioactivity_data
        self.bioactivity_label = tk.Label(tab, text="Bioactivity Data:", font=bold_font, bg=label_bg)
        self.bioactivity_label.pack(pady=10)

        # Data table with centred headings and cells
        column_names = ("Molecule", "pIC50", "Activity")
        self.bioactivity_treeview = ttk.Treeview(tab, columns=column_names, show="headings", height=10)
        for column_name in column_names:
            self.bioactivity_treeview.heading(column_name, text=column_name, anchor="center")
        for column_name in column_names:
            self.bioactivity_treeview.column(column_name, anchor="center")
        self.bioactivity_treeview.pack(pady=10)

    def fetch_bioactivity_data(self):
        """
        Fetch and display bioactivity data for the target chosen by its table number.

        The number typed by the user is the 1-based "Index" shown on the search
        tab. The preprocessed records are stored in self.bioactivity_df and
        listed in the bioactivity table.
        """
        # A search must have produced targets before one can be selected
        targets = getattr(self, "targets_df", None)
        if targets is None or targets.empty:
            messagebox.showwarning("Input Error", "Please search for a protein first.")
            return

        try:
            shown_index = int(self.target_index_entry.get())
        except ValueError:
            messagebox.showwarning("Input Error", "Please enter a valid index.")
            return

        if not 1 <= shown_index <= len(targets):
            messagebox.showwarning("Input Error", f"Please enter an index between 1 and {len(targets)}.")
            return

        try:
            # The table numbers rows from 1 while the lookup counts from 0.
            # Rebuilding the index makes row labels equal row positions, so the
            # chosen row is found whichever of the two the lookup relies on.
            self.bioactivity_df = get_bioactivity_data_for_target(
                targets.reset_index(drop=True), shown_index - 1
            )
            if self.bioactivity_df is not None:
                self.bioactivity_df = preprocess_bioactivity_data(self.bioactivity_df)
        except Exception as e:
            # Network/API problems or unexpected data would otherwise escape
            # the button callback without any feedback in the window.
            messagebox.showerror("Error", f"Unable to fetch bioactivity data: {e}")
            return

        if self.bioactivity_df is None:
            messagebox.showwarning("Error", "Unable to fetch bioactivity data.")
            return

        # Replace the table contents with the new records
        for row in self.bioactivity_treeview.get_children():
            self.bioactivity_treeview.delete(row)

        for _, row in self.bioactivity_df.iterrows():
            activity = "Active" if row['standard_value'] <= 1000 else "Inactive"
            self.bioactivity_treeview.insert("", "end", values=(row["molecule_chembl_id"], row["standard_value"], activity))

        self.bioactivity_label.config(text="Bioactivity Data Fetched!", fg="green")

    def create_model_tab(self):
        """Build the model tab: a train button, a status label and a metrics label."""
        tab = self.tab_model
        label_options = {"font": ("Helvetica", 12), "bg": "#F0F8FF"}

        # Training trigger
        self.train_button = tk.Button(
            tab,
            text="Train Model",
            command=self.train_model,
            font=("Helvetica", 12, "bold"),
            bg="#4CAF50",
            fg="white",
            relief="raised",
            bd=4,
        )
        self.train_button.pack(pady=20)

        # Status and metrics labels, updated by train_model
        self.model_status_label = tk.Label(tab, text="Model Status: Not Trained", **label_options)
        self.model_status_label.pack(pady=10)

        self.accuracy_label = tk.Label(tab, text="Model Accuracy: Not Available", **label_options)
        self.accuracy_label.pack(pady=10)

    def train_model(self):
        """
        Train the model in a background thread and report the outcome in the GUI.

        The worker thread only runs main_function() and hands its result over
        through a queue. Every widget update and message box happens in the
        Tk event loop, which polls the queue until the result arrives. The
        train button is disabled while a training run is in progress.
        """
        import queue  # local import: only this hand-off needs it

        outcome = queue.Queue(maxsize=1)

        def training_thread():
            # Runs off the GUI thread, so it must not touch any Tk widget
            try:
                outcome.put(("ok", main_function()))
            except Exception as e:
                outcome.put(("error", e))

        def show_outcome():
            # Runs in the Tk event loop; check again shortly if training is still going
            try:
                status, payload = outcome.get_nowait()
            except queue.Empty:
                self.root.after(100, show_outcome)
                return

            try:
                if status == "error":
                    raise payload

                model, scaler, performance_df = payload
                if not (model and scaler):
                    raise Exception("Model training failed.")

                self.model_status_label.config(text="Model Status: Trained", font=("Helvetica", 14, "bold"), fg="green")
                messagebox.showinfo("Training Completed", "Model has been trained successfully.")

                # Show model performance metrics in the GUI
                performance_text = f"R²: {performance_df['R²'][0]:.3f}\n"
                performance_text += f"MAE: {performance_df['Mean Absolute Error (MAE)'][0]:.3f}\n"
                performance_text += f"MSE: {performance_df['Mean Squared Error (MSE)'][0]:.3f}\n"
                performance_text += f"Custom Accuracy (within 20%): {performance_df['Custom Accuracy (within 20% threshold)'][0]:.3f}"

                # Update accuracy_label with performance metrics
                self.accuracy_label.config(text=performance_text, font=("Helvetica", 12), fg="green")

            except Exception as e:
                self.model_status_label.config(text="Error during training", font=("Helvetica", 14, "bold"), fg="red")
                messagebox.showerror("Error", f"An error occurred: {e}")
                print(f"Error in training: {e}")  # Print error to console for debugging

            finally:
                self.train_button.config(state=tk.NORMAL)

        # Prevent a second run from starting while this one is in progress
        self.train_button.config(state=tk.DISABLED)
        self.model_status_label.config(text="Training the model... Please wait.", font=("Helvetica", 14, "bold"), fg="orange")

        # Start the training thread, then begin polling for its result
        threading.Thread(target=training_thread).start()
        self.root.after(100, show_outcome)



    def create_prediction_tab(self):
        """Build the prediction tab: upload/download buttons, status label and results table.

        The download button starts disabled. The results table
        (``self.prediction_treeview``) has three centred columns whose ids
        double as their heading text.
        """
        tab = self.tab_prediction
        bold_arial = ("Arial", 12, "bold")

        self.upload_button = tk.Button(
            tab,
            text="Upload CSV for Prediction",
            font=bold_arial,
            command=self.upload_csv_for_prediction,
        )
        self.upload_button.pack(pady=20)

        self.prediction_status_label = tk.Label(tab, text="Prediction Status: No Prediction", font=bold_arial)
        self.prediction_status_label.pack(pady=10)

        self.download_button = tk.Button(
            tab,
            text="Download Prediction Results",
            font=bold_arial,
            command=self.download_prediction_result,
            state=tk.DISABLED,
        )
        self.download_button.pack(pady=10)

        # Treeview that lists one row per predicted compound.
        column_ids = ("Canonical SMILES", "Predicted pIC50", "Predicted Activity")
        self.prediction_treeview = ttk.Treeview(tab, columns=column_ids, show="headings", height=10)
        for column_id in column_ids:
            self.prediction_treeview.heading(column_id, text=column_id, anchor="center")
            self.prediction_treeview.column(column_id, anchor="center")
        self.prediction_treeview.pack(pady=20)

    def upload_csv_for_prediction(self):
        """Run predictions on a user-selected CSV and show them in the results table.

        The file is passed to ``predict_new_data_from_csv``; the returned
        DataFrame must provide the ``"Canonical SMILES"`` and
        ``"Predicted pIC50"`` columns read below. The stored result, the status
        label and the download button are only updated after the rows have
        been built successfully, so a failed run leaves the previous state of
        the tab untouched and merely reports the error.
        """
        file_path = filedialog.askopenfilename(filetypes=[("CSV Files", "*.csv")])
        if not file_path:
            return  # dialog was cancelled

        try:
            result_df = predict_new_data_from_csv(file_path)
            if result_df is None:
                raise ValueError("No predictions were returned for the selected file.")

            # Build every table row up front: a missing column raises here,
            # before anything in the GUI has been changed.
            # A compound is labelled "Active" when its predicted pIC50 is >= 5.
            table_rows = [
                (smiles, pic50, "Active" if pic50 >= 5 else "Inactive")
                for smiles, pic50 in zip(result_df["Canonical SMILES"], result_df["Predicted pIC50"])
            ]

            # Replace the previous table contents with the new predictions.
            for item_id in self.prediction_treeview.get_children():
                self.prediction_treeview.delete(item_id)
            for values in table_rows:
                self.prediction_treeview.insert("", "end", values=values)

            # Commit the new state only once everything above has succeeded.
            self.prediction_result_df = result_df
            self.prediction_status_label.config(text="Prediction Status: Prediction Complete", fg="green")
            self.download_button.config(state=tk.NORMAL)

        except Exception as e:
            messagebox.showerror("Error", f"An error occurred: {e}")

    def download_prediction_result(self):
        """Save the most recent prediction results to a CSV file chosen by the user.

        Shows a warning when no prediction result is available, does nothing
        when the save dialog is cancelled, and reports a write failure in an
        error dialog instead of letting it escape the Tk callback.
        """
        # getattr + None check also covers the case where the attribute exists
        # but holds no result.
        result_df = getattr(self, 'prediction_result_df', None)
        if result_df is None:
            messagebox.showwarning("No Prediction", "No prediction results to save.")
            return

        save_path = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV Files", "*.csv")])
        if not save_path:
            return  # dialog was cancelled

        try:
            result_df.to_csv(save_path, index=False)
        except OSError as e:
            # e.g. permission denied or an invalid target path
            messagebox.showerror("Error", f"Could not save prediction results: {e}")
            return

        messagebox.showinfo("Download Complete", "Prediction results saved successfully!")


    def smiles_to_3D_pdb(self):
        """Generate a 3D structure for the entered SMILES, save it as PDB and show a 2D preview.

        Steps: parse the SMILES from ``self.smiles_entry_3d``, add hydrogens,
        embed 3D coordinates, run a UFF optimisation, ask the user where to
        save the PDB file, then display a 300x300 2D depiction in the tab.
        Invalid input, a failed embedding or a failed file write are reported
        in an error dialog.
        """
        smiles = self.smiles_entry_3d.get().strip()
        if not smiles:
            messagebox.showerror("Error", "Please enter a valid SMILES string.")
            return

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            messagebox.showerror("Error", "Invalid SMILES string.")
            return

        mol_3d = Chem.AddHs(mol)  # Add hydrogen atoms

        # EmbedMolecule returns the id of the new conformer, or -1 when no 3D
        # coordinates could be generated. Retry once from random starting
        # coordinates before giving up; optimising a molecule without a
        # conformer would fail.
        conformer_id = AllChem.EmbedMolecule(mol_3d, randomSeed=42)
        if conformer_id < 0:
            conformer_id = AllChem.EmbedMolecule(mol_3d, randomSeed=42, useRandomCoords=True)
        if conformer_id < 0:
            messagebox.showerror("Error", "Could not generate 3D coordinates for this molecule.")
            return

        AllChem.UFFOptimizeMolecule(mol_3d)

        # Save the 3D structure as a PDB file (skipped if the dialog is cancelled)
        pdb_file = filedialog.asksaveasfilename(defaultextension=".pdb", filetypes=[("PDB files", "*.pdb")])
        if pdb_file:
            try:
                rdmolfiles.MolToPDBFile(mol_3d, pdb_file)
            except Exception as e:
                messagebox.showerror("Error", f"Could not save the 3D structure: {e}")
            else:
                messagebox.showinfo("Success", "3D Structure saved successfully.")

        # Generate 2D image for display
        img = Draw.MolToImage(mol, size=(300, 300))

        # Display the 2D image in the GUI
        img = ImageTk.PhotoImage(image=Image.fromarray(np.array(img)))
        if hasattr(self, 'mol_image_label'):
            self.mol_image_label.config(image=img)
        else:
            self.mol_image_label = tk.Label(self.tab_3d_structure, image=img)
            self.mol_image_label.pack(pady=10)
        # Keep a reference on the widget so the image is not garbage-collected.
        self.mol_image_label.image = img
    
        
    def create_3d_structure_tab(self):
        """Build the structure tab: a SMILES prompt, an entry box and two action buttons.

        The first button calls ``self.generate_2d_structure``; the second calls
        ``self.smiles_to_3D_pdb``. Both read their input from
        ``self.smiles_entry_3d``.
        """
        tab = self.tab_3d_structure
        # Look shared by both action buttons.
        action_button_style = {
            "font": ("Helvetica", 12, "bold"),
            "bg": "#4CAF50",
            "fg": "white",
            "relief": "raised",
            "bd": 4,
        }

        self.smiles_label = tk.Label(
            tab,
            text="Enter SMILES for structure prediction:",
            font=("Helvetica", 12, "bold"),
            bg="#F0F8FF",
        )
        self.smiles_label.pack(pady=10)

        self.smiles_entry_3d = tk.Entry(tab, font=("Helvetica", 12), relief="solid", bd=2)
        self.smiles_entry_3d.pack(pady=10)

        self.generate_2d_button = tk.Button(
            tab,
            text="Generate 2D Structure",
            command=self.generate_2d_structure,
            **action_button_style,
        )
        self.generate_2d_button.pack(pady=10)

        self.generate_3d_button = tk.Button(
            tab,
            text="Download 3D Structure",
            command=self.smiles_to_3D_pdb,
            **action_button_style,
        )
        self.generate_3d_button.pack(pady=10)

    def generate_2d_structure(self):
        """Draw the SMILES from the entry box as a 300x300 2D image inside the tab.

        Shows an error dialog and returns when the entry is empty or the
        SMILES cannot be parsed. The image label is created on first use and
        reused afterwards.
        """
        smiles_text = self.smiles_entry_3d.get()
        if not smiles_text:
            messagebox.showerror("Error", "Please enter a valid SMILES string.")
            return

        molecule = Chem.MolFromSmiles(smiles_text)
        if molecule is None:
            messagebox.showerror("Error", "Invalid SMILES string.")
            return

        # Render the depiction and wrap it in a Tk-compatible image object.
        depiction = Draw.MolToImage(molecule, size=(300, 300))
        photo = ImageTk.PhotoImage(image=Image.fromarray(np.array(depiction)))

        if not hasattr(self, 'mol_image_label'):
            # First depiction: create the label that will hold the image.
            self.mol_image_label = tk.Label(self.tab_3d_structure, image=photo)
            self.mol_image_label.image = photo  # keep a reference to the image
            self.mol_image_label.pack(pady=10)
            return

        # Label already exists: swap in the new image.
        self.mol_image_label.config(image=photo)
        self.mol_image_label.image = photo  # keep a reference to the image



# Run the Tkinter application.
# The guard keeps the window from opening when this code is imported as a
# module; it only starts when the code is executed directly.
if __name__ == "__main__":
    # Top-level Tk window that hosts the whole application.
    root = tk.Tk()
    # Build the GUI inside the root window; `app` keeps the instance referenced.
    app = BioactivityModelingApp(root)
    # Hand control to the Tk event loop.
    root.mainloop()

Model trained and saved!
