In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import ast

# Load the training data
train_file_path = 'Test set with MACCS keys.csv'
df = pd.read_csv(train_file_path)

# Convert the 'maccs_keys_bitstring' column to a DataFrame of binary features
maccs_keys_df = df['maccs_keys_bitstring'].apply(ast.literal_eval).apply(pd.Series)

# Rename columns to indicate they are from MACCS keys
maccs_keys_df.columns = [f'maccs_key_{i}' for i in range(maccs_keys_df.shape[1])]

# Drop the original 'maccs_keys' and 'maccs_keys_bitstring' columns
df = df.drop(columns=['maccs_keys', 'maccs_keys_bitstring'])

# Concatenate the new MACCS keys features with the original dataframe
df = pd.concat([df, maccs_keys_df], axis=1)

# Define the features and target variables
X = df.drop(columns=['SMILES', 'PKM2_inhibition', 'ERK2_inhibition'])
y = df[['PKM2_inhibition', 'ERK2_inhibition']]

# Remove rows with missing target values in y and corresponding rows in X
non_missing_indices = y.dropna().index
X = X.loc[non_missing_indices]
y = y.loc[non_missing_indices]

# Convert data to float32
X = X.astype(np.float32)
y = y.astype(np.float32)

# Split dataset into training set and test set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# Create a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model using the training sets
clf.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = clf.predict(X_val)

# Model Accuracy, how often is the classifier correct?
val_accuracy = metrics.accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Create a dataframe to compare the actual and predicted values
results_df = pd.DataFrame({
    'Molecule': df.loc[y_val.index, 'SMILES'],
    'PKM2_actual': y_val['PKM2_inhibition'].values,
    'ERK2_actual': y_val['ERK2_inhibition'].values,
    'PKM2_predicted': y_val_pred[:, 0],
    'ERK2_predicted': y_val_pred[:, 1]
})

# Save the results to a CSV file
output_file_path = "/mnt/data/predicted_molecules.csv"
results_df.to_csv(output_file_path, index=False)

# Calculate feature importances
feature_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Plot the top 10 feature importances
plt.figure(figsize=(10, 6))
feature_imp.head(10).plot(kind='bar')
plt.title('Top 10 Feature Importances')
plt.ylabel('Importance Score')
plt.show()

# Load the test data
test_file_path = 'Test set with MACCS keys.csv'
test_df = pd.read_csv(test_file_path)

# Convert the 'maccs_keys_bitstring' column to a DataFrame of binary features in the test set
test_maccs_keys_df = test_df['maccs_keys_bitstring'].apply(ast.literal_eval).apply(pd.Series)

# Rename columns to indicate they are from MACCS keys
test_maccs_keys_df.columns = [f'maccs_key_{i}' for i in range(test_maccs_keys_df.shape[1])]

# Drop the original 'maccs_keys' and 'maccs_keys_bitstring' columns in the test set
test_df = test_df.drop(columns=['maccs_keys', 'maccs_keys_bitstring'])

# Concatenate the new MACCS keys features with the original test dataframe
test_df = pd.concat([test_df, test_maccs_keys_df], axis=1)

# Define the features and target variables for the test set
X_test = test_df.drop(columns=['SMILES', 'PKM2_inhibition', 'ERK2_inhibition'])
y_test = test_df[['PKM2_inhibition', 'ERK2_inhibition']]

# Remove rows with missing target values in y_test and corresponding rows in X_test_aligned
non_missing_indices = y_test.dropna().index
X_test = X_test.loc[non_missing_indices]
y_test = y_test.loc[non_missing_indices]

# Convert data to float32 for the test set
X_test = X_test.astype(np.float32)
y_test = y_test.astype(np.float32)

# Align the columns of the test set to match those of the training set
X_test_aligned = X_test.reindex(columns=X.columns, fill_value=0)

# Make predictions on the cleaned test set
y_test_pred_cleaned = clf.predict(X_test_aligned)

# Model Accuracy on the cleaned test set
test_accuracy_cleaned = metrics.accuracy_score(y_test, y_test_pred_cleaned)
print("Test Accuracy:", test_accuracy_cleaned)


ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.