In [1]:
import pandas as pd
import numpy as np
from numpy import inf

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Shared random seed, passed as `random_state` to the random forest and
# decision tree models so that their results are repeatable.
seed = 1234

In [2]:
def load_and_clean_train_csv(filepath, test_size=0.2, normilize=False, random_state=1):
    """Load a training CSV and return a cleaned train/hold-out split.

    The first CSV column is used as the index, the ``smiles`` column is
    dropped and every inf/-inf/NaN entry (features *and* labels) is replaced
    by 0. The columns ``task1`` .. ``task11`` are the targets; every other
    remaining column is a feature.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    normilize : bool, default False
        (Spelling kept for backward compatibility.) When True, the features
        are min-max scaled. The scaler is fitted on the training split only
        and then applied to both splits; the task labels are never scaled.
    random_state : int, default 1
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test)``, all float32 and free of
        infinite values.
    """
    task_columns = [f"task{i}" for i in range(1, 12)]

    data = pd.read_csv(filepath, index_col=0)
    # NOTE(review): NaN labels become 0 here, i.e. a missing label is
    # indistinguishable from a real 0 label afterwards - confirm intended.
    data = data.replace([np.inf, -np.inf, np.nan], 0)
    data = data.drop(["smiles"], axis=1)

    X = np.array(data.drop(task_columns, axis=1))
    y = np.array(data[task_columns])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if normilize:
        # Fit on the training features only: fitting on the full frame would
        # rescale the label columns and leak hold-out statistics.
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    cleaned = []
    for split in (X_train, X_test, y_train, y_test):
        split = split.astype("float32")
        # Values outside the float32 range overflow to +/-inf during the
        # cast, so infinities have to be zeroed again after it.
        split[np.isinf(split)] = 0.0
        cleaned.append(split)
    X_train, X_test, y_train, y_test = cleaned

    return X_train, X_test, y_train, y_test

In [3]:
def load_eval_csv(filepath, normilize=False):
    """Load an evaluation CSV and return its features as a float32 DataFrame.

    The first CSV column is used as the index, the ``smiles`` column is
    dropped and every inf/-inf/NaN entry is replaced by 0.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read.
    normilize : bool, default False
        (Spelling kept for backward compatibility.) When True, every column
        is min-max scaled with a ``MinMaxScaler`` fitted on this file's data.

    Returns
    -------
    pandas.DataFrame
        float32 frame without infinite values, one column per feature.
    """
    data = pd.read_csv(filepath, index_col=0)
    data = data.replace([np.inf, -np.inf, np.nan], 0)
    data = data.drop(["smiles"], axis=1)

    if normilize:
        # The previous code referenced an undefined name (`eval_data`) here
        # and raised NameError; the frame to scale is `data`.
        # NOTE(review): the scaler is fitted on the evaluation data itself,
        # so the scaling differs from anything fitted on training data.
        scaled = MinMaxScaler().fit_transform(data)
        data = pd.DataFrame(scaled, columns=data.columns, index=data.index)

    data = data.astype("float32")
    # Values outside the float32 range overflow to +/-inf during the cast,
    # so infinities have to be zeroed again after it.
    data[np.isinf(data)] = 0.0

    return data

In [4]:
def save_submission_file(predictions, filepath="./submission.csv"):
    """Write multi-task predictions to a CSV submission file.

    Parameters
    ----------
    predictions : array-like with 11 columns
        One row per sample, one column per task, ordered task1 .. task11.
    filepath : str or path-like, default "./submission.csv"
        Destination of the CSV file.

    Raises
    ------
    ValueError
        If ``predictions`` does not have exactly 11 columns.
    """
    task_columns = [f"task{i}" for i in range(1, 12)]
    yhatdf = pd.DataFrame(predictions)
    # Assign (rather than pass `columns=` to the constructor) so that an
    # existing DataFrame gets renamed instead of re-indexed.
    yhatdf.columns = task_columns
    yhatdf.to_csv(filepath)

## Loading the data 

In [5]:
# Descriptor features, split with the loader's default settings.
(
    X_train_descriptors,
    X_test_descriptors,
    y_train_descriptors,
    y_test_descriptors,
) = load_and_clean_train_csv("./data/data_train_descriptors.csv")

# Fingerprint features, split with the loader's default settings.
(
    X_train_fingerprints,
    X_test_fingerprints,
    y_train_fingerprints,
    y_test_fingerprints,
) = load_and_clean_train_csv("./data/data_train_fingerprints.csv")

<h1 style="color:rgb(0,120,170)">Random Forest (with descriptors)</h1>

In [6]:
# Random forest trained on the descriptor features.
# max_features="sqrt" is what the former "auto" alias meant for classifiers;
# "auto" was deprecated in scikit-learn 1.1 and removed in 1.3.
rf_model_descriptors = RandomForestClassifier(
    n_estimators=500,
    random_state=seed,
    n_jobs=-1,
    max_features="sqrt",
    max_samples=2000,
    min_impurity_decrease=0.0,
)
rf_model_descriptors.fit(X_train_descriptors, y_train_descriptors)
yhat = rf_model_descriptors.predict(X_test_descriptors)
# Fraction of individual (sample, task) label entries predicted correctly on
# the hold-out split; this is not the exact-match accuracy over all tasks.
np.mean(y_test_descriptors == yhat)

0.8883333333333333

<h1 style="color:rgb(0,120,170)">Random Forest (with fingerprints)</h1>

In [7]:
# Random forest trained on the fingerprint features; each tree is grown on a
# bootstrap sample of 200 rows.
rf_model_fingerprints = RandomForestClassifier(
    n_estimators=500,
    random_state=seed,
    max_samples=200,
)
rf_model_fingerprints.fit(X_train_fingerprints, y_train_fingerprints)
yhat = rf_model_fingerprints.predict(X_test_fingerprints)
# Share of individual (sample, task) label entries predicted correctly.
print(f"Accuracy:{np.mean(y_test_fingerprints == yhat)}")

Accuracy:0.8686742424242424


<h1 style="color:rgb(0,120,170)">Decision Trees (with descriptors) </h1>

In [8]:
# Single entropy-based decision tree trained on the descriptor features.
model_decision_tree_descriptors = DecisionTreeClassifier(
    random_state=seed,
    criterion="entropy",
    min_samples_split=200,
    splitter="best",
)
model_decision_tree_descriptors.fit(X_train_descriptors, y_train_descriptors)
predictions = model_decision_tree_descriptors.predict(X_test_descriptors)
# Share of individual (sample, task) label entries predicted correctly.
print(f"Accuracy:{np.mean(y_test_descriptors == predictions)}")

Accuracy:0.8806060606060606


<h1 style="color:rgb(0,120,170)">Decision Trees (with fingerprints)</h1>

In [9]:
# Single entropy-based decision tree trained on the fingerprint features.
# Stored under its own name so it no longer overwrites the descriptor-trained
# tree kept in `model_decision_tree_descriptors`.
model_decision_tree_fingerprints = DecisionTreeClassifier(
    random_state=seed,
    criterion="entropy",
    min_samples_split=200,
    splitter="best",
)
model_decision_tree_fingerprints.fit(X_train_fingerprints, y_train_fingerprints)
predictions = model_decision_tree_fingerprints.predict(X_test_fingerprints)
# Share of individual (sample, task) label entries predicted correctly.
print(f"Accuracy:{np.mean(y_test_fingerprints == predictions)}")

Accuracy:0.8651136363636364


<h1 style="color:rgb(0,120,170)">K Neighbors Classifier (with descriptors)</h1>

In [10]:
# Brute-force k-nearest-neighbours classifier (k=600, uniform votes) on the
# descriptor features.
neigh_model_descriptors = KNeighborsClassifier(
    n_neighbors=600,
    weights="uniform",
    n_jobs=-1,
    algorithm='brute',
    leaf_size=3,
)
neigh_model_descriptors.fit(X_train_descriptors, y_train_descriptors)
predictions = neigh_model_descriptors.predict(X_test_descriptors)
# Share of individual (sample, task) label entries predicted correctly.
print(f"Accuracy:{np.mean(y_test_descriptors == predictions)}")

Accuracy:0.8676515151515152


<h1 style="color:rgb(0,120,170)">K Neighbors Classifier (with fingerprints)</h1>

In [11]:
# Brute-force k-nearest-neighbours classifier (k=600, uniform votes) on the
# fingerprint features.
neigh_model_fingerprints = KNeighborsClassifier(
    n_neighbors=600,
    weights="uniform",
    n_jobs=-1,
    algorithm='brute',
    leaf_size=3,
)
neigh_model_fingerprints.fit(X_train_fingerprints, y_train_fingerprints)
predictions = neigh_model_fingerprints.predict(X_test_fingerprints)
# Share of individual (sample, task) label entries predicted correctly.
print(f"Accuracy:{np.mean(y_test_fingerprints == predictions)}")

Accuracy:0.8620454545454546


# Save best model for submission

In [19]:
# Predict the evaluation set with the descriptor-based random forest and
# write the submission file.
eval_x = load_eval_csv("./data/data_test_descriptors.csv")
# The forest was fitted on plain NumPy arrays, so predict on an array as well;
# passing the DataFrame directly makes scikit-learn warn that X has feature
# names the estimator was fitted without.
eval_predictions = rf_model_descriptors.predict(eval_x.to_numpy())
save_submission_file(eval_predictions)
print("Saved submission file to ./submission.csv")

Saved submission file to ./submission.csv
