### Introduction

The project aims to build a model to predict drug efficacy of molecules.

A pre-processed HIV dataset with 3 classes (CA - Confirmed active, CM - Confirmed moderately active, CI - Confirmed inactive and benign) is available [here](http://moleculenet.ai/datasets-1). The raw data is available [here](https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data).
We perform stratified random splitting on the dataset: 80% of the molecules are in the training set and 20% are in the test set.

#### Import required libraries

In [1]:
import os
import sys
sys.path.insert(0, os.getcwd())
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn import config_context
from config import Config
#from utils.data import read_data, get_prediction_score
from sklearn import feature_extraction
import pickle
import warnings
import argparse


In [4]:
from logging import warning


def train_model(model, X_train, y_train, parameters, n_splits=3):
    """Tune ``model`` with a stratified, cross-validated grid search.

    :param model: scikit-learn estimator to tune
    :param X_train: training-set features
    :param y_train: training-set labels (also used to stratify the folds)
    :param parameters: dict, key is a hyperparameter name and value is a list
        of candidate values for that hyperparameter
    :param n_splits: number of stratified cross-validation folds (default 3)
    :return best_estimator: scikit-learn estimator with the best hyperparameters
    :return best_score: best cross-validated accuracy score
    :return best_params: dict, best hyperparameter values
    """
    # Fixed random_state keeps the shuffled folds reproducible across runs.
    # The splitter object is passed directly instead of a one-shot generator
    # from ``.split(...)``; GridSearchCV produces the same folds from it.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    clf = GridSearchCV(
        model,
        parameters,
        cv=cv,
        scoring=make_scorer(accuracy_score),
    )

    # Silence warnings only for the duration of the search; the previous
    # filter state is restored when the context manager exits.
    # (``catch_warnings`` accepts keyword-only arguments, so the filter must
    # be installed with ``simplefilter`` rather than passed to it.)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        clf.fit(X_train, y_train)

    return clf.best_estimator_, clf.best_score_, clf.best_params_
