# Machine learning with random forest and SVM

## 1.) Random forest

In [10]:
## import required packages
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [7]:
## read the training and test sets from CSV files in the working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(train.head())

## training data: "Tm" is the target; "id", "SMILES" and "Tm" are excluded from the features
y_train = train["Tm"]
X_train = train.drop(columns=["id", "SMILES", "Tm"])

## test data: keep the ids for the prediction files, drop the same non-feature columns
id_test = test["id"]
X_test = test.drop(columns=["id", "SMILES"])

     id                       SMILES      Tm  Group 1  Group 2  Group 3  \
0  2175        FC1=C(F)C(F)(F)C1(F)F  213.15        0        0        0   
1  1222  c1ccc2c(c1)ccc3Nc4ccccc4c23  407.15        0        0        0   
2  2994          CCN1C(C)=Nc2ccccc12  324.15        2        1        0   
3  1704                   CC#CC(=O)O  351.15        1        0        0   
4  2526                    CCCCC(S)C  126.15        2        3        0   

   Group 4  Group 5  Group 6  Group 7  ...  Group 415  Group 416  Group 417  \
0        0        0        0        0  ...          0          0          0   
1        0        0        0        0  ...          0          0          0   
2        0        0        0        0  ...          0          0          0   
3        0        0        0        0  ...          0          0          0   
4        0        0        0        0  ...          0          0          0   

   Group 418  Group 419  Group 420  Group 421  Group 422  Group 423  Group

### 1.1.) RandomizedSearchCV

- A range or distribution of values is defined for each hyperparameter; a fixed number of combinations is then sampled at random.
- Faster than grid search, which makes it a good first pass.

In [3]:
## candidate values per hyperparameter; the search samples combinations from these lists
parameters_grid = dict(
    n_estimators=[100, 200, 400, 600, 800, 1000],
    max_depth=[10, 15, 20, 25, 30, 35, None],
    min_samples_split=[2, 3, 5, 7, 10],
    min_samples_leaf=[1, 3, 5],
    # float entries are fractions of the feature count, e.g. 0.05 equals 5 % of features
    max_features=["sqrt", "log2", 0.02, 0.05, 0.1, 0.2],
)

## base estimator with a fixed seed
rf = RandomForestRegressor(random_state=123)

## randomized search: 70 sampled candidates, 5-fold CV, scored by negative MAE
## error_score="raise" makes a failing fit raise instead of being scored
rf_randomized_search = RandomizedSearchCV(
    rf,
    parameters_grid,
    n_iter=70,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,  # use all available cores
    random_state=123,
    error_score="raise",
)

## run the search on the training data
rf_randomized_search.fit(X_train, y_train)

## report the best candidate and its cross-validated score
print("Best parameters found: ", rf_randomized_search.best_params_)
print("Best score (Neg. MAE): ", rf_randomized_search.best_score_)

Fitting 5 folds for each of 70 candidates, totalling 350 fits
Best parameters found:  {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.2, 'max_depth': 30}
Best score (Neg. MAE):  -37.117318102559906


In [9]:
# random forest refit on the full training set with the best parameters
# reported by the randomized search (values copied from its printed output)

rf1 = RandomForestRegressor(
    n_estimators = 400,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = 0.2,
    max_depth = 30,
    random_state = 123).fit(X_train, y_train)

# predictions for the test features
y_pred = rf1.predict(X_test)

# prediction table: the test ids next to the predicted "Tm" values.
# The index is deliberately left unnamed: naming it "id" would duplicate the
# "id" column label (ambiguous for label-based pandas operations), and it has
# no effect on the file because the index is not written (index=False).
results_rf1 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf1.to_csv("prediction_rf1.csv", index=False)

### 1.2.) GridSearchCV

- Every combination of the specified hyperparameter values is tested, so the best combination within the defined grid is found.
- Can be very slow for large grids.
- Well suited for fine-tuning after narrowing down the ranges via random search (done here).

In [13]:
## grid search 1

# values to test exhaustively (fine-tuning of n_estimators, max_depth and max_features)
parameters_grid = dict(
    n_estimators=[400, 600],
    max_depth=[20, 30, 40, None],
    max_features=[0.1, 0.15, 0.2, 0.25, 0.3],
)

# base estimator: min_samples_split and min_samples_leaf are held fixed, seed is fixed
rf = RandomForestRegressor(
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123,
)

# exhaustive search over the grid: 5-fold CV, scored by negative MAE
# error_score="raise" makes a failing fit raise instead of being scored
rf_grid_search = GridSearchCV(
    rf,
    parameters_grid,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,  # use all available cores
    cv=5,
    error_score="raise",
)

# run the search on the training data
rf_grid_search.fit(X_train, y_train)

## report the best combination and its cross-validated score
print("Best parameters found: ", rf_grid_search.best_params_)
print("Best score (Neg. MAE): ", rf_grid_search.best_score_)

Best parameters found:  {'max_depth': None, 'max_features': 0.3, 'n_estimators': 600}
Best score (Neg. MAE):  -35.96683474779305


In [14]:
# random forest refit on the full training set with the best parameters
# reported by grid search 1 (values copied from its printed output)

rf2 = RandomForestRegressor(
    n_estimators = 600,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = 0.3,
    max_depth = None,
    random_state = 123).fit(X_train, y_train)

# predictions for the test features
y_pred = rf2.predict(X_test)

# prediction table: the test ids next to the predicted "Tm" values.
# The index is deliberately left unnamed: naming it "id" would duplicate the
# "id" column label (ambiguous for label-based pandas operations), and it has
# no effect on the file because the index is not written (index=False).
results_rf2 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf2.to_csv("prediction_rf2.csv", index=False)

In [16]:
# random forest refit on the full training set with the best parameters
# reported by grid search 2
# NOTE: the "grid search 2" cell that produced these values appears further
# down in this notebook; the numbers below match its printed output.

rf3 = RandomForestRegressor(
    n_estimators = 400,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = 0.25,
    max_depth = 35,
    random_state = 123).fit(X_train, y_train)

# predictions for the test features
y_pred = rf3.predict(X_test)

# prediction table: the test ids next to the predicted "Tm" values.
# The index is deliberately left unnamed: naming it "id" would duplicate the
# "id" column label (ambiguous for label-based pandas operations), and it has
# no effect on the file because the index is not written (index=False).
results_rf3 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf3.to_csv("prediction_rf3.csv", index=False)

## 2.) Support Vector Machine (SVM)

In [15]:
## grid search 2

# values to test exhaustively for n_estimators, max_depth and max_features
parameters_grid = dict(
    n_estimators=[350, 400, 450, 500],
    max_depth=[25, 30, 35],
    max_features=[0.15, 0.2, 0.25],
)

# base estimator: min_samples_split and min_samples_leaf are held fixed, seed is fixed
rf = RandomForestRegressor(
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123,
)

# exhaustive search over the grid: 5-fold CV, scored by negative MAE
# error_score="raise" makes a failing fit raise instead of being scored
rf_grid_search = GridSearchCV(
    rf,
    parameters_grid,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,  # use all available cores
    cv=5,
    error_score="raise",
)

# run the search on the training data
rf_grid_search.fit(X_train, y_train)

# report the best combination and its cross-validated score
print("Best parameters found: ", rf_grid_search.best_params_)
print("Best score (Neg. MAE): ", rf_grid_search.best_score_)

Best parameters found:  {'max_depth': 35, 'max_features': 0.25, 'n_estimators': 400}
Best score (Neg. MAE):  -36.463945340580196


In [None]:
## TODO: which measures can be taken to prevent overfitting?