# Load dataset

In [20]:
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
df

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion,logS
0,2.59540,167.850,0.0,0.000000,-2.180
1,2.37650,133.405,0.0,0.000000,-2.000
2,2.59380,167.850,1.0,0.000000,-1.740
3,2.02890,133.405,1.0,0.000000,-1.480
4,2.91890,187.375,1.0,0.000000,-3.040
...,...,...,...,...,...
1139,1.98820,287.343,8.0,0.000000,1.144
1140,3.42130,286.114,2.0,0.333333,-4.925
1141,3.60960,308.333,4.0,0.695652,-3.893
1142,2.56214,354.815,3.0,0.521739,-3.790


# Data Preparation

## Data separation as X and y

In [21]:
# separate the y and x from the y = f(x)
y = df['logS']
y


0      -2.180
1      -2.000
2      -1.740
3      -1.480
4      -3.040
        ...  
1139    1.144
1140   -4.925
1141   -3.893
1142   -3.790
1143   -2.581
Name: logS, Length: 1144, dtype: float64

In [22]:
X = df.drop('logS',axis = 1)
X

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion
0,2.59540,167.850,0.0,0.000000
1,2.37650,133.405,0.0,0.000000
2,2.59380,167.850,1.0,0.000000
3,2.02890,133.405,1.0,0.000000
4,2.91890,187.375,1.0,0.000000
...,...,...,...,...
1139,1.98820,287.343,8.0,0.000000
1140,3.42130,286.114,2.0,0.333333
1141,3.60960,308.333,4.0,0.695652
1142,2.56214,354.815,3.0,0.521739


## Data Splitting

In [23]:
# train and split data for the model
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)


# Model Building

## Linear Regression

###Training the model

In [24]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(X_train,y_train)

### Applying the model to make prediction

In [25]:
y_lr_train_predict = lr.predict(X_train)
y_lr_test_predict = lr.predict(X_test)

print(y_lr_train_predict)



[ -2.5755257   -7.18040072  -2.11707627  -2.20799967  -3.00008037
  -3.26387904  -5.07325955  -2.91585429  -1.07042987  -5.08003152
  -2.8997154   -1.69446129  -1.60860685  -2.72252337  -1.52573714
  -4.75893924  -6.47732392  -5.05362889  -1.63712822  -3.28466049
  -4.8603031   -2.39814715  -2.32464489  -2.05472547  -3.30010312
  -2.65538362  -1.59941678  -2.00104103  -3.56028984  -3.78936892
  -2.77958803  -4.72579029  -2.67115848  -1.80797697  -3.42655792
  -2.01327509  -1.52014141  -1.65747296  -2.14907287  -3.28706991
  -3.42071308  -2.43068169  -2.61829511  -5.44076948  -2.66114707
  -5.17959041  -0.79261764  -3.39189915  -4.16662088  -4.16150413
  -1.44832281  -2.05472547  -4.7496882   -6.56410797   0.6351185
  -5.86421975  -3.3177778   -3.35583368  -3.898149    -1.02543103
  -1.40853714  -5.08003152  -5.46013066  -3.92335948  -2.12909151
  -7.11934578  -2.56513128  -0.52896472  -4.65987768  -3.52845368
  -2.89673006  -5.40444674  -3.78218168  -3.95921963  -3.95390013
  -6.096090

In [26]:
print(y_lr_test_predict)

[-1.66223919 -0.94371161 -2.50623922 -1.4326952  -4.21786239 -4.23495453
 -2.34974873  0.65433301 -2.77958803  1.96533751 -2.03147266 -8.94711645
 -5.81233022 -3.72254972 -1.86128026 -4.4831273  -1.18926311 -5.01550123
 -0.77744795 -3.4553089  -2.05472547 -2.5755257  -0.30824045 -3.011901
 -2.61603483 -6.47732392 -0.27867707 -4.58687581 -2.72252337  0.92348149
 -7.24898541 -1.56452156 -3.66082486 -6.43880143 -3.98408504 -0.70651706
 -1.05034863 -4.23495453 -2.22603937 -5.4770705  -2.30805974  1.44326067
 -4.51269067 -2.72175045 -0.693019   -3.39179935 -5.77700393 -6.66860362
 -4.36140252 -1.8364382  -2.17188978 -2.91060133 -1.34957071 -4.38717919
 -5.1164761  -3.73343252 -1.10250351 -2.67529608  1.14219507 -4.47634634
 -1.90728014 -3.14585117 -2.39923054 -1.03606607 -3.83836016 -1.14546805
 -2.5755257  -8.59308364 -1.20074805 -1.62145942 -3.11024551 -3.06873821
 -2.64472473 -1.25754661 -3.97322346 -2.49784482 -3.09277946 -4.29216249
 -1.84126082 -4.89235634 -4.28643887 -7.88577487 -4.9

###Evaluate Model Performance

In [27]:
y_train

341   -2.54
147   -7.92
668   -2.10
211   -2.46
436   -2.51
       ... 
222   -2.09
93    -3.27
180   -2.82
613   -1.57
694   -1.80
Name: logS, Length: 915, dtype: float64

In [28]:
y_lr_train_predict

array([ -2.5755257 ,  -7.18040072,  -2.11707627,  -2.20799967,
        -3.00008037,  -3.26387904,  -5.07325955,  -2.91585429,
        -1.07042987,  -5.08003152,  -2.8997154 ,  -1.69446129,
        -1.60860685,  -2.72252337,  -1.52573714,  -4.75893924,
        -6.47732392,  -5.05362889,  -1.63712822,  -3.28466049,
        -4.8603031 ,  -2.39814715,  -2.32464489,  -2.05472547,
        -3.30010312,  -2.65538362,  -1.59941678,  -2.00104103,
        -3.56028984,  -3.78936892,  -2.77958803,  -4.72579029,
        -2.67115848,  -1.80797697,  -3.42655792,  -2.01327509,
        -1.52014141,  -1.65747296,  -2.14907287,  -3.28706991,
        -3.42071308,  -2.43068169,  -2.61829511,  -5.44076948,
        -2.66114707,  -5.17959041,  -0.79261764,  -3.39189915,
        -4.16662088,  -4.16150413,  -1.44832281,  -2.05472547,
        -4.7496882 ,  -6.56410797,   0.6351185 ,  -5.86421975,
        -3.3177778 ,  -3.35583368,  -3.898149  ,  -1.02543103,
        -1.40853714,  -5.08003152,  -5.46013066,  -3.92

In [29]:
from sklearn.metrics import mean_squared_error, r2_score
lr_train_mse = mean_squared_error(y_train,y_lr_train_predict)
lr_train_r2 = r2_score(y_train,y_lr_train_predict)

lr_test_mse = mean_squared_error(y_train,y_lr_train_predict)
lr_test_r2 = r2_score(y_train,y_lr_train_predict)

print("LR MSE (Train)",lr_train_mse)
print("LR R2 (Train)",lr_train_r2)
print("LR MSE (Test)",lr_test_mse)
print("LR R2 (Test)",lr_test_r2)



LR MSE (Train) 0.9984659705772158
LR R2 (Train) 0.7741608085418661
LR MSE (Test) 0.9984659705772158
LR R2 (Test) 0.7741608085418661


In [30]:
lr_results = pd.DataFrame(["Linear Regression",lr_train_mse,lr_train_r2,lr_test_mse,lr_test_r2]).transpose()
lr_results.columns = ["Method","Training MSE","Training R2","Testing MSE","Testing R2"]
lr_results

Unnamed: 0,Method,Training MSE,Training R2,Testing MSE,Testing R2
0,Linear Regression,0.998466,0.774161,0.998466,0.774161


## Random Forest

### Training the Model

In [31]:
from sklearn.ensemble import RandomForestRegressor  # regressor because y is quantitative and not classifier
rf = RandomForestRegressor(max_depth=2)
rf.fit(X_train,y_train)


### Applying the model to make prediction

In [32]:
y_rf_train_predict = rf.predict(X_train)
y_rf_test_predict = rf.predict(X_test)


### Evaluate Model Performance

In [33]:
from sklearn.metrics import mean_squared_error, r2_score
rf_train_mse = mean_squared_error(y_train,y_rf_train_predict)
rf_train_r2 = r2_score(y_train,y_rf_train_predict)

rf_test_mse = mean_squared_error(y_train,y_rf_train_predict)
rf_test_r2 = r2_score(y_train,y_rf_train_predict)



In [34]:
rf_results = pd.DataFrame(["Random Forest",rf_train_mse,rf_train_r2,rf_test_mse,rf_test_r2]).transpose()
rf_results.columns = ["Method","Training MSE","Training R2","Testing MSE","Testing R2"]
rf_results

Unnamed: 0,Method,Training MSE,Training R2,Testing MSE,Testing R2
0,Random Forest,1.028436,0.767382,1.028436,0.767382


## Model Comparison

In [36]:
df_models = pd.concat([lr_results,rf_results],axis=0).reset_index(drop=True)
df_models

Unnamed: 0,Method,Training MSE,Training R2,Testing MSE,Testing R2
0,Linear Regression,0.998466,0.774161,0.998466,0.774161
1,Random Forest,1.028436,0.767382,1.028436,0.767382
