In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the labelled invertebrate dataset from the dphi-official/Datasets GitHub repository.
invertebrate_data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dphi-official/Datasets/master/Invertebrate/Invertebrate_dataset.csv'
)
# Preview the first five rows as a quick sanity check of the columns.
invertebrate_data.head(n=5)

Unnamed: 0,SWI,SWF,temperature,size,management,duration
0,0.59,1.3,3.5,0.5,2,20
1,1.21,1.85,4.6,47.2,3,19
2,1.08,0.97,4.8,53.3,4,22
3,1.6,1.67,5.2,27.2,8,20
4,2.95,2.41,5.7,38.8,6,22


In [3]:
# Print column names, non-null counts and dtypes of the training frame.
invertebrate_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SWI          280 non-null    float64
 1   SWF          280 non-null    float64
 2   temperature  280 non-null    float64
 3   size         280 non-null    float64
 4   management   280 non-null    int64  
 5   duration     280 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 13.2 KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
invertebrate_data.describe()

Unnamed: 0,SWI,SWF,temperature,size,management,duration
count,280.0,280.0,280.0,280.0,280.0,280.0
mean,1.896714,1.480679,16.3725,41.138214,4.082143,31.307143
std,0.610995,0.476403,3.619312,19.434496,2.520959,4.198304
min,0.48,0.25,3.5,0.5,0.0,19.0
25%,1.5075,1.19,14.1,27.275,2.0,29.0
50%,1.825,1.495,17.05,41.1,4.0,32.0
75%,2.2825,1.79,19.125,54.725,6.0,34.0
max,3.79,2.84,21.6,94.2,8.0,41.0


In [5]:
# SWI is the regression target; every remaining column is used as a feature.
y = invertebrate_data.loc[:, 'SWI']
X = invertebrate_data.drop(columns=['SWI'])

In [8]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=53,
)

In [83]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares with scikit-learn's default settings.
# NOTE(review): the saved output of this cell reports fit_intercept=False,
# which this call does not set -- the output looks stale; re-run to refresh it.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [84]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Score the linear model on the held-out split.
lr_pred = lr.predict(X_test)

mae_lr = mean_absolute_error(y_test, lr_pred)
mse_lr = mean_squared_error(y_test, lr_pred)
r2_lr = r2_score(y_test, lr_pred)
rmse_lr = np.sqrt(mse_lr)  # RMSE is the square root of the MSE computed above

# One print call, one metric per line (same text as four separate prints).
print(
    "MAE of the LR model is: {}".format(mae_lr),
    "MSE of the LR model is: {}".format(mse_lr),
    "R2 score of the LR model is: {}".format(r2_lr),
    "RMSE of the LR model is: {}".format(rmse_lr),
    sep="\n",
)

MAE of the LR model is: 0.31824985599320293
MSE of the LR model is: 0.1511183733577655
R2 score of the LR model is: 0.5553969069362182
RMSE of the LR model is: 0.3887394671984895


In [36]:
from sklearn.tree import DecisionTreeRegressor

# Shallow regression tree (depth 2) with a fixed seed.
# max_features=None considers every feature at each split, which is what the
# former 'auto' alias meant for regression trees. 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, so spelling it as None keeps this cell
# runnable on current releases without changing the fitted model.
dtr = DecisionTreeRegressor(max_depth=2, max_features=None, random_state=53)
dtr.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=53, splitter='best')

In [69]:
# Score the decision-tree model on the held-out split.
dtr_pred = dtr.predict(X_test)

mae_dtr = mean_absolute_error(y_test, dtr_pred)
mse_dtr = mean_squared_error(y_test, dtr_pred)
r2_dtr = r2_score(y_test, dtr_pred)
rmse_dtr = np.sqrt(mse_dtr)  # RMSE is the square root of the MSE computed above

# Report the tree's own MAE (mae_dtr). The previous version formatted mae_lr
# here, which printed the linear model's error under the "DTR" label.
print("MAE of the DTR model is: {}".format(mae_dtr))
print("MSE of the DTR model is: {}".format(mse_dtr))
print("R2 score of the DTR model is: {}".format(r2_dtr))
print("RMSE of the DTR model is: {}".format(rmse_dtr))

MAE of the DTR model is: 0.3069470726441152
MSE of the DTR model is: 0.19442435270895664
R2 score of the DTR model is: 0.4279870365155449
RMSE of the DTR model is: 0.4409357693689146


In [43]:
from sklearn.ensemble import RandomForestRegressor

In [77]:
# Random forest of depth-4 trees with a fixed seed.
# max_features=None uses every feature at each split, matching what 'auto'
# meant for forest regressors. 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, so None keeps the cell runnable on current releases.
# oob_score=True additionally computes the out-of-bag estimate during fit
# (exposed as rfr.oob_score_); no visible cell reads it.
rfr = RandomForestRegressor(
    max_depth=4,
    max_features=None,
    random_state=42,
    oob_score=True,
)
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=4, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=True,
                      random_state=42, verbose=0, warm_start=False)

In [78]:
# Score the random-forest model on the held-out split.
rfr_pred = rfr.predict(X_test)

mae_rfr = mean_absolute_error(y_test, rfr_pred)
mse_rfr = mean_squared_error(y_test, rfr_pred)
r2_rfr = r2_score(y_test, rfr_pred)
rmse_rfr = np.sqrt(mse_rfr)  # RMSE is the square root of the MSE computed above

# One print call, one metric per line (same text as four separate prints).
print(
    "MAE of the RFR model is: {}".format(mae_rfr),
    "MSE of the RFR model is: {}".format(mse_rfr),
    "R2 score of the RFR model is: {}".format(r2_rfr),
    "RMSE of the RFR model is: {}".format(rmse_rfr),
    sep="\n",
)

MAE of the RFR model is: 0.2936454352787597
MSE of the RFR model is: 0.14562044395692522
R2 score of the RFR model is: 0.5715722823240469
RMSE of the RFR model is: 0.38160246848903534


In [104]:
from xgboost import XGBRegressor

# Gradient-boosted trees of depth 1 with a learning rate of 0.333;
# every other hyperparameter is left at the library default.
xgb_model = XGBRegressor(
    max_depth=1,
    learning_rate=0.333,
)
xgb_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.333, max_delta_step=0, max_depth=1,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [105]:
# Score the XGBoost model on the held-out split.
xgb_pred = xgb_model.predict(X_test)

mae_xgb = mean_absolute_error(y_test, xgb_pred)
mse_xgb = mean_squared_error(y_test, xgb_pred)
r2_xgb = r2_score(y_test, xgb_pred)
rmse_xgb = np.sqrt(mse_xgb)  # RMSE is the square root of the MSE computed above

# One print call, one metric per line (same text as four separate prints).
print(
    "MAE of the XGB model is: {}".format(mae_xgb),
    "MSE of the XGB model is: {}".format(mse_xgb),
    "R2 score of the XGB model is: {}".format(r2_xgb),
    "RMSE of the XGB model is: {}".format(rmse_xgb),
    sep="\n",
)

MAE of the XGB model is: 0.30101592801866073
MSE of the XGB model is: 0.14220237759111448
R2 score of the XGB model is: 0.5816285239627719
RMSE of the XGB model is: 0.37709730520266843


In [38]:
# Load the unlabelled rows that the fitted models will score.
test_new = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dphi-official/Datasets/master/Invertebrate/Invertebrate_new_test_data.csv'
)
# Confirm the feature columns and dtypes before predicting on this frame.
test_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SWF          120 non-null    float64
 1   temperature  120 non-null    float64
 2   size         120 non-null    float64
 3   management   120 non-null    int64  
 4   duration     120 non-null    int64  
dtypes: float64(3), int64(2)
memory usage: 4.8 KB


In [39]:
# Predict the target for the unlabelled rows with the fitted linear model.
lr_predictions = lr.predict(test_new)

In [40]:
# Wrap the linear-model predictions for the new test data in a one-column
# frame. It shares test_new's index so each prediction can be matched back to
# its input row when the results are compared.
res = pd.DataFrame(lr_predictions, index=test_new.index, columns=["prediction"])
res.to_csv("prediction_results_lr_pred_model.csv")

In [41]:
# Predict the target for the unlabelled rows with the fitted decision tree.
dtr_predictions = dtr.predict(test_new)

In [42]:
# Wrap the decision-tree predictions for the new test data in a one-column
# frame. It shares test_new's index so each prediction can be matched back to
# its input row when the results are compared.
res2 = pd.DataFrame(dtr_predictions, index=test_new.index, columns=["prediction"])
res2.to_csv("prediction_results_dtr_pred_model.csv")

In [106]:
# Predict the target for the unlabelled rows with the fitted XGBoost model.
xgb_predictions = xgb_model.predict(test_new)

In [107]:
# Wrap the XGBoost predictions for the new test data in a one-column frame.
# It shares test_new's index so each prediction can be matched back to its
# input row when the results are compared.
res3 = pd.DataFrame(xgb_predictions, index=test_new.index, columns=["prediction"])
res3.to_csv("prediction_results_xgb_pred_model.csv")