In [66]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import evalml

In [90]:
# Load the training data, the test data and the sample submission template
# from CSV files in the current working directory.
train, test, submission = (
    pd.read_csv("train.csv"),
    pd.read_csv("test.csv"),
    pd.read_csv("sample_submission.csv"),
)

In [91]:
# Preview the first rows of the raw training frame (the cell's last
# expression is what gets displayed).
train.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [92]:
# One-hot encode Gender with the SAME category levels for both frames.
# Encoding each frame independently lets the dummy columns diverge when one
# file is missing a level, and with drop_first=True it can silently drop a
# different baseline level in test than in train. Fixing the levels from the
# training data keeps the encoded columns identical in name, order and meaning.
# Levels unseen in training become NaN and therefore encode as all-zero dummies.
#
# NOTE: this cell consumes the 'Gender' column, so re-running it without first
# re-running the loading cell raises a KeyError.
gender_dtype = pd.CategoricalDtype(sorted(train['Gender'].dropna().unique()))

train = pd.get_dummies(
    train.assign(Gender = train['Gender'].astype(gender_dtype)),
    columns = ['Gender'],
    drop_first = True,
)
test = pd.get_dummies(
    test.assign(Gender = test['Gender'].astype(gender_dtype)),
    columns = ['Gender'],
    drop_first = True,
)

In [93]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error

In [94]:
def NMAE(true, pred) -> float:
    """Return the normalized mean absolute error of ``pred`` against ``true``.

    The score is ``mean(|true - pred|) / mean(|true|)``.

    Both inputs are converted to NumPy float arrays before subtracting, so the
    comparison is strictly positional. Without this, two pandas Series with
    different index labels (for example a shuffled validation target and
    freshly indexed predictions) would be aligned by label and yield a
    silently wrong score; plain Python lists would raise ``TypeError``.

    Parameters
    ----------
    true : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, in the same order as ``true``.

    Returns
    -------
    float
        The normalized MAE. If ``mean(|true|)`` is zero the result follows
        NumPy's division semantics (``inf`` or ``nan``).
    """
    true_arr = np.asarray(true, dtype = float)
    pred_arr = np.asarray(pred, dtype = float)

    mae = np.mean(np.abs(true_arr - pred_arr))
    score = mae / np.mean(np.abs(true_arr))
    return float(score)

In [123]:
# Feature matrix: every training column except the row identifier and the label.
X = train.drop(columns = ['id', 'Target'])

# Model the label on a log1p scale; predictions are mapped back with expm1 later.
y = np.log1p(train['Target'])

In [124]:
# Test-set features, restricted to (and ordered like) the training feature columns.
# Despite its name, `target` holds model inputs, not labels.
target = test.loc[:, X.columns]

In [125]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [126]:
from evalml.automl import AutoMLSearch

In [127]:
# Configure an AutoML regression search on the training split only, ranked by
# MAE (on the log1p-transformed target) with MSE tracked as an extra metric.
# NOTE(review): optimize_thresholds presumably only matters for binary
# classification -- confirm against the evalml docs.
automl = AutoMLSearch(
    X_train = X_train,
    y_train = y_train,
    problem_type = 'regression',
    objective = 'MAE',
    additional_objectives = ['MSE'],
    optimize_thresholds = True,
)

In [128]:
# Run the search configured on the training split.
automl.search()

In [129]:
# Keep the search's best pipeline; it is used below to predict on the validation split.
pipeline = automl.best_pipeline

In [130]:
# Look up the id in the first row of the rankings table and print that
# pipeline's description.
best_pipeline_id = automl.rankings.iloc[0]['id']
automl.describe_pipeline(best_pipeline_id)


**************************************
* Random Forest Regressor w/ Imputer *
**************************************

Problem Type: regression
Model Family: Random Forest

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. Random Forest Regressor
	 * n_estimators : 100
	 * max_depth : 6
	 * n_jobs : -1

Training
Training for regression problems.
Total training time (including CV): 0.4 seconds

Cross Validation
----------------
              MAE   MSE # Training # Validation
0           0.141 0.038        668          334
1           0.139 0.033        668          334
2           0.144 0.037        668          334
mean        0.141 0.036          -            -
std         0.002 0.002          -            -
coef of var 0.016 0.065          -            -


In [131]:
# The pipeline was fit on log1p(Target), so predictions come back on the log
# scale; expm1 maps them to the original target scale.
log_pred_val = pipeline.predict(X_val)
pred = np.expm1(log_pred_val)

In [132]:
# Validation NMAE on the original target scale: y_val holds log1p(Target), so
# it is mapped back with expm1 to match `pred`.
NMAE(np.expm1(y_val), pred)

0.14155640938074374

In [133]:
# Re-configure the search on the full labelled data (X, y) so the model used
# for the submission sees every training row. Same settings as the earlier
# search on the training split.
automl = AutoMLSearch(
    X_train = X,
    y_train = y,
    problem_type = 'regression',
    objective = 'MAE',
    additional_objectives = ['MSE'],
    optimize_thresholds = True,
)

In [134]:
# Run the search configured on the full labelled data.
automl.search()

In [135]:
# Replace `pipeline` with the best pipeline from the full-data search; it is
# used below to predict on the test features.
pipeline = automl.best_pipeline

In [136]:
# Look up the id in the first row of the full-data rankings table and print
# that pipeline's description.
best_pipeline_id = automl.rankings.iloc[0]['id']
automl.describe_pipeline(best_pipeline_id)


**************************************
* Random Forest Regressor w/ Imputer *
**************************************

Problem Type: regression
Model Family: Random Forest

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. Random Forest Regressor
	 * n_estimators : 100
	 * max_depth : 6
	 * n_jobs : -1

Training
Training for regression problems.
Total training time (including CV): 0.5 seconds

Cross Validation
----------------
              MAE   MSE # Training # Validation
0           0.138 0.033        835          418
1           0.144 0.037        835          418
2           0.128 0.028        836          417
mean        0.137 0.033          -            -
std         0.009 0.005          -            -
coef of var 0.062 0.143          -            -


In [137]:
# Predict on the test features (log scale), then map back to the original
# target scale with expm1.
log_pred_test = pipeline.predict(target)
pred = np.expm1(log_pred_test)

In [138]:
# Write the test-set predictions into the submission frame's Target column.
# NOTE(review): if `pred` is a pandas Series this assignment aligns on index
# labels -- confirm it shares the submission frame's index.
submission['Target'] = pred

In [139]:
# Save the submission file without writing the DataFrame index as a column.
submission.to_csv('evalml_3rd.csv', index = False)