In [None]:
from datetime import time, datetime
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, max_error, mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt

from aml import AutoMLRegressor
import math
import logging
# getLogger() without a name returns the root logger; it is set to DEBUG so
# that every record of DEBUG severity or higher passes the level check.
logger = logging.getLogger()
logger.setLevel(level=logging.DEBUG)

In [None]:
# Re-fetch the dataset from the internet if needed (uncomment the lines below to regenerate data/auto-mpg.csv)
# dataset_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
# column_names = ["mpg", "cylinders", "displacement","horsepower","weight","acceleration","model year", "origin"]
# raw_dataset = pd.read_csv(dataset_path,names = column_names, sep=" ", na_values="?", skipinitialspace=True, comment="\t")
# raw_dataset.to_csv('data/auto-mpg.csv', index=False)
# raw_dataset.dtypes

In [None]:
# Read the cached CSV and keep only the rows that have no missing values.
raw_dataset = pd.read_csv('data/auto-mpg.csv').dropna()

# Features are every column except the target; the target is flattened
# from a one-column frame into a 1-D array.
X = raw_dataset.drop(columns=['mpg'])
y = np.ravel(raw_dataset[['mpg']])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Name of the target column (used for the plot axis labels).
chosen_label = 'mpg'

# Column groups passed to the model's fit call as `categorical` / `numeric`.
categorical_features = ['model year', 'origin']
numeric_features = [
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
]

In [None]:
import matplotlib.pyplot as plt

def show_test_scores(m, X=None, y=None, plot=False):
    """Print evaluation metrics for every entry in ``m.best_models``.

    For each entry the function prints the class of the last pipeline step,
    the pipeline itself, the associated parameters, and the MAE, RMSE and
    R2 score of the pipeline's predictions on the evaluation data.

    Parameters
    ----------
    m : object
        Fitted model exposing a ``best_models`` iterable. Each item is
        unpacked as ``(pipe, params, *rest)``; ``pipe`` must support
        ``pipe[-1]`` and ``pipe.predict``.
    X : optional
        Features to evaluate on. When ``None`` (the default) the
        notebook-level ``X_test`` is used, matching the original behaviour.
    y : optional
        True target values for ``X``. When ``None`` (the default) the
        notebook-level ``y_test`` is used.
    plot : bool, default False
        When true, also call ``plot_true_vs_pred`` for each pipeline.
        The default keeps the original print-only behaviour.
    """
    # Resolve the evaluation data once; the globals are only touched when the
    # caller did not supply explicit data.
    X_eval = X_test if X is None else X
    y_eval = y_test if y is None else y

    for pipe, params, *_ in m.best_models:
        # The last pipeline step is the final estimator (a regressor in this notebook).
        print("Model Type:", pipe[-1].__class__)
        print("Default pipeline parameters:", str(pipe))
        print("Best pipeline parameters:", str(params))

        y_pred = pipe.predict(X_eval)

        mae = mean_absolute_error(y_eval, y_pred)
        print(f"MAE: {mae}")
        mse = mean_squared_error(y_eval, y_pred)
        # RMSE is reported rather than MSE so the error is in the target's units.
        print(f"RMSE: {math.sqrt(mse)}")
        r2 = r2_score(y_eval, y_pred)
        print(f"R2 score: {r2}")

        # Plot predictions vs ground truth (opt-in).
        if plot:
            plot_true_vs_pred(y_eval, y_pred)

def plot_true_vs_pred(y_test,y_pred):
    """Scatter predictions against true values and overlay the identity line.

    Opens a new 8x8 figure, plots ``y_pred`` against ``y_test``, labels the
    axes using the notebook-level ``chosen_label``, and draws a straight line
    from ``(min, min)`` to ``(max, max)`` of ``y_test``.

    Parameters
    ----------
    y_test : array-like
        True target values (x axis).
    y_pred : array-like
        Predicted target values (y axis).
    """
    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, y_pred)
    plt.xlabel('True ' + chosen_label)
    plt.ylabel('Predicted ' + chosen_label)
    # plt.axis('square')  # kept disabled, as in the original

    # The reference line spans the range of the true values only.
    lower, upper = np.min(y_test), np.max(y_test)
    diagonal = [lower, upper]
    plt.plot(diagonal, diagonal)

In [None]:
# Record the wall-clock start so the total duration can be reported at the end.
start_time = datetime.now()
print(f'Started at {start_time}')

# Only the try_HGB flag is enabled; all other try_* candidates are switched off.
# NOTE(review): the meaning of the positional 'mse' and 100 arguments is defined
# by aml.AutoMLRegressor and is not visible here; confirm against that class.
model = AutoMLRegressor(
    'mse',
    100,
    try_LR=False,
    try_DT=False,
    try_RF=False,
    try_HGB=True,
    try_GB=False,
    try_SVC=False,
    try_MLP=False,
)

# Fit with explicit categorical / numeric column groups.
model.fit(
    X_train,
    y_train,
    categorical=categorical_features,
    numeric=numeric_features,
)
# Alternative kept from the original: model.fit(X_train, y_train)
show_test_scores(model)

# The reported duration covers both fitting and scoring.
end_time = datetime.now()
print('Done.')
print(f'Duration: {end_time - start_time}')