In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# Regression

In [1]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=100)

In [24]:
import pandas as pd

df = pd.read_csv('wavelet.csv')

irr_columns = ['irreg_y_right', 'irreg_z_right', 'irreg_y_left', 'irreg_z_left']
X = df[[c for c in df.columns if c not in irr_columns]].to_numpy()
y = df[irr_columns].to_numpy()

In [28]:
import time
import numpy as np
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

In [29]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Elastic Net': ElasticNet(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Simpler Neural Network': MLPRegressor(early_stopping=True),
    'XGBoost': XGBRegressor(verbosity=0)
}

parameters = {
    'Ridge': {'alpha': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3]},
    'Lasso': {'alpha': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3]},
    'Elastic Net': {'alpha': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3]},
    'Random Forest Regressor': {'bootstrap':[True, False], 'n_estimators':[10, 30, 100, 300]},
    'Simpler Neural Network': {'hidden_layer_sizes': [30, 100, 300], 'activation': ['logistic', 'tanh', 'relu']}
}

In [30]:
import pickle

def experimental(X, y, filename=None):
    ans = {}
    for c in ['Linear Regression', 'XGBoost']:
        start = time.process_time()
        
        pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', models[c])])
        ans[c] = cross_validate(pipeline, X, y, scoring='r2')
        print('Elapsed time of {} is {:.6f} seconds.'.format(c, time.process_time() - start))

    for c in ['Ridge', 'Lasso', 'Elastic Net', 'Random Forest Regressor', 'Simpler Neural Network']:
        start = time.process_time()
        
        clf = Pipeline([('transformer', StandardScaler()), 
                        ('estimator', GridSearchCV(models[c], param_grid=parameters[c]))]).fit(X, y) # Grid search
        
        pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', clf)])
        ans[c] = cross_validate(pipeline, X, y, scoring='r2')
        print('Elapsed time of {} is {:.6f} seconds.'.format(c, time.process_time() - start))
    
    pickle.dump(ans, open(filename, "wb" ))
    return ans

In [None]:
ans = experimental(X, y, 'wavelet')

Elapsed time of Linear Regression is 61.585555 seconds.
Elapsed time of XGBoost is 2.523960 seconds.
Elapsed time of Ridge is 438.911417 seconds.
Elapsed time of Lasso is 1273.599287 seconds.
Elapsed time of Elastic Net is 2010.287056 seconds.


# Classification

In [11]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=100)

In [13]:
import time
import numpy as np
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, GridSearchCV

In [14]:
n_clusters = len(np.unique(y))

models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'KMeans': KMeans(n_clusters),
    'Knn': KNeighborsClassifier(weights='uniform'),
    'DistKnn': KNeighborsClassifier(weights='distance'),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
}

parameters = {
    'Knn': {'n_neighbors': [1, 3, 5, 7]},
    'DistKnn': {'n_neighbors': [1, 3, 5, 7]},
    'Decision Tree': {'max_depth': [None, 3, 5, 10]},
    'Random Forest': {'n_estimators': [10, 20, 50, 100]},
}

In [15]:
def experimental(X, y):
    ans = {}
    for c in ['Gaussian Naive Bayes', 'KMeans']:
        start = time.process_time()

        pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', models[c])])
        ans[c] = cross_validate(pipeline, X, y)
        print('Elapsed time of {} is {:.6f} seconds.'.format(c, time.process_time() - start))

    for c in ['Knn', 'DistKnn', 'Decision Tree', 'Random Forest']:
        start = time.process_time()

        clf = Pipeline([('transformer', StandardScaler()), 
                        ('estimator', GridSearchCV(models[c], param_grid=parameters[c]))]).fit(X, y) # Grid search

        pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', clf)])
        ans[c] = cross_validate(pipeline, X, y)
        print('Elapsed time of {} is {:.6f} seconds.'.format(c, time.process_time() - start))

    return ans

In [16]:
experimental(X, y)

Elapsed time of Gaussian Naive Bayes is 0.025817 seconds.
Elapsed time of KMeans is 0.074035 seconds.
Elapsed time of Knn is 0.214458 seconds.
Elapsed time of DistKnn is 0.157464 seconds.
Elapsed time of Decision Tree is 0.270921 seconds.
Elapsed time of Random Forest is 6.158460 seconds.


{'Gaussian Naive Bayes': {'fit_time': array([0.00452209, 0.00220418, 0.00186729, 0.00128984, 0.00109076]),
  'score_time': array([0.00069594, 0.00064611, 0.00067353, 0.00033283, 0.00031567]),
  'test_score': array([0.9 , 0.95, 0.85, 0.9 , 0.75])},
 'KMeans': {'fit_time': array([0.03517818, 0.01132059, 0.01212883, 0.01143217, 0.01163697]),
  'score_time': array([0.00517702, 0.00034595, 0.00043225, 0.00040507, 0.0003314 ]),
  'test_score': array([-2140.35689605, -2051.84410704, -1971.86442082, -2334.79903958,
         -1983.07186867])},
 'Knn': {'fit_time': array([0.03104281, 0.03134084, 0.03825474, 0.03026414, 0.03198004]),
  'score_time': array([0.00211573, 0.00107145, 0.00161767, 0.00116372, 0.00168037]),
  'test_score': array([0.8 , 0.75, 0.75, 0.7 , 0.8 ])},
 'DistKnn': {'fit_time': array([0.02557206, 0.02396393, 0.02423024, 0.02470303, 0.0237546 ]),
  'score_time': array([0.00068855, 0.000705  , 0.0006988 , 0.00070715, 0.00067711]),
  'test_score': array([0.8 , 0.75, 0.75, 0.7 , 0.