In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import balanced_accuracy_score
# Folder configuration
# ==============================================================================
from os import path
import sys
new_path = '../../scripts/'
if new_path not in sys.path:
    sys.path.append(new_path)

In [3]:

class MyAutoMLClassifier:
  """Minimal AutoML classifier: a randomized search over scikit-learn pipelines.

  Every candidate is a three-step pipeline:

  1. ``preprocessor`` - numerical columns are imputed and (optionally) scaled;
     object/category/bool columns are imputed with the most frequent value and
     one-hot encoded.
  2. ``feature_selector`` - ``SelectKBest(f_classif)``.
  3. ``estimator`` - logistic regression, k-nearest neighbors, random forest,
     gradient boosting, decision tree or linear SVM.

  Parameters
  ----------
  scoring_function : str or callable, default='balanced_accuracy'
      Forwarded to ``RandomizedSearchCV`` as ``scoring``.
  n_iter : int, default=50
      Number of candidates sampled by the search.
  cv : int or CV splitter, default=5
      Forwarded to ``RandomizedSearchCV`` as ``cv``.
  random_state : int, default=0
      Seed of the candidate sampler and of the seeded candidate estimators.
  n_jobs : int, default=-1
      Forwarded to ``RandomizedSearchCV`` as ``n_jobs``.
  verbose : int, default=3
      Forwarded to ``RandomizedSearchCV`` as ``verbose``.

  Attributes
  ----------
  best_estimator_ : Pipeline
      ``best_estimator_`` of the search; available after ``fit``.
  best_pipeline : dict
      ``best_params_`` of the search (a parameter dict, despite the name);
      available after ``fit``.
  """

  # Column dtypes routed to the categorical branch; every other dtype is numerical.
  _CATEGORICAL_DTYPES = ['object','category','bool']

  def __init__(self, scoring_function = 'balanced_accuracy', n_iter = 50,
               cv = 5, random_state = 0, n_jobs = -1, verbose = 3):
    self.scoring_function = scoring_function
    self.n_iter = n_iter
    self.cv = cv
    self.random_state = random_state
    self.n_jobs = n_jobs
    self.verbose = verbose

  def fit(self,X,y):
    """Run the randomized search on ``X``/``y`` and keep the best pipeline.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. A DataFrame is required because columns are routed to
        the numerical or categorical branch by dtype.
    y : array-like
        Class labels.

    Returns
    -------
    self : MyAutoMLClassifier
        The fitted instance, so calls can be chained.
    """
    preprocessor = self._build_preprocessor(X)
    model_pipeline = Pipeline([
      ('preprocessor',preprocessor),
      ('feature_selector',SelectKBest(f_classif,k='all')),
      ('estimator',LogisticRegression())
    ])

    # Number of columns after preprocessing (one-hot encoding widens the table);
    # it bounds the candidate values of SelectKBest's k.
    total_features = preprocessor.fit_transform(X).shape[1]

    search = RandomizedSearchCV(
      model_pipeline,
      self._build_search_space(total_features),
      n_iter = self.n_iter,
      scoring = self.scoring_function,
      n_jobs = self.n_jobs,
      random_state = self.random_state,
      verbose = self.verbose,
      cv = self.cv
    )

    search.fit(X, y)
    self.best_estimator_ = search.best_estimator_
    self.best_pipeline = search.best_params_
    return self

  def predict(self,X,y = None):
    """Predict class labels for ``X`` with the best pipeline; ``y`` is ignored."""
    return self.best_estimator_.predict(X)

  def predict_proba(self,X,y = None):
    """Predict class probabilities for ``X`` with the best pipeline; ``y`` is ignored.

    Delegates to the selected estimator, so it fails with ``AttributeError``
    when that estimator does not implement ``predict_proba``.
    """
    return self.best_estimator_.predict_proba(X)

  def _build_preprocessor(self, X):
    """Build the dtype-routed ColumnTransformer for the columns of ``X``."""
    cat_subset = X.select_dtypes(include = self._CATEGORICAL_DTYPES)

    # Fix the category list per column up front (positional access tolerates
    # duplicate column names) so every CV fold produces the same encoded columns.
    categorical_values = [
      list(cat_subset.iloc[:,i].dropna().unique())
      for i in range(cat_subset.shape[1])
    ]

    num_pipeline = Pipeline([
      ('cleaner',SimpleImputer()),
      ('scaler',StandardScaler())
    ])

    cat_pipeline = Pipeline([
      ('cleaner',SimpleImputer(strategy = 'most_frequent')),
      ('encoder',self._make_one_hot_encoder(categorical_values))
    ])

    return ColumnTransformer([
      ('numerical', num_pipeline, make_column_selector(dtype_exclude=self._CATEGORICAL_DTYPES)),
      ('categorical', cat_pipeline, make_column_selector(dtype_include=self._CATEGORICAL_DTYPES))
    ])

  @staticmethod
  def _make_one_hot_encoder(categories):
    """Return a dense-output OneHotEncoder on both old and new scikit-learn."""
    try:
      # scikit-learn >= 1.2 spells the option `sparse_output`.
      return OneHotEncoder(sparse_output = False, categories = categories)
    except TypeError:
      # Older releases only know the original `sparse` keyword.
      return OneHotEncoder(sparse = False, categories = categories)

  @staticmethod
  def _feature_count_options(total_features):
    """Candidate values for SelectKBest's k: 1, 6, 11, ... below ``total_features``, plus 'all'."""
    return list(range(1,total_features,5)) + ['all']

  def _build_search_space(self, total_features):
    """Return the list of parameter grids, one per candidate estimator family."""
    seed = self.random_state

    def shared(scalers):
      # Entries every estimator family has in common; only the scaler choice differs.
      return {
        'preprocessor__numerical__scaler': scalers,
        'preprocessor__numerical__cleaner__strategy': ['mean','median'],
        'feature_selector__k': self._feature_count_options(total_features),
      }

    def scaled():
      # Scale-sensitive models try three different scalers.
      return shared([RobustScaler(),StandardScaler(),MinMaxScaler()])

    def unscaled():
      # Tree-based models skip the scaling step.
      return shared([None])

    return [
      # Logistic regression
      dict(scaled(),
           estimator = [LogisticRegression()]),
      # K-nearest neighbors
      dict(scaled(),
           estimator = [KNeighborsClassifier()],
           estimator__weights = ['uniform','distance'],
           estimator__n_neighbors = np.arange(1,20,1)),
      # Random Forest
      dict(unscaled(),
           estimator = [RandomForestClassifier(random_state=seed)],
           estimator__n_estimators = np.arange(5,500,10),
           estimator__criterion = ['gini','entropy']),
      # Gradient boosting
      dict(unscaled(),
           estimator = [GradientBoostingClassifier(random_state=seed)],
           estimator__n_estimators = np.arange(5,500,10),
           estimator__learning_rate = np.linspace(0.1,0.9,20)),
      # Decision tree
      dict(unscaled(),
           estimator = [DecisionTreeClassifier(random_state=seed)],
           estimator__criterion = ['gini','entropy']),
      # Linear SVM
      dict(scaled(),
           estimator = [LinearSVC(random_state = seed)],
           estimator__C = np.arange(0.1,1,0.1)),
    ]

In [4]:
# Data location
# ===============================================================================
# NOTE(review): `path` rebinds the name brought in by `from os import path` in the imports cell.

path = '../../data/'
file = 'raw/DelayedFlights.csv'

# Read the raw CSV into the working frame `d`.
d = pd.read_csv(f'{path}{file}')

In [5]:
# Discard the 'Unnamed: 0' column read from the CSV (presumably a saved row index - confirm).
d = d.drop(columns=['Unnamed: 0'])

In [6]:
# Keep only the target (ArrDelay) and the columns used as features further down.
d = d.loc[:,["ArrDelay","ArrTime","AirTime", "Distance", "TaxiIn", "TaxiOut", "DayOfWeek", "DepDelay","CarrierDelay", 'UniqueCarrier']]

In [7]:
# Work on a tiny random sample (0.001% of the rows); the fixed seed makes it reproducible.
d = d.sample(frac=1e-5, random_state=6858)

In [8]:
# Show the sampled frame as the cell output.
d

Unnamed: 0,ArrDelay,ArrTime,AirTime,Distance,TaxiIn,TaxiOut,DayOfWeek,DepDelay,CarrierDelay,UniqueCarrier
32721,81.0,1417.0,56.0,370,6.0,13.0,2,92.0,0.0,XE
480545,26.0,1932.0,100.0,317,12.0,17.0,4,8.0,8.0,EV
879313,20.0,1031.0,302.0,2105,8.0,18.0,1,13.0,0.0,DL
1670776,16.0,1731.0,42.0,236,7.0,14.0,6,29.0,0.0,UA
1464310,173.0,2322.0,280.0,2367,6.0,10.0,5,176.0,0.0,UA
831723,20.0,1800.0,45.0,296,4.0,24.0,1,22.0,5.0,MQ
1770785,-16.0,2219.0,75.0,585,4.0,8.0,2,12.0,,WN
681387,17.0,1457.0,129.0,957,5.0,22.0,7,7.0,7.0,NW
1909246,6.0,1941.0,37.0,190,3.0,16.0,3,10.0,,AA
1338544,16.0,1712.0,97.0,678,5.0,34.0,4,6.0,0.0,US


In [9]:
# `categorical` is a project-local module, presumably resolved through the
# '../../scripts/' entry appended to sys.path in the imports cell.
import categorical 

# NOTE(review): presumably converts the "UniqueCarrier" column to a numeric
# encoding - confirm against categorical.transform.
d = categorical.transform(d, "UniqueCarrier")

In [10]:
# Treat +/-infinity as missing, then drop every row that still has a missing value.
# `d` is rebound to the cleaned result instead of being mutated in place, so the
# frame produced by the previous cell is never modified behind the reader's back.
d = d.replace([np.inf, -np.inf], np.nan).dropna()

In [11]:
# Features are every column except the arrival delay; the arrival delay is the target.
X = d.drop(columns='ArrDelay')
y = d['ArrDelay']

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [12]:
#%pip install -U scikit-learn


In [13]:
#%pip install auto-sklearn

In [14]:
# Baseline: let auto-sklearn search for a regressor of ArrDelay.
import autosklearn.regression
# ArrDelay is treated as a numeric target here, hence AutoSklearnRegressor;
# autosklearn.classification.AutoSklearnClassifier is the classification counterpart.
model = autosklearn.regression.AutoSklearnRegressor(ensemble_size=10, # number of models kept in the final ensemble
                                                 time_left_for_this_task=120, # total search budget, in seconds
                                                 per_run_time_limit=30) # time limit for a single model run, in seconds
model.fit(X_train, y_train) # run the search on the training split
print(model.sprint_statistics()) # search summary: metric, best validation score, run counts
y_predictions = model.predict(X_test) # predictions for the held-out split

auto-sklearn results:
  Dataset name: c658396e-c9fe-11eb-bac5-03f13631734a
  Metric: r2
  Best validation score: 0.962278
  Number of target algorithm runs: 48
  Number of successful target algorithm runs: 46
  Number of crashed target algorithm runs: 2
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0



In [None]:
# Rebuild the train/test split with the same data, test size and seed as before.
X = d.drop(columns='ArrDelay')
y = d['ArrDelay']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# NOTE(review): ArrDelay is modelled as a regression target in the auto-sklearn cell,
# but here it is handed to classifiers as class labels - confirm this is intended.
# Rebinding `model` replaces the auto-sklearn regressor fitted earlier.
model = MyAutoMLClassifier()
model.fit(X_train, y_train)

In [None]:
# Balanced accuracy of the selected pipeline on the held-out test split.
balanced_accuracy_score(y_test, model.predict(X_test))

In [None]:
# Parameters of the winning candidate (fit() stores the search's best_params_ here).
model.best_pipeline