In [11]:
from typing_extensions import Literal
import autosklearn.classification
import autosklearn.regression
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

import pandas as pd

Defining RMSE and F1 score for scoring

In [2]:
from sklearn.metrics import mean_squared_error, f1_score

Defining selection of scoring function and target

In [3]:
def select_config(dataset: Literal["college", "phishing"]):
    """Decides according to instructions what is target and scorer, trows error if unknown dataset"""
    # For College Dataset, Goal: RMSE, we need to build own scorer, since only MSE exists per string default
    if dataset == "college":
        target_name = "percent_pell_grant"
        used_scorer = lambda x,y: mean_squared_error(x, y, squared=False)
    # For Phising Dataset Goal: F1 score
    elif dataset == "phishing":
        target_name = "Result"
        used_scorer = f1_score
    else:
        raise NameError("We don't got this dataset!")
    return (target_name, used_scorer)

Generating train test split

In [37]:
def generate_data(dataset: Literal["college", "phishing"]):
    target_name, used_scorer = select_config(dataset)
    df = pd.read_csv(f"{dataset}.csv", quotechar="'", na_values="?") # , na_values="?"
    if dataset == "college":
        to_drop = ["school_name", "city", "zip", "school_webpage"]
        to_change = ["state", "predominant_degree","highest_degree","ownership","region","gender","carnegie_basic_classification","carnegie_undergraduate","carnegie_size","religious_affiliation"]
        df.drop(to_drop, axis=1, inplace=True)
        df = pd.get_dummies(df, prefix=to_change)
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    return sklearn.model_selection.train_test_split(X, y)

Using defined functions for data and scorer

In [47]:
X_train, X_test, y_train, y_test = generate_data("phishing")
_, scorer = select_config("phishing")

Using AutoML

In [6]:
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=600)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("Accuracy score", scorer(y_test, y_hat))

Accuracy score 0.9670821348673698


In [48]:
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=600, seed=2)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("Accuracy score", scorer(y_test, y_hat))

Accuracy score 0.9724182168056447


In [49]:
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=600, seed=3)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("Accuracy score", scorer(y_test, y_hat))

Accuracy score 0.9724535554131967


Doing same for the college

No support for strings (objects), see in generate_data function the according columns

Using One Hot Encoding for categorical and removin colums having more than 100 nunique values

In [26]:
for x in to_change:
    print(f"{x}: {len(X_train[x].unique())}")

school_name: 5235
city: 2083
state: 56
zip: 4674
school_webpage: 4336
predominant_degree: 4
highest_degree: 5
ownership: 3
region: 10
gender: 3
carnegie_basic_classification: 34
carnegie_undergraduate: 14
carnegie_size: 18
religious_affiliation: 53


In [45]:
X_train, X_test, y_train, y_test = generate_data("college")
_, scorer = select_config("college")

In [42]:
automl2 = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=600)
automl2.fit(X_train, y_train)
y_hat = automl2.predict(X_test)
print("Accuracy score", scorer(y_test, y_hat))

Accuracy score 0.14321043753742396


In [43]:
automl2 = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=600, seed=2)
automl2.fit(X_train, y_train)
y_hat = automl2.predict(X_test)
print("Accuracy score", scorer(y_test, y_hat))

Accuracy score 0.14317674337127595


In [46]:
automl2 = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=600, seed=4)
automl2.fit(X_train, y_train)
y_hat = automl2.predict(X_test)
print("Accuracy score", scorer(y_test, y_hat))

Accuracy score 0.14915458041167518
