In [1]:
from typing_extensions import Literal
import autosklearn.classification
import autosklearn.regression
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

import pandas as pd

Importing the scikit-learn metrics used for scoring: mean squared error (the basis for RMSE) and F1 score

In [2]:
from sklearn.metrics import mean_squared_error, f1_score

Defining selection of scoring function and target

In [3]:
def select_config(dataset: Literal["college", "phishing"]):
    """Return the target column name and the scoring function for a dataset.

    Args:
        dataset: ``"college"`` (scored with RMSE) or ``"phishing"`` (scored
            with the F1 score).

    Returns:
        A ``(target_name, used_scorer)`` tuple. ``used_scorer`` is called as
        ``used_scorer(y_true, y_pred)``.

    Raises:
        NameError: If ``dataset`` is not one of the known datasets.
    """
    # College dataset, goal: RMSE. Only MSE is available out of the box, so the
    # root is taken here.
    if dataset == "college":
        def rmse(y_true, y_pred):
            # Do not pass ``squared=False``: that keyword was deprecated in
            # scikit-learn 1.4 and removed afterwards, whereas the plain MSE
            # call works on every version.
            return mean_squared_error(y_true, y_pred) ** 0.5

        return ("percent_pell_grant", rmse)
    # Phishing dataset, goal: F1 score.
    if dataset == "phishing":
        return ("Result", f1_score)
    raise NameError("We don't got this dataset!")

Loading the pre-split train and test files and separating the features from the target

In [40]:
def generate_data(dataset: Literal["college", "phishing"]):
    """Load a dataset's train/test CSV files and split them into features and target.

    Reads ``{dataset}_train.csv`` and ``{dataset}_test.csv`` relative to the
    working directory, using ``'`` as the quote character and treating ``?`` as
    a missing value. For the college dataset the identifier-like text columns
    are dropped and the categorical columns are one-hot encoded on train and
    test together, so both splits end up with the same dummy columns.

    Args:
        dataset: ``"college"`` or ``"phishing"``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        NameError: Propagated from ``select_config`` for an unknown dataset.
    """
    # Only the target name is needed here; the scorer is fetched by the caller.
    target_name, _ = select_config(dataset)
    read_options = {"quotechar": "'", "na_values": "?"}
    train = pd.read_csv(f"{dataset}_train.csv", **read_options)
    test = pd.read_csv(f"{dataset}_test.csv", **read_options)
    if dataset == "college":
        to_drop = ["school_name", "city", "zip", "school_webpage"]
        to_change = [
            "state",
            "predominant_degree",
            "highest_degree",
            "ownership",
            "region",
            "gender",
            "carnegie_basic_classification",
            "carnegie_undergraduate",
            "carnegie_size",
            "religious_affiliation",
        ]
        n_train = len(train)
        combined = pd.concat([train, test]).drop(columns=to_drop)
        # Name the columns to encode explicitly. With only ``prefix=`` pandas
        # encodes whatever it inferred as object dtype and pairs prefixes by
        # position, which raises (or mislabels columns) as soon as one of the
        # categorical columns is read as numeric.
        combined = pd.get_dummies(combined, columns=to_change)
        # concat keeps row order, so the first n_train rows are the train split.
        train = combined.iloc[:n_train]
        test = combined.iloc[n_train:]
    X_train = train.drop(columns=target_name)
    y_train = train[target_name]
    X_test = test.drop(columns=target_name)
    y_test = test[target_name]
    return (X_train, X_test, y_train, y_test)

Using defined functions for data and scorer

In [37]:
# Load the phishing splits and fetch the matching scorer.
X_train, X_test, y_train, y_test = generate_data("phishing")
# Take the scorer by index instead of unpacking into `_`: at notebook top level
# `_` is the name IPython uses for the previous cell's output.
scorer = select_config("phishing")[1]

Using AutoML

In [14]:
# First phishing run: 10-minute auto-sklearn search. The seed is spelled out
# (1 is auto-sklearn's documented default) so the three runs differ visibly
# only in their seed.
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=600, seed=1)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
# `scorer` comes from select_config("phishing"), i.e. the F1 score, not accuracy.
print("F1 score", scorer(y_test, y_hat))

Accuracy score 0.9366492146596859


In [15]:
# Second phishing run: same 10-minute budget, seed 2.
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=600, seed=2)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
# `scorer` comes from select_config("phishing"), i.e. the F1 score, not accuracy.
print("F1 score", scorer(y_test, y_hat))

Accuracy score 0.9335078534031415


In [16]:
# Third phishing run: same 10-minute budget, seed 3.
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=600, seed=3)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
# `scorer` comes from select_config("phishing"), i.e. the F1 score, not accuracy.
print("F1 score", scorer(y_test, y_hat))

Accuracy score 0.9291338582677166


Doing the same for the college dataset

String (object) columns are not supported; see the `generate_data` function for the affected columns

Using one-hot encoding for the categorical columns and removing columns that have more than 100 unique values

In [24]:
# Print, for every text/categorical college column, how many distinct values
# it has in the train file and in the test file.
colege_train = pd.read_csv("college_train.csv", quotechar="'", na_values="?")
colege_test = pd.read_csv("college_test.csv", quotechar="'", na_values="?")
inspected_columns = [
    "school_name", "city", "zip", "school_webpage",
    "state", "predominant_degree", "highest_degree", "ownership", "region",
    "gender", "carnegie_basic_classification", "carnegie_undergraduate",
    "carnegie_size", "religious_affiliation",
]
for column in inspected_columns:
    n_train_values = len(colege_train[column].unique())
    n_test_values = len(colege_test[column].unique())
    print(f"{column}: {n_train_values} | {n_test_values}")

school_name: 4890 | 2091
city: 2088 | 991
zip: 4557 | 1849
school_webpage: 4396 | 1471
state: 59 | 53
predominant_degree: 4 | 4
highest_degree: 5 | 5
ownership: 3 | 3
region: 10 | 9
gender: 3 | 2
carnegie_basic_classification: 34 | 30
carnegie_undergraduate: 14 | 13
carnegie_size: 18 | 16
religious_affiliation: 56 | 20


In [41]:
# Load the college splits and fetch the matching scorer.
X_train, X_test, y_train, y_test = generate_data("college")
# Take the scorer by index instead of unpacking into `_`: at notebook top level
# `_` is the name IPython uses for the previous cell's output.
scorer = select_config("college")[1]

In [44]:
# First college run: 10-minute auto-sklearn regression search. The seed is
# spelled out (1 is auto-sklearn's documented default) so the three runs differ
# visibly only in their seed.
automl2 = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=600, seed=1)
automl2.fit(X_train, y_train)
y_hat = automl2.predict(X_test)
# `scorer` comes from select_config("college"), i.e. RMSE (lower is better), not accuracy.
print("RMSE", scorer(y_test, y_hat))

Accuracy score 0.19742449103953208


In [45]:
# Second college run: same 10-minute budget, seed 2.
automl2 = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=600, seed=2)
automl2.fit(X_train, y_train)
y_hat = automl2.predict(X_test)
# `scorer` comes from select_config("college"), i.e. RMSE (lower is better), not accuracy.
print("RMSE", scorer(y_test, y_hat))

Accuracy score 0.19675482738849895


In [46]:
# Third college run: same 10-minute budget, seed 4.
automl2 = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=600, seed=4)
automl2.fit(X_train, y_train)
y_hat = automl2.predict(X_test)
# `scorer` comes from select_config("college"), i.e. RMSE (lower is better), not accuracy.
print("RMSE", scorer(y_test, y_hat))

Accuracy score 0.1972553352485255
