In [1]:
import os
import sys

# Make the project root importable so the `src.*` and `scripts.*` imports below resolve.
# The location defaults to the original hard-coded checkout path; set the
# GRADU_PROJECT_ROOT environment variable to run the notebook from another machine
# or checkout without editing this cell.
PROJECT_ROOT = os.environ.get("GRADU_PROJECT_ROOT", "/home/jarlehti/projects/gradu")

# Guard against duplicates: re-running this cell must not keep growing sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

from src.utils.data_utils import transform_for_classification
from scripts.constants import TEST_DATASETS_FOR_DATASET, TARGET_COLUMNS_FOR_DATASET
from scripts.base_lr import run_logistic_regression_on_2d
from scripts.base_clf import run_classification
from sklearn.dummy import DummyClassifier 

In [3]:
# Project root, taken as the parent of the kernel's working directory.
# The earlier expression went through `os.path.abspath(__name__)`, but `__name__` is a
# module name (e.g. "__main__"), not a file path: it was resolved relative to the
# working directory, so the result was always the working directory's parent.
# Expressing that directly yields the same value without the misleading detour.
CURRENT_FOLDER = os.path.dirname(os.getcwd())

# Sub-folders of the project root named "results" and "plots".
RESULTS_FOLDER = os.path.join(CURRENT_FOLDER, "results")
PLOTS_FOLDER = os.path.join(CURRENT_FOLDER, "plots")

In [4]:
def get_datasets_data(dataset_name):
    """Read the train and test CSV files that belong to ``dataset_name``.

    The training file is ``data/datasets/<dataset_name>.csv`` under
    ``CURRENT_FOLDER``. The test file path is looked up in
    ``TEST_DATASETS_FOR_DATASET`` and joined onto ``CURRENT_FOLDER`` as well.

    Args:
        dataset_name: Key identifying the dataset; it must be present in
            ``TEST_DATASETS_FOR_DATASET``.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.

    Raises:
        KeyError: If ``dataset_name`` has no entry in ``TEST_DATASETS_FOR_DATASET``.
    """
    # Training data first, then the test-set lookup, so failures surface in
    # the same order as the files are needed.
    train_frame = pd.read_csv(
        os.path.join(CURRENT_FOLDER, f"data/datasets/{dataset_name}.csv")
    )
    test_frame = pd.read_csv(
        os.path.join(CURRENT_FOLDER, TEST_DATASETS_FOR_DATASET[dataset_name])
    )
    return train_frame, test_frame

In [9]:
def run_clf(dataset_name, train_df, test_df):
    """Transform a train/test pair and run the classification benchmark on it.

    Both frames are passed through ``transform_for_classification`` and must
    end up with exactly the same set of columns before being handed to
    ``run_classification`` together with the dataset's target column.

    Args:
        dataset_name: Dataset key; used by ``transform_for_classification``
            and to look up the target column in ``TARGET_COLUMNS_FOR_DATASET``.
        train_df: Raw training DataFrame.
        test_df: Raw test DataFrame.

    Returns:
        Whatever ``run_classification`` returns (the scores object).
        NOTE(review): its exact structure is defined in ``scripts.base_clf``
        and is not visible from this notebook.

    Raises:
        AssertionError: If the transformed train and test frames do not share
            the same set of column names.
        KeyError: If ``dataset_name`` has no entry in ``TARGET_COLUMNS_FOR_DATASET``.
    """
    train_df_transformed = transform_for_classification(dataset_name, train_df)
    test_df_transformed = transform_for_classification(dataset_name, test_df)

    # Sanity check: the model is fit on train columns and scored on test columns,
    # so the two sets must match (column order is not checked here).
    train_columns = set(train_df_transformed.columns)
    test_columns = set(test_df_transformed.columns)
    assert train_columns == test_columns, (
        "Train/test columns differ after transformation: "
        f"{sorted(train_columns ^ test_columns, key=str)}"
    )

    target_column: str = TARGET_COLUMNS_FOR_DATASET[dataset_name]
    return run_classification(train_df_transformed, test_df_transformed, target_column)

In [12]:
# Experiment 1: load the 'adult_no_discretization' train/test CSVs and run the
# classification benchmark on them. The "Model: ..." lines recorded in this cell's
# output are emitted while run_clf executes; the returned scores are kept in
# `no_discretization_scores`.
# Note: `dataset_name`, `train_df` and `test_df` are notebook-level names that the
# later experiment cells reassign.
dataset_name = 'adult_no_discretization'
train_df, test_df = get_datasets_data(dataset_name)
no_discretization_scores = run_clf(dataset_name, train_df, test_df)

Model: DummyClassifier 	 Accuracy: 0.754 (0.000), Balanced accuracy: 0.500 (0.000), F1: 0.000 (0.000)
Model: GradientBoostingClassifier 	 Accuracy: 0.859 (0.000), Balanced accuracy: 0.764 (0.000), F1: 0.668 (0.000)
Model: LGBMClassifier 	 Accuracy: 0.863 (0.000), Balanced accuracy: 0.785 (0.000), F1: 0.694 (0.000)
Model: XGBClassifier 	 Accuracy: 0.863 (0.000), Balanced accuracy: 0.785 (0.000), F1: 0.694 (0.000)
Model: RandomForestClassifier 	 Accuracy: 0.833 (0.000), Balanced accuracy: 0.755 (0.000), F1: 0.639 (0.000)




Model: LinearSVC 	 Accuracy: 0.812 (0.000), Balanced accuracy: 0.742 (0.000), F1: 0.612 (0.000)
Model: MLPClassifier 	 Accuracy: 0.806 (0.000), Balanced accuracy: 0.720 (0.000), F1: 0.582 (0.000)


In [13]:
# Experiment 2: same procedure for the 'adult_low_discretization' dataset.
# This overwrites the notebook-level `dataset_name`, `train_df` and `test_df`;
# the returned scores are kept in `low_discretization_scores`.
dataset_name = 'adult_low_discretization'
train_df, test_df = get_datasets_data(dataset_name)
low_discretization_scores = run_clf(dataset_name, train_df, test_df)

Model: DummyClassifier 	 Accuracy: 0.754 (0.000), Balanced accuracy: 0.500 (0.000), F1: 0.000 (0.000)
Model: GradientBoostingClassifier 	 Accuracy: 0.836 (0.000), Balanced accuracy: 0.737 (0.000), F1: 0.620 (0.000)
Model: LGBMClassifier 	 Accuracy: 0.835 (0.000), Balanced accuracy: 0.746 (0.000), F1: 0.629 (0.000)
Model: XGBClassifier 	 Accuracy: 0.833 (0.000), Balanced accuracy: 0.746 (0.000), F1: 0.629 (0.000)
Model: RandomForestClassifier 	 Accuracy: 0.823 (0.000), Balanced accuracy: 0.733 (0.000), F1: 0.607 (0.000)
Model: LinearSVC 	 Accuracy: 0.836 (0.000), Balanced accuracy: 0.739 (0.000), F1: 0.622 (0.000)
Model: MLPClassifier 	 Accuracy: 0.836 (0.000), Balanced accuracy: 0.740 (0.000), F1: 0.622 (0.000)




In [14]:
# Experiment 3: same procedure for the 'adult_high_discretization' dataset.
# This overwrites the notebook-level `dataset_name`, `train_df` and `test_df`;
# the returned scores are kept in `high_discretization_scores`.
dataset_name = 'adult_high_discretization'
train_df, test_df = get_datasets_data(dataset_name)
high_discretization_scores = run_clf(dataset_name, train_df, test_df)

Model: DummyClassifier 	 Accuracy: 0.754 (0.000), Balanced accuracy: 0.500 (0.000), F1: 0.000 (0.000)
Model: GradientBoostingClassifier 	 Accuracy: 0.830 (0.000), Balanced accuracy: 0.735 (0.000), F1: 0.613 (0.000)
Model: LGBMClassifier 	 Accuracy: 0.830 (0.000), Balanced accuracy: 0.742 (0.000), F1: 0.622 (0.000)
Model: XGBClassifier 	 Accuracy: 0.829 (0.000), Balanced accuracy: 0.734 (0.000), F1: 0.612 (0.000)
Model: RandomForestClassifier 	 Accuracy: 0.826 (0.000), Balanced accuracy: 0.729 (0.000), F1: 0.604 (0.000)
Model: LinearSVC 	 Accuracy: 0.829 (0.000), Balanced accuracy: 0.724 (0.000), F1: 0.597 (0.000)
Model: MLPClassifier 	 Accuracy: 0.831 (0.000), Balanced accuracy: 0.734 (0.000), F1: 0.612 (0.000)


