In [None]:
# Root folder of the shared resources; the dataset files and the target list
# are read from `<RESOURCES_PATH>/dataset/budget/` further down.
RESOURCES_PATH = '../../../resources'

In [None]:
# Iteration cap handed to LogisticRegression as `max_iter` in the search grid.
MAX_EPOCHS = 1000

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

## Load data

In [None]:
# Read the cleared and the original budget datasets (tab-separated files).
df = pd.read_csv(f'{RESOURCES_PATH}/dataset/budget/cleared.tsv', sep='\t')
orig_df = pd.read_csv(f'{RESOURCES_PATH}/dataset/budget/original.tsv', sep='\t')

# Replace missing cells with the literal string 'None' so that they are
# treated as an ordinary category value later on. Reassigning (instead of
# `inplace=True`) keeps the cell safe to re-run and free of hidden mutation.
df = df.fillna('None')
orig_df = orig_df.fillna('None')

print(f'Dataset length: {len(df)}')
df.head()

## Prepare data

In [None]:
# Build the target encoder from a predefined class list instead of fitting it
# on data: one class name per line of targets.txt, kept in file order.
# NOTE(review): splitting on '\n' produces a trailing '' class when the file
# ends with a newline — confirm whether downstream code relies on that.
targets_path = Path(f'{RESOURCES_PATH}/dataset/budget/targets.txt')
class_names = targets_path.read_text().split('\n')

le = LabelEncoder()
le.classes_ = np.array(class_names)

In [None]:
def to_vector(df):
    """Turn a budget frame into model inputs.

    Args:
        df: DataFrame providing the columns 'object', 'financing',
            'project' and 'budget'.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the one-hot encoded frame built
        from the three feature columns and ``y`` is the 'budget' column
        encoded with the module-level label encoder ``le``.
    """
    features = pd.get_dummies(df[['object', 'financing', 'project']])
    targets = le.transform(df.budget)
    return features, targets

In [None]:
x, y = to_vector(df)
orig_x, orig_y = to_vector(orig_df)

# Each frame is one-hot encoded on its own, so the two feature matrices can
# end up with different columns (a category present in only one dataset).
# Align the original data to the training layout: categories unseen during
# training are dropped, categories absent from the original data become 0.
orig_x = orig_x.reindex(columns=x.columns, fill_value=0)

x.head()

## Train

In [None]:
# Hyper-parameter grid for LogisticRegression. The full cross product contains
# combinations the estimator may reject (e.g. a penalty a given solver does not
# support); the results cell below drops candidates that ended up without a
# log-loss score.
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
    'class_weight': [None, 'balanced'],
    'dual': [False, True],
    'max_iter': [MAX_EPOCHS],
    'random_state': [42]
}

# The label set is passed explicitly to log_loss (presumably because a single
# test fold may not contain every class). Only the distinct labels are needed,
# so pass `np.unique(y)` rather than the whole target vector — same scores,
# and a much smaller scorer object to ship to the parallel workers.
# NOTE(review): newer scikit-learn releases replace `needs_proba=True` with
# `response_method='predict_proba'` — confirm against the pinned version.
log_loss_scorer = make_scorer(
    log_loss,
    needs_proba=True,
    labels=np.unique(y),
    greater_is_better=False
)

# Exhaustive search with 10-fold stratified CV; the model with the best
# (negated) log loss is refitted on the full data.
model_search_cv = GridSearchCV(
    LogisticRegression(),
    param_grid,
    scoring={
        'neg_log_loss': log_loss_scorer,
        'accuracy': make_scorer(accuracy_score)
    },
    refit='neg_log_loss',
    cv=StratifiedKFold(10, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=1
)

In [None]:
# Run the grid search on the cleared dataset. The trailing semicolon keeps the
# fitted object's repr out of the cell output.
model_search_cv.fit(x, y);

#### Show Search CV Results

In [None]:
# Columns worth comparing across candidates.
result_columns = ['param_class_weight', 'param_dual', 'param_max_iter',
                  'param_penalty', 'param_solver', 'mean_fit_time',
                  'mean_test_neg_log_loss', 'mean_test_accuracy', 'std_test_accuracy']

# Select the columns and drop candidates without a log-loss score (NaN, e.g.
# because the parameter combination could not be fitted). Chaining a
# non-inplace `dropna` avoids mutating a column slice in place, which pandas
# may flag with SettingWithCopyWarning. The original row labels are kept, so
# `best_index_` can still be used to look rows up in this frame.
cv_result_df = (
    pd.DataFrame(model_search_cv.cv_results_)[result_columns]
    .dropna(subset=['mean_test_neg_log_loss'])
)

cv_result_df.sort_values(by=['mean_test_neg_log_loss', 'mean_test_accuracy'], ascending=False)

In [None]:
# Estimator refitted with the best parameters found by the search.
model = model_search_cv.best_estimator_
# Row position of that candidate in `cv_results_`; used later as a row label
# to look up its cross-validation metrics.
model_i = model_search_cv.best_index_

## Evaluation

In [None]:
def accuracy_report(y_true, y_pred_proba):
    """Accuracy of the most probable class, rounded to 4 decimals.

    Args:
        y_true: True labels, comparable with the column indices of
            ``y_pred_proba``.
        y_pred_proba: 2-D array of per-class scores, one row per sample.

    Returns:
        The accuracy score rounded to four decimal places.
    """
    predicted_labels = y_pred_proba.argmax(axis=1)
    score = accuracy_score(y_true, predicted_labels)
    return round(score, 4)

def logloss_report(y_true, y_pred_proba):
    """Log loss of the predicted probabilities, rounded to 4 decimals.

    Args:
        y_true: True labels.
        y_pred_proba: Predicted class probabilities, one row per sample.

    Returns:
        The log loss rounded to four decimal places.
    """
    loss = log_loss(y_true, y_pred_proba)
    return round(loss, 4)

def to_orig_shape(y_pred_proba):
    """Right-pad a probability matrix with zero-filled columns.

    The number of appended columns is ``len(le.classes_) - 1`` minus the
    current column count, so the result has ``len(le.classes_) - 1`` columns.
    Rows and existing columns are left untouched.

    NOTE(review): the ``- 1`` offset presumably accounts for one entry of
    ``le.classes_`` that never appears as a target — confirm. Padding on the
    right also assumes the model's classes are the leading label indices.

    Args:
        y_pred_proba: 2-D array of predicted probabilities.

    Returns:
        The padded 2-D array.
    """
    n_missing = len(le.classes_) - y_pred_proba.shape[1] - 1
    pad_widths = ((0, 0), (0, n_missing))
    return np.pad(y_pred_proba, pad_widths, mode='constant', constant_values=0)

In [None]:
# Predict class probabilities for the original dataset and pad them with zero
# columns via `to_orig_shape` before scoring.
orig_y_pred_proba = to_orig_shape(model.predict_proba(orig_x))

In [None]:
# Summary of the selected model: cross-validated metrics on the cleared data
# plus accuracy / log loss measured on the original dataset.
report = {
    'Name': 'Logistic Regression',
    '[Cleared CV] Accuracy': cv_result_df.loc[model_i, 'mean_test_accuracy'],
    # The search stores the negated loss, so flip the sign back for reporting.
    '[Cleared CV] Log Loss': -cv_result_df.loc[model_i, 'mean_test_neg_log_loss'],
    '[Original] Accuracy': accuracy_report(orig_y, orig_y_pred_proba),
    '[Original] Log Loss': logloss_report(orig_y, orig_y_pred_proba),
    'Training time (sec)': cv_result_df.loc[model_i, 'mean_fit_time']
}

# One row per metric, with a single unnamed value column.
report_df = pd.DataFrame(report, index=['']).T

# `to_csv` does not create missing folders, so make sure the target directory
# exists before writing (otherwise a fresh checkout fails at this cell).
report_path = Path('reports/logreg.tsv')
report_path.parent.mkdir(parents=True, exist_ok=True)
report_df.to_csv(report_path, sep='\t', header=False)

report_df