In [None]:
cd .. 

In [None]:
run __init__.py

In [None]:
run src/load_data.py

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm 
from time import time

In [None]:
# Pull the engineered feature table and the labels of the adult training split
# out of the nested `data` dict (presumably populated by src/load_data.py).
adult_train_df, adult_train_target = (
    data['adult']['train']['engineered'],
    data['adult']['train']['labels'],
)

In [None]:
def sample_training_set(X_train, y_train, n_pcnt):
    """Take the leading ``n_pcnt`` percent of a training set.

    Parameters
    ----------
    X_train : object exposing ``.shape`` and supporting slicing
        Training features; ``X_train.shape[0]`` is used as the row count.
    y_train : sliceable
        Training labels, sliced with the same bounds as ``X_train``.
    n_pcnt : int
        Percentage of rows to keep.

    Returns
    -------
    tuple
        ``(n, X_train[:n], y_train[:n])`` where ``n`` is the row count
        multiplied by ``n_pcnt`` and floor-divided by 100.
    """
    total_rows = X_train.shape[0]
    n = total_rows * n_pcnt // 100
    head = slice(None, n)
    return n, X_train[head], y_train[head]

def time_function_call(function_call, *args, **kwargs):
    """Run a callable and measure its wall-clock duration.

    Parameters
    ----------
    function_call : callable or any
        The callable to execute. For backward compatibility a non-callable
        value (an already-computed result) is accepted and returned as-is;
        its reported duration is then only the timer overhead, because the
        work happened before this function was entered.
    *args, **kwargs
        Forwarded to ``function_call`` when it is callable.

    Returns
    -------
    tuple
        ``(result, execution_time)`` with ``execution_time`` in seconds as
        measured by ``time()``.
    """
    start = time()
    if callable(function_call):
        # The call must happen between the two clock reads to be measured.
        result = function_call(*args, **kwargs)
    else:
        result = function_call
    execution_time = time() - start
    return result, execution_time

def run_model(model, model_name, n_pcnt, data, labels, random_state=None):
    """Fit ``model`` on part of a stratified train split and score it.

    The data is split with ``train_test_split`` (stratified on ``labels``),
    the leading ``n_pcnt`` percent of the train part is selected with
    ``sample_training_set``, and the model is fitted on that sample. Fit and
    both prediction steps are timed with ``time_function_call``.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Fitted in place; the same object is returned in the result.
    model_name : str
        Label stored in the result.
    n_pcnt : int
        Percentage of the train split used for fitting.
    data, labels
        Features and targets passed to ``train_test_split``.
    random_state : optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour of a different split on every call.

    Returns
    -------
    dict
        Keys: ``model``, ``model_name``, ``n_pcnt``, ``n``,
        ``f1_train_score``, ``f1_test_score``, ``accuracy_train_score``,
        ``accuracy_test_score``, ``fit_time``, ``train_pred_time``,
        ``test_pred_time``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, stratify=labels, random_state=random_state)

    n, X_samp, y_samp = sample_training_set(X_train, y_train, n_pcnt)

    # Pass the bound methods (not their results) so the work is done inside
    # the timed region.
    _, fit_time = time_function_call(model.fit, X_samp, y_samp)

    train_pred, train_pred_time = time_function_call(model.predict, X_samp)

    test_pred, test_pred_time = time_function_call(model.predict, X_test)

    return {
            'model' : model,
            'model_name' : model_name,
            'n_pcnt' : n_pcnt,
            'n' : n,
            'f1_train_score' : f1_score(y_samp, train_pred),
            'f1_test_score' : f1_score(y_test, test_pred),
            # Accuracy is computed from the predictions already made instead
            # of calling model.score, which would predict a second time.
            'accuracy_train_score' : accuracy_score(y_samp, train_pred),
            'accuracy_test_score' : accuracy_score(y_test, test_pred),
            'fit_time' : fit_time,
            'train_pred_time' : train_pred_time,
            'test_pred_time' : test_pred_time}

## Variable Ranking - by Single Feature F$_1$ Score 

In [None]:
# Score every column on its own: fit a logistic regression on that single
# feature using 50% of the train split and record its test F1.
# Features whose test F1 is not above zero are left out of the ranking.
test_scores = []
for feature in adult_train_df.columns:
    results = run_model(
        model=LogisticRegression(),
        model_name='variable ranking',
        n_pcnt=50,
        data=adult_train_df[[feature]],
        labels=adult_train_target,
    )
    test_score = results['f1_test_score']
    if not test_score > 0:
        continue
    test_scores.append({'feature': feature, 'score' : test_score})


In [None]:
# Rank the features by their single-feature test F1, best first.
# Naming the columns up front keeps the sort working when `test_scores` is
# empty (no feature scored above zero); without it the empty frame has no
# 'score' column and sort_values raises KeyError.
results = (
    pd.DataFrame(test_scores, columns=['feature', 'score'])
    .sort_values('score', ascending=False)
)
results

In [None]:
# Feature names in ranking order (highest single-feature test F1 first).
performant_features = list(results['feature'].values)
performant_features

In [None]:
# Forward pass over the ranking: add one feature at a time (best first) and
# refit on the growing feature set, storing each run under the name of the
# feature that was just added.
features_to_test = []
test_results = {}
for feature in performant_features:
    features_to_test += [feature]
    test_results[feature] = run_model(
        model=LogisticRegression(),
        model_name='logit',
        n_pcnt=100,
        data=adult_train_df[features_to_test],
        labels=adult_train_target,
    )

In [None]:
# One row per added feature, one column per run_model result key.
# The loop that fills `test_results` always uses n_pcnt=100, so 'n' is expected
# to be the same in every row. The default sort algorithm is not stable and may
# shuffle tied rows, while the plot that follows reads the rows by position;
# a stable sort keeps them in insertion (feature-added) order.
# NOTE: this rebinds `test_results` from dict to DataFrame, so the cell cannot
# be re-run without first re-running the loop that builds the dict.
test_results = pd.DataFrame(test_results).T.sort_values('n', kind='mergesort')
test_results

In [None]:
# Test vs. train F1 as features are added; x is the zero-based position of
# the run, i.e. one less than the number of features used.
plt.plot(
    range(len(features_to_test)),
    test_results['f1_test_score'],
    label='test performance',
)
plt.plot(
    range(len(features_to_test)),
    test_results['f1_train_score'],
    label='train performance',
)
plt.legend()

## Variable Ranking - by Regression Coefficient in Full Model

In [None]:
# Fit one logistic regression on every engineered feature at once, using the
# whole train split (n_pcnt=100).
results = run_model(
    model=LogisticRegression(),
    model_name='logit',
    n_pcnt=100,
    data=adult_train_df,
    labels=adult_train_target,
)

In [None]:
# Display the result dict of the all-features run (fitted model, scores, timings).
results

In [None]:
# Keep a handle on the estimator that run_model fitted on all features.
logistic_regression_model = results['model']

In [None]:
# Label each fitted coefficient with the column it belongs to. `coef_` is
# flattened to one dimension so that it lines up with the column index.
features = adult_train_df.columns
coefficients = pd.Series(
    logistic_regression_model.coef_.T.ravel(),
    index=features,
)
coefficients.head()

In [None]:
# Rank features by coefficient magnitude, largest first.
# NOTE(review): magnitudes are only comparable across features if the inputs
# share a common scale - confirm the engineered features are standardised.
sorted_coefs = coefficients.abs().sort_values(ascending=False)
sorted_coefs.head(20)

In [None]:
# Names of the 20 features with the largest absolute coefficient, in rank order.
performant_features = list(sorted_coefs.head(20).index)
performant_features

In [None]:
# Same forward pass as before, now following the coefficient ranking.
# The shape printed at the top of each iteration belongs to the feature set
# *before* the current feature is added (so the first line shows 0 columns).
features_to_test = []
test_results = {}
for feature in performant_features:
    print(adult_train_df[features_to_test].shape)
    features_to_test += [feature]
    test_results[feature] = run_model(
        model=LogisticRegression(),
        model_name='logit',
        n_pcnt=100,
        data=adult_train_df[features_to_test],
        labels=adult_train_target,
    )

In [None]:
# One row per added feature. 'n' is expected to be identical in every row
# (all runs use n_pcnt=100), so the sort must be stable: the default algorithm
# may reorder tied rows, and the plot below reads the rows by position.
# NOTE: this rebinds `test_results` from dict to DataFrame, so the cell cannot
# be re-run without first re-running the loop that builds the dict.
test_results = pd.DataFrame(test_results).T.sort_values('n', kind='mergesort')

# Test vs. train F1 as features are added in coefficient-rank order.
plt.plot(range(len(features_to_test)), test_results.f1_test_score, label='test performance')
plt.plot(range(len(features_to_test)), test_results.f1_train_score, label='train performance')
plt.legend()

In [None]:
# Forward pass again, but starting from four numeric features that are always
# included, then adding the coefficient-ranked features one at a time.
# NOTE(review): if a ranked feature is also one of the four seed columns it is
# selected twice, duplicating that column in the model input - verify.
features_to_test = ['age', 'capital-gain','capital-loss','hours-per-week']
test_results = {}
for feature in performant_features:
    # Shape of the feature set before the current feature is added.
    print(adult_train_df[features_to_test].shape)
    features_to_test += [feature]
    test_results[feature] = run_model(
        model=LogisticRegression(),
        model_name='logit',
        n_pcnt=100,
        data=adult_train_df[features_to_test],
        labels=adult_train_target,
    )

In [None]:
# One row per added feature. 'n' is expected to be identical in every row
# (all runs use n_pcnt=100), so the sort must be stable: the default algorithm
# may reorder tied rows, and the plot below reads the rows by position.
# NOTE: this rebinds `test_results` from dict to DataFrame, so the cell cannot
# be re-run without first re-running the loop that builds the dict.
test_results = pd.DataFrame(test_results).T.sort_values('n', kind='mergesort')

# There is one run per added feature, so the x axis is sized from the result
# frame itself rather than by subtracting the four seed features from
# `features_to_test`.
plt.plot(range(len(test_results)), test_results.f1_test_score, label='test performance')
plt.plot(range(len(test_results)), test_results.f1_train_score, label='train performance')
plt.legend()