In [None]:
cd .. 

In [None]:
run __init__.py

In [None]:
run src/load_data.py

In [None]:
!pip install tqdm --quiet

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Fundamental Question: Who makes more than $50k?

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm 
from time import time

In [None]:
def sample_training_set(X_train, y_train, n_pcnt):
    """Take the leading ``n_pcnt`` percent of a training set.

    Parameters
    ----------
    X_train : object with ``.shape`` that supports slicing
        Training features; ``X_train.shape[0]`` is used as the row count.
    y_train : sliceable
        Training labels, sliced with the same bounds as ``X_train``.
    n_pcnt : int
        Percentage of rows to keep.

    Returns
    -------
    tuple
        ``(n, X_head, y_head)`` where ``n`` is the row count after floor
        division by 100 and the other two items are the first ``n`` entries
        of the inputs. No shuffling happens here: the subset is whatever
        comes first in the inputs.
    """
    total_rows = X_train.shape[0]
    n = total_rows * n_pcnt // 100
    head = slice(None, n)
    return n, X_train[head], y_train[head]

def time_function_call(function_call, *args, **kwargs):
    """Invoke a callable and measure how long the invocation takes.

    Parameters
    ----------
    function_call : callable or object
        The callable to run. It must be passed *uncalled* (for example
        ``model.fit``, not ``model.fit(X, y)``) so that the work happens
        inside the timed region. For backward compatibility a non-callable
        value is returned unchanged; the reported duration then covers only
        this wrapper's overhead, because the work finished before this
        function was entered.
    *args, **kwargs
        Forwarded to ``function_call`` when it is callable.

    Returns
    -------
    tuple
        ``(result, execution_time)`` with ``execution_time`` in seconds as
        measured by ``time()``.
    """
    start = time()
    if callable(function_call):
        result = function_call(*args, **kwargs)
    else:
        # Legacy call style: the value was computed by the caller already.
        result = function_call
    execution_time = time() - start
    return result, execution_time


def run_model(model, model_name, n_pcnt, data, labels):
    """Fit ``model`` on a leading slice of the training split and score it.

    The data is split with ``train_test_split`` (fixed ``random_state=42``,
    stratified on ``labels``). The first ``n_pcnt`` percent of the training
    split is used for fitting. Predictions are then made on that same
    training subset and on the full test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place.
    model_name : str
        Label copied into the result record.
    n_pcnt : int
        Percentage of the training split to fit on.
    data, labels
        Features and targets handed to ``train_test_split``.

    Returns
    -------
    dict
        One record with the model name, the requested percentage, the number
        of training rows used, F1 and accuracy on the training subset and on
        the test split, and the wall-clock seconds spent in ``fit`` and in
        each ``predict`` call.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, random_state=42, stratify=labels)

    n, X_samp, y_samp = sample_training_set(X_train, y_train, n_pcnt)

    # Hand over the bound method and its arguments separately: writing
    # `time_function_call(model.fit(X_samp, y_samp))` would run the fit
    # *before* the timer starts and report a duration of roughly zero.
    _, fit_time = time_function_call(model.fit, X_samp, y_samp)
    train_pred, train_pred_time = time_function_call(model.predict, X_samp)
    test_pred, test_pred_time = time_function_call(model.predict, X_test)

    return {
            'model_name' : model_name,
            'n_pcnt' : n_pcnt,
            'n' : n,
            'f1_train_score' : f1_score(y_samp, train_pred),
            'f1_test_score' : f1_score(y_test, test_pred),
            # Reuse the predictions computed above instead of letting
            # `model.score` predict a second time on the same inputs.
            'accuracy_train_score' : accuracy_score(y_samp, train_pred),
            'accuracy_test_score' : accuracy_score(y_test, test_pred),
            'fit_time' : fit_time,
            'train_pred_time' : train_pred_time,
            'test_pred_time' : test_pred_time}

In [None]:
# Pull the engineered features and the labels of the adult training split
# out of the nested `data` dict.
adult_train_df, adult_train_target = (
    data['adult']['train'][field] for field in ('engineered', 'labels')
)

# Show both shapes as the cell output.
adult_train_df.shape, adult_train_target.shape

In [None]:
# Training-subset sizes to sweep, as percentages passed to run_model.
percentages = [1,2,3,4,5,7,10,15,20,25,30,40,50,60,70,80,90,100]

# Fit a fresh LogisticRegression for every size; each result record is stored
# under the percentage that produced it. tqdm shows progress over the sweep.
test_results = {
    pct: run_model(LogisticRegression(), 'logit', pct,
                   adult_train_df,
                   adult_train_target)
    for pct in tqdm(percentages)
}


In [None]:
# Turn the {percentage: record} dict into one row per percentage.
# `from_dict(orient='index')` builds the frame row-wise, so each metric column
# gets its own dtype; `pd.DataFrame(d).T` would leave every column as `object`
# because the records mix strings and numbers.
# The isinstance guard makes the cell safe to re-run: `test_results` is
# rebound from dict to DataFrame here, and converting a second time would fail.
if isinstance(test_results, dict):
    test_results = pd.DataFrame.from_dict(test_results, orient='index')
test_results = test_results.sort_values('n')
test_results

In [None]:
# F1 on the test split and on the training subset, against the number of
# training rows used for fitting. Labels and a title are set so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.plot(test_results.n, test_results.f1_test_score, label='test performance')
ax.plot(test_results.n, test_results.f1_train_score, label='train performance')
ax.set(xlabel='training rows used (n)',
       ylabel='F1 score',
       title='F1 score vs. training set size')
ax.legend();