In [None]:
run src/preprocessing.py

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Naive baseline: predict the mean of the target for every observation.
mean_sale_price = target_1.mean()
naive_guess = np.full(len(target_1), mean_sale_price, dtype=float)

In [None]:
# Score the constant-mean baseline on the same data it was derived from.
# These numbers are the floor any fitted model should beat.
naive_r2 = r2_score(y_true=target_1, y_pred=naive_guess)
naive_rmse = np.sqrt(mean_squared_error(y_true=target_1, y_pred=naive_guess))
naive_mae = mean_absolute_error(y_true=target_1, y_pred=naive_guess)

#### Fundamental Question: How much does a home in Ames, Iowa sell for?

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from tqdm import tqdm 
from time import time

In [None]:
def sample_training_set(X_train, y_train, n_pcnt):
    """Take the leading ``n_pcnt`` percent of a training set.

    The row count is ``X_train.shape[0] * n_pcnt // 100`` (floor division),
    and the same leading slice is applied to both ``X_train`` and ``y_train``.

    Returns
    -------
    tuple
        ``(n, X_subset, y_subset)`` where ``n`` is the computed row count.
    """
    total_rows = X_train.shape[0]
    n = total_rows * n_pcnt // 100
    leading = slice(None, n)
    return n, X_train[leading], y_train[leading]

def time_function_call(function_call, *args, **kwargs):
    """Invoke a callable and measure how long the call takes.

    Parameters
    ----------
    function_call : callable or object
        Callable executed as ``function_call(*args, **kwargs)`` inside the
        timed region. For backward compatibility a non-callable value (an
        already-computed result) is returned unchanged; the reported time is
        then ~0 because the work finished before this function was entered.
    *args, **kwargs
        Forwarded to ``function_call`` when it is callable.

    Returns
    -------
    tuple
        ``(result, execution_time)`` with ``execution_time`` in seconds.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring short durations.
    from time import perf_counter

    start = perf_counter()
    if callable(function_call):
        result = function_call(*args, **kwargs)
    else:
        # Legacy usage: the caller passed a value that was already evaluated.
        result = function_call
    execution_time = perf_counter() - start
    return result, execution_time


def run_model(model, model_name, n_pcnt, data, labels):
    """Fit ``model`` on a percentage of the training split and collect metrics.

    ``data``/``labels`` are split with ``train_test_split(random_state=42)``;
    the leading ``n_pcnt`` percent of the training part is used for fitting.
    ``model`` is fitted in place.

    Returns
    -------
    dict
        Model name, ``n_pcnt``, number of training rows ``n``, train/test
        RMSE, MAE and ``model.score`` values, and the wall-clock durations
        (seconds) of fitting and of predicting on the train sample and the
        test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(data, labels, random_state=42)

    n, X_samp, y_samp = sample_training_set(X_train, y_train, n_pcnt)

    # Pass the bound methods (not their results) so that the work actually
    # happens inside the timed region.
    _, fit_time = time_function_call(model.fit, X_samp, y_samp)
    train_pred, train_pred_time = time_function_call(model.predict, X_samp)
    test_pred, test_pred_time = time_function_call(model.predict, X_test)

    return {
            'model_name' : model_name,
            'n_pcnt' : n_pcnt,
            'n' : n,
            'rmse_train' : np.sqrt(mean_squared_error(y_samp, train_pred)),
            'rmse_test' : np.sqrt(mean_squared_error(y_test, test_pred)),
            'mae_train' : mean_absolute_error(y_samp, train_pred),
            'mae_test' : mean_absolute_error(y_test, test_pred),
            'r2_train_score' : model.score(X_samp, y_samp),
            'r2_test_score' : model.score(X_test, y_test),
            'fit_time' : fit_time,
            'train_pred_time' : train_pred_time,
            'test_pred_time' : test_pred_time}

In [None]:
# Sanity check: dimensions of the feature matrix and target before modelling.
(dataset_2.shape, target_2.shape)

In [None]:
# Learning-curve sweep: refit a fresh Lasso on increasing percentages of the
# training split and store each run's metrics keyed by that percentage.
percentages = [1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
test_results = {}
for n in tqdm(percentages):
    test_results[n] = run_model(
        model=Lasso(),
        model_name='Lasso',
        n_pcnt=n,
        data=dataset_2,
        labels=target_2,
    )


In [None]:
# One row per sweep step (index = training percentage), one column per metric.
# Building with orient='index' lets pandas infer a dtype per column, instead
# of the all-object frame produced by transposing columns that mix the model
# name string with numbers.
# NOTE: this rebinds `test_results` from the dict of runs to a DataFrame, so
# re-run the sweep cell before re-running this one.
test_results = pd.DataFrame.from_dict(test_results, orient='index').sort_values('n')
test_results

In [None]:
# Learning curve: train vs. test R^2 as the number of training rows grows.
fig, ax = plt.subplots()
ax.plot(test_results.n, test_results.r2_test_score, label='test performance')
ax.plot(test_results.n, test_results.r2_train_score, label='train performance')
ax.set_xlabel('training samples (n)')
ax.set_ylabel('$R^2$ score')
ax.set_title('$R^2$ vs. training-set size')
ax.legend();