In [None]:
# default_exp core

# Core library

> Helper functions used throughout the lessons.

In [None]:
# export
from nbdev.showdoc import *
import wget
import os
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

## Datasets

In [None]:
# exports
def get_dataset(dataset_name: str, path: str = '../data/'):
    """
    Download one of the course datasets from Google Drive.

    Args:
        dataset_name (str): File name of the dataset. Must be one of the keys
            of the `name_to_id` table below.
        path (str): Directory the file is stored in. It is created when it
            does not exist yet. Defaults to '../data/'.

    Returns:
        None. Progress is reported with `print`.

    Raises:
        KeyError: If `dataset_name` is not a known dataset.
        Exception: Whatever `wget.download` raises is re-raised unchanged
            after a short hint has been printed.
    """

    # Maps each supported file name to its Google Drive file id.
    name_to_id = {
        "word2vec-google-news-300.pkl": "1dRwSXbFTcQbn8c3V24G92wFY4DXZ1SDt",
        "imdb.csv": "1wF0YEmQOwceJz2d6w4CfhBgydU87dPGl",
        "housing.csv": "1d7oOKdDmZFx8wf0c8OfuTW1FpUyJHABh",
        "housing_gmaps_data_raw.csv": "1R1RUHAXxzrIngRJMFwyp4vZRVICd-I6T",
        "housing_addresses.csv": "1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q",
        "housing_merged.csv": "1bdYuBtIPrKiU-ut2MeSSsL47onPtZrRt",
        "housing_processed.csv": "12PxnWhPg_Pj0yx75vD22gwfdkkx80E6_"
    }
    gdrive_path = "https://docs.google.com/uc?export=download&id="

    # Reject unknown names before touching the file system or the network.
    if dataset_name not in name_to_id:
        raise KeyError("File not on Google Drive.")

    target = os.path.join(path, dataset_name)
    if os.path.exists(target):
        print(f"Dataset already exists at '{target}' and is not downloaded again.")
        return

    # The download needs an existing directory to write into; a fresh checkout
    # may not have one yet.
    if path:
        os.makedirs(path, exist_ok=True)

    try:
        file_url = gdrive_path + name_to_id[dataset_name]
        # Save under the exact path the existence check above looks at, rather
        # than relying on the file name reported by the server.
        wget.download(file_url, out=target)
    except Exception:
        print("Something went wrong during download. Try again.")
        # Bare raise keeps the original exception and traceback intact.
        raise
    print(f"Download of {dataset_name} dataset complete.")

### California Housing Prices
This dataset from Kaggle ([link](https://www.kaggle.com/camnugent/california-housing-prices)) is used in the second chapter of Aurélien Géron's book *Hands-On Machine Learning with Scikit-Learn and TensorFlow*.

#### `housing.csv`
This dataset describes the houses found in each California district, along with summary statistics about them based on the 1990 census data.

In [None]:
# Fetch housing.csv into the data directory (skipped when the file is already there).
get_dataset('housing.csv')

Download of housing.csv dataset complete.


#### `housing_gmaps_data_raw.csv`
This dataset contains the raw address data returned by the Google Maps API for the coordinates in the `housing.csv` dataset.

In [None]:
# Fetch the raw Google Maps lookups (skipped when the file is already there).
get_dataset('housing_gmaps_data_raw.csv')

Download of housing_gmaps_data_raw.csv dataset complete.


#### `housing_addresses.csv`
The `housing_addresses.csv` dataset is a cleaned subset of the `housing_gmaps_data_raw.csv` dataset.

In [None]:
# Fetch the cleaned address table (skipped when the file is already there).
get_dataset('housing_addresses.csv')

Download of housing_addresses.csv dataset complete.


#### `housing_merged.csv`

The merge of `housing.csv` and `housing_addresses.csv` from lesson 2.

In [None]:
# Fetch the merged housing table (skipped when the file is already there).
get_dataset('housing_merged.csv')

Download of housing_merged.csv dataset complete.


#### `housing_processed.csv`

The processed version of `housing_merged.csv` with no missing values and categorical columns encoded numerically.

In [None]:
# Fetch the processed housing table (skipped when the file is already there).
get_dataset('housing_processed.csv')

Download of housing_processed.csv dataset complete.


### IMDB

#### `imdb.csv`
The IMDB dataset is available on Kaggle ([link](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)). This is a dataset for binary sentiment classification and provides a set of 25,000 highly polar movie reviews for training and 25,000 for testing.

In [None]:
# Fetch the IMDB reviews (skipped when the file is already there).
get_dataset('imdb.csv')

Download of imdb.csv dataset complete.


### Word2Vec embeddings

#### `word2vec-google-news-300.pkl`

Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in *Distributed Representations of Words and Phrases and their Compositionality*. This dataset is available from GENSIM ([link](https://github.com/RaRe-Technologies/gensim-data)).

In [None]:
# Fetch the pickled word2vec vectors (skipped when the file is already there).
get_dataset('word2vec-google-news-300.pkl')

Download of word2vec-google-news-300.pkl dataset complete.


## Metrics

In [None]:
# exports
def rmse(y, yhat):
    """Compute the Root Mean Square Error (RMSE) between targets and predictions.

    Args:
        y (array): Ground-truth values of the target.
        yhat (array): Values predicted for the target.

    Returns:
        rmse (double): Square root of the mean squared error of `yhat` against `y`.
    """
    mse = mean_squared_error(y, yhat)
    return np.sqrt(mse)

In [None]:
# Squared errors are 4, 0 and 9, so the expected RMSE is sqrt(13 / 3) ≈ 2.0817.
y_true = np.array([2, 2, 3])
y_pred = np.array([0, 2, 6])
rmse(y_true, y_pred)

2.0816659994661326

In [None]:
# exports
def print_scores(fitted_model, X_train=None, y_train=None, X_valid=None, y_valid=None):
    """Print RMSE and R^2 of a fitted regressor on the train and validation splits.

    Args:
        fitted_model: Fitted estimator exposing `predict(X)` and `score(X, y)`.
            When it also has an `oob_score_` attribute, that value is printed too.
        X_train: Training features.
        y_train: Training targets.
        X_valid: Validation features.
        y_valid: Validation targets.

        Any split left as None is looked up under the same name in this
        module's global namespace, so the one-argument call
        `print_scores(model)` keeps working wherever those globals exist.

    Returns:
        None. The scores are printed, rounded to three decimals.

    Raises:
        NameError: If a split is neither passed in nor defined as a global.
    """
    splits = {
        "X_train": X_train,
        "y_train": y_train,
        "X_valid": X_valid,
        "y_valid": y_valid,
    }
    missing = [name for name, value in splits.items() if value is None]
    if missing:
        # Fall back to module-level variables of the same name.
        scope = globals()
        for name in missing:
            if name not in scope:
                raise NameError(
                    f"name '{name}' is not defined; pass it to print_scores explicitly"
                )
            splits[name] = scope[name]
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_valid, y_valid = splits["X_valid"], splits["y_valid"]

    # rmse takes (actual, predicted), matching its signature rmse(y, yhat).
    res = {
        "RMSE on train:": rmse(y_train, fitted_model.predict(X_train)),
        "R^2 on train:": fitted_model.score(X_train, y_train),
        "RMSE on valid:": rmse(y_valid, fitted_model.predict(X_valid)),
        "R^2 on valid:": fitted_model.score(X_valid, y_valid),
    }
    # Only estimators that expose an out-of-bag score get the extra line.
    if hasattr(fitted_model, "oob_score_"):
        res["OOB R^2:"] = fitted_model.oob_score_

    for k, v in res.items():
        print(k, round(v, 3))

In [None]:
# NOTE(review): load_boston appears to be deprecated/removed in newer
# scikit-learn releases — confirm the pinned version still ships it.
X, y = load_boston(return_X_y=True)

In [None]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# oob_score=True is what makes print_scores report the extra "OOB R^2:" line.
# NOTE(review): no random_state is passed, so the scores shown below will
# differ slightly from run to run.
model = RandomForestRegressor(oob_score=True)

In [None]:
# Fit on the training split only; the validation split stays unseen for scoring.
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=True,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
# print_scores reads X_train, y_train, X_valid and y_valid from the notebook
# namespace, so the split cell above must have been run first.
print_scores(model)

RMSE on train: 1.267
R^2 on train: 0.981
RMSE on valid: 2.995
R^2 on valid: 0.856
OOB R^2: 0.862
