In [None]:
# default_exp core

# Core library

> Helper functions used throughout the lessons.

In [None]:
# export
from nbdev.showdoc import *
import wget
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from pandas.api.types import is_object_dtype, is_numeric_dtype
import seaborn as sns
import scipy
from scipy.cluster import hierarchy as hc
import matplotlib.pyplot as plt

## Datasets

In [None]:
# exports
def get_dataset(dataset_name: str, path: str = "../data/"):
    """
    Download datasets from Google Drive.

    Args:
        dataset_name (str): File name of the dataset; must be one of the keys
            of `name_to_id` below.
        path (str): Directory the file is stored in. Defaults to "../data/".
            The directory is created if it does not exist yet.

    Returns:
        None. Progress is reported with `print`.

    Raises:
        KeyError: If `dataset_name` is not one of the known datasets.
        Exception: Any error raised while downloading is re-raised after a
            short hint has been printed.
    """

    name_to_id = {
        "word2vec-google-news-300.pkl": "1dRwSXbFTcQbn8c3V24G92wFY4DXZ1SDt",
        "imdb.csv": "1wF0YEmQOwceJz2d6w4CfhBgydU87dPGl",
        "housing.csv": "1d7oOKdDmZFx8wf0c8OfuTW1FpUyJHABh",
        "housing_gmaps_data_raw.csv": "1R1RUHAXxzrIngRJMFwyp4vZRVICd-I6T",
        "housing_addresses.csv": "1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q",
        "housing_merged.csv": "1bdYuBtIPrKiU-ut2MeSSsL47onPtZrRt",
        "housing_processed.csv": "12PxnWhPg_Pj0yx75vD22gwfdkkx80E6_",
        "churn.csv": "1-IO-JQr7tjQGIKZyo_SyupCpX2VNDQIf",
    }
    gdrive_path = "https://docs.google.com/uc?export=download&id="

    # Reject unknown names before touching the filesystem or the network.
    if dataset_name not in name_to_id:
        raise KeyError("File not on Google Drive.")

    # os.path.join tolerates `path` with or without a trailing separator.
    target = os.path.join(path, dataset_name)
    if os.path.exists(target):
        print(f"Dataset already exists at '{target}' and is not downloaded again.")
        return

    # Make sure the download directory exists (e.g. on a fresh checkout).
    # An empty `path` means "current directory", which needs no creation.
    if path:
        os.makedirs(path, exist_ok=True)

    try:
        file_url = gdrive_path + name_to_id[dataset_name]
        wget.download(file_url, out=path)
    except Exception:
        print("Something went wrong during download. Try again.")
        # Bare `raise` re-raises the active exception with its traceback intact.
        raise
    print(f"Download of {dataset_name} dataset complete.")

### California Housing Prices
This dataset from Kaggle ([link](https://www.kaggle.com/camnugent/california-housing-prices)) is used in the second chapter of Aurélien Géron's recent book *Hands-On Machine Learning with Scikit-Learn and TensorFlow*.

#### `housing.csv`
This dataset pertains to the houses found in a given California district and some summary stats about them based on the 1990 census data.

In [None]:
get_dataset("housing.csv")

Download of housing.csv dataset complete.


#### `housing_gmaps_data_raw.csv`
This dataset contains the raw outputs of the addresses associated with the coordinates in the `housing.csv` dataset as retrieved with the Google Maps API. 

In [None]:
get_dataset("housing_gmaps_data_raw.csv")

Download of housing_gmaps_data_raw.csv dataset complete.


#### `housing_addresses.csv`
The `housing_addresses.csv` dataset is a cleaned subset of the `housing_gmaps_data_raw.csv` dataset.

In [None]:
get_dataset("housing_addresses.csv")

Download of housing_addresses.csv dataset complete.


#### `housing_merged.csv`

The merge of `housing.csv` and `housing_addresses.csv` from lesson 2.

In [None]:
get_dataset("housing_merged.csv")

Download of housing_merged.csv dataset complete.


#### `housing_processed.csv`

The processed version of `housing_merged.csv` with no missing values and categorical columns encoded numerically.

In [None]:
get_dataset("housing_processed.csv")

Download of housing_processed.csv dataset complete.


### IMDB

#### `imdb.csv`
The IMDB dataset is available on Kaggle ([link](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)). This is a dataset for binary sentiment classification and provides a set of 25,000 highly polar movie reviews for training and 25,000 for testing.

In [None]:
get_dataset("imdb.csv")

Download of imdb.csv dataset complete.


### Word2Vec embeddings

#### `word2vec-google-news-300.pkl`

Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in *Distributed Representations of Words and Phrases and their Compositionality*. This dataset is available from GENSIM ([link](https://github.com/RaRe-Technologies/gensim-data)).

In [None]:
get_dataset("word2vec-google-news-300.pkl")

Download of word2vec-google-news-300.pkl dataset complete.


### Customer churn

[IBM's telecommunications dataset](https://www.kaggle.com/blastchar/telco-customer-churn) on customer churn. The dataset includes information about:

* Customers who left within the last month – the column is called `Churn`
* Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
* Customer account information – how long they’ve been a customer (tenure), contract, payment method, paperless billing, monthly charges, and total charges
* Demographic info about customers – gender, whether they're a senior citizen or not, and if they have partners and dependents

In [None]:
get_dataset("churn.csv")

Download of churn.csv dataset complete.


## Metrics

In [None]:
# exports
def rmse(y, yhat):
    """Computes the Root Mean Square Error (RMSE) between targets and predictions.

    Args:
        y (array): Actual values for target.
        yhat (array): Predicted values for target.

    Returns:
        rmse (double): Square root of the mean squared error of `yhat` w.r.t. `y`.
    """
    mse = mean_squared_error(y, yhat)
    return np.sqrt(mse)

In [None]:
# Worked example: the squared errors are 4, 0 and 9, so RMSE = sqrt(13 / 3) ≈ 2.0817.
y = np.array([2, 2, 3])
yhat = np.array([0, 2, 6])
rmse(y, yhat)

2.0816659994661326

## Data wrangling

In [None]:
# exports
def convert_strings_to_categories(df):
    """A utility function to convert all string columns to Categorical data type.

    Both `object` columns and columns using pandas' dedicated string dtype
    (`pd.StringDtype`) are converted; all other columns are left untouched.

    Args:
        df (pandas.DataFrame): Frame to convert. It is modified in place.

    Returns:
        None
    """
    for col in df.columns:
        dtype = df[col].dtype
        # Text is not always stored as `object`: it may also use StringDtype,
        # which `is_object_dtype` does not recognise.
        if is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            df[col] = df[col].astype("category")

In [None]:
# Toy frame with one string column and one integer column.
df = pd.DataFrame(
    {"string_column": ["apple", "banana", "orange"], "numerical_column": [0, 2, 4]}
)
df.head()

Unnamed: 0,string_column,numerical_column
0,apple,0
1,banana,2
2,orange,4


In [None]:
df.dtypes

string_column       object
numerical_column     int64
dtype: object

In [None]:
convert_strings_to_categories(df)

In [None]:
df.dtypes

string_column       category
numerical_column       int64
dtype: object

In [None]:
df["string_column"].cat.categories

Index(['apple', 'banana', 'orange'], dtype='object')

In [None]:
df["string_column"].cat.codes

0    0
1    1
2    2
dtype: int8

In [None]:
# exports
def fill_missing_values_with_median(df):
    """Replaces missing values in numerical columns with the median.

    Args:
        df (pandas.DataFrame): Frame to clean. It is modified in place;
            non-numeric columns and columns without missing values are
            left untouched.

    Returns:
        None
    """
    for column in df.columns:
        if not is_numeric_dtype(df[column]):
            continue
        if df[column].isna().any():
            column_median = df[column].median()
            # Assign the filled column back to the frame. Calling
            # `fillna(..., inplace=True)` on `df[column]` only changes a
            # temporary Series once pandas' Copy-on-Write is active.
            df[column] = df[column].fillna(column_median)

In [None]:
# Toy frame: columns A and B contain missing values, column C is complete.
df = pd.DataFrame(
    [[np.nan, 2, 0], [3, 4, 1], [np.nan, np.nan, 5], [np.nan, 3, 4],],
    columns=list("ABC"),
)

df

Unnamed: 0,A,B,C
0,,2.0,0
1,3.0,4.0,1
2,,,5
3,,3.0,4


In [None]:
fill_missing_values_with_median(df)

In [None]:
df

Unnamed: 0,A,B,C
0,3.0,2.0,0
1,3.0,4.0,1
2,3.0,3.0,5
3,3.0,3.0,4


In [None]:
# exports
def display_large(df):
    """Displays up to 1000 columns and rows of pandas.DataFrame or pandas.Series objects.

    The display limits are raised only while `df` is being rendered; the
    previous pandas options are restored afterwards.
    """
    limit = 1000
    raised_limits = ("display.max_rows", limit, "display.max_columns", limit)
    with pd.option_context(*raised_limits):
        display(df)

In [None]:
# Single-row frame with 100 columns (col_0 ... col_99), wide enough for the default display to truncate it.
data = [{"col_" + str(k): v for k, v in enumerate(range(100))}]
df = pd.DataFrame(data)

In [None]:
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_90,col_91,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99
0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99


In [None]:
display_large(df)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,col_37,col_38,col_39,col_40,col_41,col_42,col_43,col_44,col_45,col_46,col_47,col_48,col_49,col_50,col_51,col_52,col_53,col_54,col_55,col_56,col_57,col_58,col_59,col_60,col_61,col_62,col_63,col_64,col_65,col_66,col_67,col_68,col_69,col_70,col_71,col_72,col_73,col_74,col_75,col_76,col_77,col_78,col_79,col_80,col_81,col_82,col_83,col_84,col_85,col_86,col_87,col_88,col_89,col_90,col_91,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99
0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99


In [None]:
# exports
def rf_feature_importance(fitted_model, df):
    """Tabulates the feature importances of a fitted model, largest first.

    Args:
        fitted_model: Object exposing a `feature_importances_` attribute.
        df (pandas.DataFrame): Frame whose columns name the features; its
            column order must line up with `feature_importances_`.

    Returns:
        pandas.DataFrame: Columns "Column" and "Importance", sorted by
        "Importance" in descending order.
    """
    importance_table = pd.DataFrame(
        {"Column": df.columns, "Importance": fitted_model.feature_importances_}
    )
    return importance_table.sort_values(by="Importance", ascending=False)

## Data visualisation

In [None]:
# exports
def plot_feature_importance(feature_importance):
    """Draws a horizontal bar chart of feature importances on a new 12x8 figure.

    Args:
        feature_importance (pandas.DataFrame): Table with a "Column" and an
            "Importance" column, e.g. as produced by `rf_feature_importance`.

    Returns:
        The object returned by `seaborn.barplot`.
    """
    _, axes = plt.subplots(figsize=(12, 8))
    return sns.barplot(
        data=feature_importance, x="Importance", y="Column", color="b", ax=axes
    )

In [None]:
# exports
def plot_dendogram(X):
    """Plots a dendrogram to see which features are related.

    The columns of `X` are clustered with average linkage, using
    `1 - Spearman rank correlation` as the distance between two columns.

    Args:
        X (pandas.DataFrame): Frame whose columns are the features to compare;
            the column names are used as leaf labels.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    # Import the submodules explicitly instead of relying on `scipy.stats` and
    # `hc.distance` happening to be reachable as attributes.
    from scipy.spatial.distance import squareform
    from scipy.stats import spearmanr

    # calculate correlation coefficient
    corr = np.round(spearmanr(X).correlation, 4)
    if np.ndim(corr) == 0:
        # With exactly two columns spearmanr returns a single number rather
        # than a matrix; expand it so squareform gets a square input.
        corr = np.array([[1.0, corr], [corr, 1.0]])
    # convert to (condensed) distance matrix
    corr_condensed = squareform(1 - corr)
    # perform clustering
    z = hc.linkage(corr_condensed, method="average")
    # plot dendrogram
    plt.figure(figsize=(16, 10))
    # A plain list of labels avoids ambiguous truth-value checks on a pandas Index.
    hc.dendrogram(z, labels=list(X.columns), orientation="left", leaf_font_size=16)
    plt.show()