In [None]:
# default_exp core

# Core library

> Helper functions used throughout the lessons

In [None]:
# export
import pandas as pd
from nbdev.showdoc import *
import os
import gdown
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Datasets

In [None]:
# export
def download_dataset(dataset_name: str, path: str = "../data/") -> None:
    """Download one of the known datasets from Google Drive.

    Parameters
    ----------
    dataset_name : str
        File name of the dataset. Must be one of the keys of ``name_to_id``.
    path : str, optional
        Directory the file is written to; it is created if it does not exist.
        Defaults to ``"../data/"``.

    Raises
    ------
    KeyError
        If ``dataset_name`` is not one of the known file names.
    RuntimeError
        If ``gdown.download`` returns ``None`` instead of an output path.
    Exception
        Any error raised by ``gdown.download`` is re-raised after a hint is printed.
    """

    name_to_id = {
        "tau_decays_train.csv": "1iVSiZ0jLwf2_wi4Cyk2bU9O99RoTl4QF",
        "tau_decays_test.csv": "1t9HNGjlP9Zt0PDYZjJNu90BgRymvbaT8",
        "tau_decays_check_agreement.csv": "12jD2rxJUTJ5O9Wt457B_njl0ZxQTFKTG",
        "tau_decays_check_correlation.csv": "18E7v81v5iCY86PwLeOWxQxZdsPcCEXQE"
    }

    # Validate the name first so that a typo does not leave an empty
    # data directory behind.
    if dataset_name not in name_to_id:
        raise KeyError("File not on Google Drive.")

    os.makedirs(path, exist_ok=True)
    target = os.path.join(path, dataset_name)
    if os.path.exists(target):
        print(
            f"Dataset already exists at '{target}' and is not downloaded again."
        )
        return

    file_url = "https://drive.google.com/uc?id=" + name_to_id[dataset_name]
    try:
        # NOTE(review): older gdown releases appear to report a failed download
        # by returning None rather than raising -- confirm against the version
        # in use. Treating None as an error avoids printing a false success.
        if gdown.download(file_url, target, quiet=True) is None:
            raise RuntimeError(f"gdown could not download '{dataset_name}'.")
    except Exception:
        print("Something went wrong during the download! Try again.")
        # Bare raise keeps the original traceback intact.
        raise
    print(f"Download of {dataset_name} dataset complete.")

### Flavours of physics: finding τ → μμμ

Training, test and evaluation datasets for Kaggle's [Flavours of Physics challenge](https://www.kaggle.com/c/flavours-of-physics/overview):

In [None]:
# Fetch the training set; the call is a no-op if the file is already in the data directory.
download_dataset("tau_decays_train.csv")

Download of tau_decays_train.csv dataset complete.


In [None]:
# Fetch the test set; the call is a no-op if the file is already in the data directory.
download_dataset("tau_decays_test.csv")

Download of tau_decays_test.csv dataset complete.


In [None]:
# Fetch the "check agreement" evaluation file; skipped if it is already in the data directory.
download_dataset("tau_decays_check_agreement.csv")

Download of tau_decays_check_agreement.csv dataset complete.


In [None]:
# Fetch the "check correlation" evaluation file; skipped if it is already in the data directory.
download_dataset("tau_decays_check_correlation.csv")

Download of tau_decays_check_correlation.csv dataset complete.


## Data wrangling

In [None]:
# export
def display_large(df):
    """Show `df` with pandas' row and column display limits temporarily raised to 1000.

    Intended for pandas.DataFrame or pandas.Series objects. The object is
    handed to `display`, so this relies on `display` being available in the
    running environment (as it is inside a notebook).
    """
    raised_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    # The context manager restores the previous display options on exit.
    with pd.option_context(*raised_limits):
        display(df)

In [None]:
# export
def rf_feature_importance(fitted_model, df):
    """Build a pandas.DataFrame pairing each column of `df` with its feature importance.

    The "Column" field holds `df.columns`, the "Importance" field holds
    `fitted_model.feature_importances_`; rows are ordered from the most to
    the least important feature.
    """
    importance_table = pd.DataFrame(
        {"Column": df.columns, "Importance": fitted_model.feature_importances_}
    )
    return importance_table.sort_values(by="Importance", ascending=False)

## Data visualisation

In [None]:
# export
def plot_feature_importance(feature_importance, figsize=(12, 8), ax=None):
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    feature_importance
        Data with "Column" and "Importance" fields, passed to
        ``sns.barplot`` as ``data`` (for example the output of
        ``rf_feature_importance``).
    figsize : tuple, optional
        Size of the figure created when ``ax`` is not given.
        Defaults to ``(12, 8)``.
    ax : optional
        Existing axes to draw on. When ``None`` a new figure and axes
        are created.

    Returns
    -------
    The value returned by ``sns.barplot``.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)
    # Bind the plot to the axes explicitly instead of relying on pyplot's
    # implicit "current axes" state.
    return sns.barplot(
        y="Column", x="Importance", data=feature_importance, color="b", ax=ax
    )