# Utility Functions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import time
%matplotlib inline
from sklearn import preprocessing
import sklearn.feature_selection as selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics
from collections import defaultdict
import xgboost as xgb
import multiprocessing as mp
from joblib import Parallel, delayed



## Name Cleaning

In [2]:
def nameCleaning(df):
    """
    Normalize the column names of df: special characters are replaced, names are upper-cased,
    trimmed and recurring underscores are collapsed.
    The columns are renamed in place and the same dataframe is returned.

    Inputs:
        - df -> dataframe whose column names are strings
    Output:
        - df: the input dataframe with the cleaned column names
    """
    # Local import: the function needs `re`, which the module-level imports do not provide
    import re

    # Custom cleaning: (pattern, replacement) pairs, applied in this exact order
    substitutions = [
        (r"[\. \(\)\/]+", "_"),
        ("-", "_"),
        ("'", ""),
        (",", "_"),
        (":", "_"),
        ("<", "MIN"),
        (">", "MAG"),
        ("&", "E"),
        # NOTE(review): presumably a mis-encoded degree sign coming from the data - confirm
        ("Â°", ""),
        ("%", "PERC"),
        (r"\+", "_"),
    ]
    names = list(df.columns)
    for pattern, replacement in substitutions:
        names = [re.sub(pattern, replacement, name) for name in names]
    # String upper
    names = [name.upper() for name in names]
    # Trim
    names = [name.strip() for name in names]
    # Cut recurring underscore
    names = [re.sub("_+", "_", name) for name in names]
    df.columns = names
    return df

## Data Exploration

### Data Visualization

#### Univariate

In [3]:
def make_hist(df, col, nbins=None, kde=False):
    """
    Draw the histogram of the column col of the dataframe df with seaborn.

    Inputs:
        - df -> dataframe
        - col -> name of the column to plot
        - nbins -> forwarded to distplot as `bins`
        - kde -> kernel density estimate, a way to estimate the probability density function of a 
                 random variable
    """
    sns.set_style('dark')
    # Axis labels are requested before the plot itself is drawn
    sns.utils.axlabel(col, 'Frequency')
    sns.distplot(df[col], bins=nbins,kde=kde)

def compare_parametrical_distribution(df, col, nbins=None, par_distr = stats.gamma):
    """
    Fit a parametric distribution to a dataset and visually evaluate how closely it corresponds
    to the observed data (default gamma)

    Inputs:
        - df -> dataframe
        - col -> name of the column to plot
        - nbins -> forwarded to distplot as `bins`
        - par_distr -> distribution object passed to distplot as `fit` (default stats.gamma)
    """
    # Use the distribution requested by the caller instead of always fitting a gamma
    sns.distplot(df[col], kde=False, bins=nbins, fit=par_distr)
    
def make_boxplot(df, quant_col, qual_col=None):
    """
    Draw the boxplot of a quantitative column, optionally split by a qualitative column.

    Inputs:
        - df -> dataframe
        - quant_col -> name of the quantitative column (y axis)
        - qual_col -> name of the categorical column used to group the boxes (x axis);
                      if None a single box for quant_col is drawn
    """
    sns.set_style('dark')
    if qual_col is None:
        # The default used to crash on df[None]; draw one box for the whole column instead
        sns.boxplot(y=df[quant_col])
    else:
        sns.boxplot(x=df[qual_col], y=df[quant_col])

#### Bivariate

In [4]:
def make_pairplot(df, col_subset="all", hue=None, diag_kind="hist",size=2.5):
    """
    Draw the pairwise relationships between the columns of df.

    Inputs
        - col_subset -> "all" to use every column, otherwise a list of column names
                        (a single column name is accepted too)
        - hue -> categorical feature (often the target variable)
        - diag_kind -> hist, kde
        - size -> forwarded to seaborn pairplot as `size`
                  NOTE(review): newer seaborn releases call this argument `height` - confirm
                  against the installed version
    """
    if isinstance(col_subset, str) and col_subset == "all":
        data = df
    else:
        columns = [col_subset] if isinstance(col_subset, str) else list(col_subset)
        # The hue column must be part of the plotted frame, otherwise pairplot cannot find it
        if hue is not None and hue not in columns:
            columns.append(hue)
        data = df[columns]
    sns.pairplot(data, hue=hue, diag_kind=diag_kind, size=size)
        
def make_scatterplot(df, col_x, col_y,size=6):
    """
    Draw a seaborn jointplot of col_y against col_x.

    Inputs:
        - df -> dataframe passed to jointplot as `data`
        - col_x, col_y -> names of the columns on the x and y axis
        - size -> forwarded to jointplot as `size`
    """
    sns.jointplot(x=col_x, y=col_y, data=df,size=size)
    
def make_scatterplot_with_hue(df, col_x, col_y,hue=None,size=5):
    """
    Draw a scatter plot of col_y against col_x on a seaborn FacetGrid, with a legend.

    Inputs:
        - df -> dataframe
        - col_x, col_y -> names of the columns on the x and y axis
        - hue -> categorical feature (often the target variable)
        - size -> forwarded to FacetGrid as `size`
    """
    # Build the grid, map plt.scatter on the two columns, then add the legend
    sns.FacetGrid(df, hue=hue,size=size) \
   .map(plt.scatter, col_x, col_y) \
   .add_legend()
def make_hexbin_plot(df, col_x, col_y,size=6):
    """
    The bivariate analogue of a histogram is known as a "hexbin" plot, because it shows the 
    counts of observations that fall within hexagonal bins.

    Inputs:
        - df -> dataframe
        - col_x, col_y -> names of the columns on the x and y axis
        - size -> forwarded to jointplot as `size`
    """
    # The "white" axes style is applied only while this plot is drawn
    with sns.axes_style("white"):
        sns.jointplot(x=df[col_x], y=df[col_y], kind="hex", color="k",size=size);

### Descriptive Statistics

In [5]:
def number_missing_values(df):
    """
    It returns, for every column of df, the fraction of rows with a missing value
    (number of nulls divided by the number of rows).

    Output:
        - dataframe indexed by the columns of df with a single column "Missing Values"
    """
    # Per-column mean of the null mask == nulls / rows. Reducing explicitly per column avoids
    # np.sum on a whole DataFrame, whose default axis handling is deprecated in pandas.
    missing_values = df.isnull().mean()
    return pd.DataFrame(missing_values.rename("Missing Values"))

def descriptive(df, n):
    """
    It returns the transposed output of df.describe(include="all") enriched with the type of
    every column and some example values.

    Inputs:
        n -> number of example values that you want to see in the output
    Output:
        - df_describe: one row per column of df; "Type" is the first column and
          "First <n> values" (array with the first n non-null unique values) is the last one
    """
    df_describe = df.describe(include="all").T
    df_describe["Type"] = df.dtypes
    # "Type" has just been appended as last column: move it to the front
    df_describe = df_describe[list(df_describe.columns)[-1:] + list(df_describe.columns)[0:-1]]
    # One array of examples per row; a pre-sized object array keeps each array as a single cell.
    # The slice honors n (it used to be hard-coded to 5).
    examples = np.empty(len(df_describe.index), dtype=object)
    for position, name in enumerate(df_describe.index):
        examples[position] = df[name].dropna().unique()[:n]
    df_describe["First " + str(n) + " values"] = examples
    return df_describe

def find_duplicates(df, cols):
    """
    It returns every row of df whose values on cols appear more than once
    (all the occurrences are kept, not only the repeated ones)
    """
    key_values = df[cols]
    is_duplicated = key_values.duplicated(keep=False)
    return df[is_duplicated]

def first_n_unique_values(df, col, n):
    """
    It returns, as a list, the first n non-null unique values of the column col of df
    (all of them when there are fewer than n)
    """
    column = df[col]
    non_null = column[~column.isnull()]
    # Slicing already copes with lists shorter than n
    return non_null.unique().tolist()[:n]

## Data Pre-processing

### Missing Values

In [6]:
def fill_na(df, d):
    """
    It returns a copy of df where the missing values of the given columns are replaced.

    Inputs:
        - d -> dictionary with column name and value to substitute missing values
    """
    filled = df.copy()
    for column, replacement in d.items():
        filled[column] = filled[column].fillna(replacement)
    return filled

### Categorical Variables Encoding

In [7]:
def encoding(df, cols = None):
    """
    It returns the dataframe with encoded features (even with missing values) and the encoders, 
    if the list of columns is not provided then it encodes all the object type features

    Inputs:
        - cols -> list of columns to encode (None or empty: every column with dtype object)
    Output:
        - df_new: copy of df with the selected columns label-encoded; missing values are
          replaced by the string "NaN" before encoding
        - dict_le: dictionary {column name: fitted LabelEncoder}
    """
    if cols is None or len(cols) == 0:
        cols = [col for col in df.columns if df[col].dtype == "O"]
    dict_le = defaultdict(preprocessing.LabelEncoder)
    df_new = df.copy()
    for col in cols:
        le = preprocessing.LabelEncoder()
        # Fill only the column at hand: no need to copy the whole dataframe at every iteration
        df_new[col] = le.fit_transform(df_new[col].fillna("NaN"))
        dict_le[col] = le
    return df_new, dict_le

def decoding(df, cols, le):
    """
    It returns a copy of df where the given columns are mapped back to the original labels.

    Inputs:
        - cols -> list of columns to decode
        - le -> dictionary {column name: encoder} as returned by the encoding function, or a
                single encoder (any object with inverse_transform) applied to every column
    """
    df_new = df.copy()
    for col in cols:
        # The encoding function returns one encoder per column: pick the one of this column
        # unless a single encoder has been passed
        encoder = le if hasattr(le, "inverse_transform") else le[col]
        df_new[col] = encoder.inverse_transform(df_new[col])
    return df_new

### Normalization & Standardization

In [8]:
def standardize(df, cols):
    """
    It takes the columns as a list and gives back a copy of the dataframe where those columns
    are standardized: (value - mean) / standard deviation, with the population standard
    deviation (ddof=0)
    """
    df_new = df.copy()
    for name in cols:
        values = df_new[name]
        centered = values - values.mean()
        df_new[name] = centered / values.std(ddof=0)
    return df_new

def normalize(df, cols):
    """
    It takes the columns as a list and gives back a copy of the dataframe where those columns
    are min-max normalized: (value - min) / (max - min)

    A column with a single distinct value has max == min, so its result is not finite.
    """
    df_new = df.copy()
    for col in cols:
        col_min = np.min(df_new[col])
        col_max = np.max(df_new[col])
        df_new[col] = (df_new[col] - col_min)/(col_max - col_min)
    # Return only after every column has been processed (also when cols is empty)
    return df_new

## Feature Engineering

### Feature Binning

In [9]:
def binning(col, cut_points, labels=None):
    """
    Binning a single column

    Inputs:
        - col -> pandas Series to bin
        - cut_points -> list of inner break points; the minimum and the maximum of col are
                        added as first and last edge
        - labels -> one label per bin (len(cut_points) + 1); when not provided the bins are
                    labelled 0, 1, 2, ...
    Output:
        - colBin: the binned column returned by pd.cut
    """
    minval = col.min()
    maxval = col.max()
    break_points = [minval] + list(cut_points) + [maxval]
    if labels is None or labels is False or len(labels) == 0:
        labels = list(range(len(cut_points) + 1))
    # include_lowest: the minimum belongs to the first bin instead of falling outside of it
    colBin = pd.cut(col, bins=break_points, labels=labels, include_lowest=True)
    return colBin

def coding(col, codeDict):
    """
    It returns a copy of the given column where every old value (key of the dictionary) is
    replaced by the corresponding new value. The replacements are applied one after the other,
    in the order of the dictionary.
    """
    recoded = pd.Series(col, copy=True)
    for old_value in codeDict:
        new_value = codeDict[old_value]
        recoded.replace(old_value, new_value, inplace=True)
    return recoded

### Interaction between best Features

In [10]:
def interaction_features(df, cols):
    """
    Returns a dict {name: values} with the interactions between every pair of columns:
        - "<a>x<b>" -> product of the two columns
        - "<a>/<b>" -> ratio as numpy array, only when <b> contains no zeros
    Every pair is taken once, with <a> preceding <b> in cols.
    """
    results = dict()
    for position, left in enumerate(cols):
        for right in cols[position + 1:]:
            results[left + "x" + right] = df[left] * df[right]
            denominator_has_zero = (df[right] == 0).any()
            if not denominator_has_zero:
                results[left + "/" + right] = np.array(df[left]) / np.array(df[right])
    return results

### Groupby and Apply

In [3]:
def many_aggregation_functions_groupby(df, keys, f):
    """
    Group df by keys and compute several named aggregations.

    Inputs:
        - keys -> list of columns
        - f -> dictionary in the form {variable: {"variable_name_mean":np.mean,
               "variable_name_min":np.min}}; a single aggregation function
               ({variable: np.mean}) is accepted too and keeps the name of the variable
    Output:
        - df_gb: dataframe with the key columns followed by one column per aggregation
    """
    if not f:
        raise ValueError("f must contain at least one aggregation")
    grouped = df.groupby(by=keys)
    aggregated = []
    for variable, spec in f.items():
        named_functions = spec if isinstance(spec, dict) else {variable: spec}
        for new_name, func in named_functions.items():
            # One aggregation at a time: no nested renaming dictionary is handed to pandas
            aggregated.append(grouped[variable].agg(func).rename(new_name))
    # The keys are still the index here, so reset_index restores them with their own names
    df_gb = pd.concat(aggregated, axis=1).reset_index()
    return df_gb

def groupby_function(df, key, f):
    """
    Group df by key, aggregate with f and give back the result with the key as a column.

    Inputs:
        - key -> column (or list of columns) to group by
        - f -> aggregation specification, passed as it is to the agg method of the groupby
    """
    grouped = df.groupby(by=key)
    aggregated = grouped.agg(f)
    return aggregated.reset_index()

def parallelized_groupby(df, keys, f):
    """
    Split df in one chunk per group of keys, run groupby_function on every chunk with joblib
    (4 jobs) and concatenate the results.

    Inputs:
        - keys -> list of columns
        - f -> aggregation specification handed to groupby_function for every chunk
    """
    positions_by_group = df.groupby(keys).indices
    # indices maps every group to the positions of its rows, hence iloc
    chunks = [df.iloc[positions] for positions in positions_by_group.values()]
    worker = delayed(groupby_function)
    partial_results = Parallel(n_jobs=4, verbose=10)(worker(chunk, keys, f) for chunk in chunks)
    return pd.concat(partial_results)


def apply_df(df, f): 
    """
    Apply f to every row of df and return the result of DataFrame.apply.

    inputs:
        - df -> dataframe
        - f -> function that takes row as an input and do some operation along row
    """
    # axis=1: f receives one row at a time
    return df.apply(f, axis=1) 

def parallelized_apply(df, apply_func, f, num_partitions):
    """
    Split df in num_partitions chunks, run apply_func on every chunk with joblib (4 jobs) and
    concatenate the results.

    Inputs:
        - df -> dataframe
        - apply_func -> function called as apply_func(chunk, f=f) on every chunk of df
        - f -> forwarded to apply_func as keyword argument f
        - num_partitions -> number of chunks df is split into
    """
    partitions = np.array_split(df, num_partitions)
    worker = delayed(apply_func)
    tasks = (worker(partition, f=f) for partition in partitions)
    partial_results = Parallel(n_jobs=4, verbose=10)(tasks)
    return pd.concat(partial_results)

## Feature Selection

### Univariate

In [11]:
def select_univariate(X_train, y_train, n=None, score_function=selection.chi2):
    """
    Returns the new X with the best n columns
    Inputs:
        - n -> number of features to select; None keeps all the features
        - score_function -> chi-square default (classification) other here:
                            http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest
    """
    # SelectKBest does not accept k=None: "all" is its documented way to keep every feature
    k = "all" if n is None else n
    return selection.SelectKBest(score_func=score_function, k=k).fit_transform(X_train, y_train)

### Based on a Model

#### Recursive Feature Elimination

In [12]:
def recursive_elimination(X, y, n, model, step=1):
    """
    The Recursive Feature Elimination (RFE) method is a feature selection approach. 
    It works by recursively removing attributes and building a model on those attributes that 
    remain. It uses the model accuracy to identify which attributes (and combination of 
    attributes) contribute the most to predicting the target attribute.
    
    Inputs:
        - n -> number of features to select
        - model -> model used in order to find the best n variables
        - step -> int or float, optional (default=1), if greater than or equal to 1, then 
                  `step` corresponds to the (integer) number of features to remove at each 
                  iteration.
                  If within (0.0, 1.0), then `step` corresponds to the percentage
                  (rounded down) of features to remove at each iteration.
    Outputs:
        - support (True and False)
        - ranking
    """
    # step is now forwarded to RFE, and the selector is fitted once instead of twice
    rfe = selection.RFE(model, n_features_to_select=n, step=step)
    rfe.fit(X, y)
    return rfe.support_, rfe.ranking_

def recursive_elimination_CV(X, y, model, step = 1, nCvFold = 3, score = "accuracy"):
    """
    Recursive feature elimination where the number of features is chosen by stratified
    cross-validation.

    Input:
        - model: estimator used by RFECV
        - step: forwarded to RFECV as `step`
        - nCvFold: number of folds of the StratifiedKFold used as cv
        - score: string ("accuracy" default) or a scorer callable object / function with signature 
          scorer(estimator, X, y)
    Output:
        - rfecv object with the following attributes:
            - n_features_ : The number of selected features with cross-validation
            - support_ : the mask of selected features.
            - ranking_ : the feature ranking, such that ranking_[i] corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
            - grid_scores_ : the cross-validation scores such that grid_scores_[i] corresponds to the CV score of the i-th subset of features.
            - estimator_ : the external estimator fit on the reduced dataset.
    """
    folds = StratifiedKFold(nCvFold)
    selector = RFECV(estimator=model, step=step, cv=folds, scoring=score)
    selector.fit(X, y)
    return selector

def plot_RFECV(rfecv):
    """
    Print the optimal number of features and plot the cross-validation score against the
    number of selected features.

    Input:
        - rfecv: rfecv object RFECV after fit
    """
    print("Optimal number of features : %d" % rfecv.n_features_)
    # grid_scores_ is not available on recent scikit-learn releases: fall back to the mean
    # test score stored in cv_results_
    scores = getattr(rfecv, "grid_scores_", None)
    if scores is None:
        scores = rfecv.cv_results_["mean_test_score"]
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    # NOTE(review): the x axis counts the evaluated subsets, which matches the number of
    # features only when RFECV was run with step=1 - confirm against the caller
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()

#### Feature importance Decision Trees

In [13]:
def feature_importance_trees(X, y, n, model):
    """
    Fit the model and keep the n features with the highest importance.

    Input:
        - X: dataframe with the features
        - y: target
        - n: number of feature to select
        - model: estimator exposing feature_importances_ after fit
    Output:
        - X_selected_features: X with the selected features, ordered by decreasing importance
    """
    model.fit(X, y)
    feature_importance = pd.DataFrame({"Feature": X.columns,
                                       "Feature_Importance": model.feature_importances_})
    # DataFrame.sort no longer exists in pandas: sort_values is its replacement
    feature_importance = feature_importance.sort_values(by="Feature_Importance", ascending=False)
    return X[feature_importance.Feature.tolist()[:n]]

## Model Validation and Selection

### Training-Validation-Test Set Split

In [14]:
def train_test_split_df(df, train_size = 0.67, random_state=42):
    """
    Split the dataframe in a train and a test part with train_test_split.

    Input:
        - train_size, random_state: forwarded to train_test_split
    Output:
        - df_train, df_test
    """
    parts = train_test_split(df, train_size=train_size, random_state=random_state)
    df_train, df_test = parts
    return df_train, df_test


def train_test_split_X_y(X, y, train_size = 0.67, random_state=42):
    """
    Split features and target in a train and a test part with train_test_split.

    Input:
        - train_size, random_state: forwarded to train_test_split
    Output:
        - X_train, X_test, y_train, y_test
    """
    parts = train_test_split(X, y, train_size=train_size, random_state=random_state)
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test
    

### Hyperparameter Tuning

In [15]:
def grid_search_CV(X_train, y_train, model, parameters, n_cv= 3, scoring="accuracy"):
    """
    Exhaustive search of the best hyperparameters with stratified cross-validation.
    It prints the fitting time and the best score.

    Input:
        - model: estimator to tune
        - parameters: dictionary with name of the parameters as keys and a list of possible
                      values as values of the dictionary.
        - n_cv: number of stratified folds (shuffled, random_state=101)
        - scoring: metric used to compare the candidates
    Output:
        - clf_best: best estimator found by the search
    """
    t0 = time.time()
    # Materialize the folds: get_n_splits only returned the number of folds, so shuffle and
    # random_state were silently lost. A list can be iterated once per parameter combination.
    folds = list(StratifiedKFold(n_splits=n_cv, shuffle=True, random_state=101).split(X_train, y_train))
    clf = GridSearchCV(model, parameters, cv=folds, scoring=scoring,
                       verbose=0, n_jobs=-1).fit(X_train, y_train)
    print('\nRFR Fitted in %0.3f[s]' %(time.time() - t0))
    print("Best score: " + str(clf.best_score_))
    clf_best = clf.best_estimator_
    return clf_best

def early_stopping_CV(X_train, y_train, params, n_boost_round = 3000, n_folds = 5, metrics = "error", metrics_min = True, early_stopping_rounds = 100):
    """
    Tuning the early stopping parameter using Cross Validation
    Input:
        - params: xgboost parameters
        - n_boost_round: maximum number of boosting rounds tried by the cross validation
        - n_folds: number of folds
        - metrics: name of the evaluation metric
        - metrics_min: True if the metric has to be minimized, False if maximized
        - early_stopping_rounds: forwarded to xgb.cv
    Output:
        - cv_xgb: pandas Dataframe with all the rounds
        - final_xgb: trained xgboost with the optimal number of boost_round
    """
    xgdmat = xgb.DMatrix(X_train, y_train)
    cv_xgb = xgb.cv(params = params, dtrain = xgdmat, num_boost_round = n_boost_round, nfold = n_folds,
                metrics = [metrics], # Make sure you enter metrics inside a list or you may encounter issues!
                early_stopping_rounds = early_stopping_rounds) # Look for early stopping that minimizes error
    # Prefer the held-out metric; the position of the columns is not the same in every
    # xgboost release, so the first column is only the fallback
    test_column = "test-" + str(metrics) + "-mean"
    if test_column in cv_xgb.columns:
        scores = cv_xgb[test_column].values
    else:
        scores = cv_xgb.iloc[:, 0].values
    best_position = scores.argmin() if metrics_min else scores.argmax()
    # Rows are 0-based while num_boost_round is a count: row i means i + 1 boosting rounds
    best_boost_rounds = int(best_position) + 1
    final_xgb = xgb.train(params, xgdmat, num_boost_round = best_boost_rounds)
    print(xgb.plot_importance(final_xgb))
    return cv_xgb, final_xgb

def cross_validation(X_train, y_train, model, n_folds, score=metrics.accuracy_score):
    """
    Function that returns the scores of the n_folds iteration of the model training/prediction, if you take the mean
    you have an estimation of the performance of the model

    Input:
        - model: estimator with fit and predict
        - n_folds: number of stratified folds (shuffled, random_state=101)
        - score: function score(y_true, y_pred)
    Output:
        - scores: list with one score per fold
    """
    def take_rows(data, positions):
        # The folds are positions, not index labels: iloc for pandas, plain indexing otherwise
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    scores = []
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=101).split(X_train, y_train)
    for train, test in cv:
        model.fit(take_rows(X_train, train), take_rows(y_train, train))
        y_pred = model.predict(take_rows(X_train, test))
        scores.append(score(take_rows(y_train, test), y_pred))
    return scores

### Model Evaluation

In [16]:
def classification_metrics(y_test, y_hat):
    """
    Input:
        - y_test: true y
        - y_hat: y predicted by the model
    Output:
        - classification report: report with precision, recall and f1-score 
        - confusion matrix: y_hat in the rows and y_test in the columns
    """
    report = classification_report(y_test, y_hat)
    confusion_matrix = pd.crosstab(y_hat, y_test)
    return report, confusion_matrix

def plot_ROC_curve(y_test, y_scores, labels):
    """
    It draws the ROC of every classifier in the same axis, together with the diagonal of a
    random model.

    Input:
        - y_test: true y
        - y_scores: list of y_score from different classifiers
        - labels: list of labels of the different classifiers
    """
    plt.figure(figsize=(10,10))
    # straight line representing a random model
    plt.plot([0, 1], [0, 1], 'k--')
    n_classifiers = len(y_scores)
    for position in range(n_classifiers):
        # false positive and true positive rate of the classifier
        fpr, tpr, _ = metrics.roc_curve(y_test, y_scores[position])
        # area under the curve (auc) of the classifier
        roc_auc = metrics.auc(fpr, tpr)
        legend_text = 'ROC curve of {0} (area = {1:0.2f})'.format(labels[position], roc_auc)
        plt.plot(fpr, tpr, label=legend_text)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('Receiver Operating Characteristic for Iris data set')
    # x axis: 1 - specificity, y axis: sensitivity
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

def make_binary(x, thr):
    """
    Transform a score x in binary based on the threshold thr: 1 when x is greater than or
    equal to thr, 0 otherwise
    """
    return 1 if x >= thr else 0

def threshold_classification_metrics(y_test, y_score):
    """
    It works only for binary classification.

    Input:
        - y_test: true y
        - y_score: score predicted by the model
    Output:
        - df: dataframe with the main scores for classification for every possible threshold
          (from 0 to 1 with step 0.01); Roc_auc does not depend on the threshold, so it is the
          same on every row
    """
    df = pd.DataFrame({"Threshold": np.arange(0,1.01,0.01)})
    fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
    df["Roc_auc"] = metrics.auc(fpr, tpr)

    def metric_by_threshold(metric):
        # Binarize the scores at every threshold and evaluate the metric on the result
        return df["Threshold"].map(
            lambda thr: metric(y_test, pd.Series(y_score).map(lambda x: make_binary(x, thr))))

    columns = [
        ("Recall", metrics.recall_score),
        ("Precision", metrics.precision_score),
        ("F-1 Measure", metrics.f1_score),
        ("Kappa Score", metrics.cohen_kappa_score),
    ]
    for column_name, metric in columns:
        df[column_name] = metric_by_threshold(metric)
    return df
    
def from_Percentile_to_Decile(x):
    """
    Function used in the lift_raw function: it returns the label "<lower> - <upper>" of the
    first decile whose upper bound is not smaller than x (None when x is above every bound)
    """
    for decile in range(1, 11):
        upper_bound = decile * 0.10
        if x <= upper_bound:
            lower_label = str(round(upper_bound - 0.10, 2))
            upper_label = str(round(upper_bound, 2))
            return lower_label + " - " + upper_label
    return None

def lift_raw(y_true, y_score):
    """
    Input:
        - y_true: true y
        - y_score: score predicted by the model
    Output:
        - df: dataframe with Deciles and Percentiles of the y_score for every observation,
          sorted by decreasing y_score (the highest score has Percentile 1)
    """
    df = pd.DataFrame({"y_true": y_true, "y_score": y_score})
    # DataFrame.sort no longer exists in pandas: sort_values is its replacement
    df = df.sort_values(by="y_score", ascending=False)
    df["Percentile"] = np.arange(df.shape[0], 0, -1) / df.shape[0]
    df["Decile"] = df["Percentile"].map(from_Percentile_to_Decile)
    return df

def lift(y_true, y_score):
    """
    Input:
        - y_true: true y
        - y_score: score predicted by the model
    Output:
        - df: dataframe with the mean y_true and the lift score for every Decile of the y_score;
          Overall is the mean of y_true on all the observations and Lift is the ratio between
          the mean of the Decile and Overall
    """
    by_decile = lift_raw(y_true, y_score).groupby(by="Decile").mean().reset_index()
    by_decile = by_decile.drop("Percentile", axis=1)
    overall_rate = np.sum(y_true) / y_true.shape[0]
    by_decile["Overall"] = overall_rate
    by_decile["Lift"] = by_decile["y_true"] / by_decile["Overall"]
    return by_decile

### Stacking Best n for every different Algorithm

In [98]:
def stacking(X_train, y_train, X_test, y_test, models, n_folds = 10):
    """
    Stacking function that returns the dataset with the prediction of the different models for the train and for the test
    set.

    Input:
        - models: list of classifiers with fit and predict_proba
        - n_folds: number of stratified folds (shuffled, random_state=101)
        - y_test: not used, kept in the signature for the existing callers
    Output:
        - dataset_blend_train: one column per model with the out-of-fold probability of the
          second class for every row of X_train
        - dataset_blend_test: one column per model with the probability of the second class for
          every row of X_test, averaged over the models fitted on each fold
    """
    def take_rows(data, positions):
        # The folds are positions, not index labels: iloc for pandas, plain indexing otherwise
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    n_models = len(models)
    dataset_blend_train = np.zeros((X_train.shape[0], n_models))
    dataset_blend_test = np.zeros((X_test.shape[0], n_models))
    # The same folds are reused for every model, so they are materialized once
    folds = list(StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=101).split(X_train, y_train))
    for j, model in enumerate(models):
        print("Model number " + str(j) + "\n", model)
        dataset_blend_test_j = np.zeros((X_test.shape[0], n_folds))
        for i, (train, test) in enumerate(folds):
            model.fit(take_rows(X_train, train), take_rows(y_train, train))
            dataset_blend_train[test, j] = model.predict_proba(take_rows(X_train, test))[:, 1]
            dataset_blend_test_j[:, i] = model.predict_proba(X_test)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    return dataset_blend_train, dataset_blend_test