In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import sys #access to system parameters
print("Python version: {}". format(sys.version))

import numpy as np # linear algebra
print("NumPy version: {}". format(np.__version__))
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
print("pandas version: {}". format(pd.__version__))

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Suppress all Python warnings (the filter below is global, not limited to pandas)
import warnings
warnings.filterwarnings('ignore')

#plt.style.use('fivethirtyeight')

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
def get_feature_names(df):
    """Return the column labels of ``df`` as an array."""
    column_labels = df.columns
    return column_labels.values

# Print a summary of all the features in the dataset
def data_summary (df, features_dict):
    """Print a textual overview of ``df``.

    Prints, in order: the feature names, ``features_dict`` as given, the
    output of ``df.info()`` and the output of ``df.describe(include='all')``.

    Parameters
    ----------
    df : DataFrame
        The data to summarise.
    features_dict : object
        Printed verbatim under the "Categories of all features" heading.
    """
    # Feature names
    print('Features are ', get_feature_names(df))

    print('_'*40, 'Categories of all features', '_'*20)
    print (features_dict)

    print('_'*40, 'Datatypes of all features', '_'*20)
    # info() writes the per-column dtypes and non-null counts itself
    df.info()

    # Quantiles and min/max help to spot anomalies or outliers.
    # The result has to be printed explicitly: a bare expression inside a
    # function is evaluated and then thrown away.
    print('_'*40, 'Statistics of all features', '_'*20)
    print(df.describe(include = 'all'))
    
def data_preview (df):
    """Print a random sample of up to 10 rows of ``df``.

    The sample size is capped at ``len(df)`` so that frames with fewer than
    10 rows are previewed in full instead of raising.
    """
    sample_size = min(10, len(df))
    # Print explicitly: a bare expression inside a function displays nothing
    print(df.sample(sample_size))

In [None]:
# Get a list of all the unique values for a column
def get_unique_values (df, target):
    """Return the distinct values found in column ``target`` of ``df``."""
    column = df[target]
    return column.unique()

# Get a count of unique values for 'Object' columns
def get_unique_counts (df):
    """Return, for each 'object' column of ``df``, its number of distinct values."""
    object_columns = df.select_dtypes('object')
    return object_columns.apply(pd.Series.nunique, axis = 0)

# Print the most frequent values of each categorical column
def count_freq (df, cat_cols):
    """Print the most frequent values (``nlargest`` default size) of each column in ``cat_cols``."""
    for name in cat_cols:
        frequencies = df[name].value_counts()
        print (frequencies.nlargest())

# Get all the datatypes in the given df
def get_dtypes (df):
    """Return a two-column table ("Feature", "Column Type") listing the dtype of every column."""
    # reset_index turns the dtype Series (indexed by column name) into a frame
    table = df.dtypes.reset_index()
    table.columns = ["Feature", "Column Type"]
    return table
        
# Get all the number-type columns in the dataframe
def get_numeric_columns (df):
    """Return the labels of the number-typed columns of ``df``."""
    numeric_part = df.select_dtypes('number')
    return numeric_part.columns

# Function to calculate missing values by column
def missing_values_summary(df):
    """Summarise the missing values of ``df`` column by column.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns a table indexed by column name with the
    columns 'Missing Values' (count) and '% of Total Values' (percentage of
    rows, rounded to one decimal). Only columns with a non-zero percentage
    are kept, sorted from most to least missing.
    """
    # Absolute count and share of rows that are null, per column
    null_counts = df.isnull().sum()
    null_percent = 100 * df.isnull().sum() / len(df)

    # Side-by-side table; concat labels the two unnamed Series 0 and 1
    summary = pd.concat([null_counts, null_percent], axis=1)
    summary = summary.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})

    # Keep only the columns that actually miss something, worst first
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    total_columns = str(df.shape[1])
    affected_columns = str(summary.shape[0])
    print ("Your selected dataframe has " + total_columns + " columns.\n"
           "There are " + affected_columns +
           " columns that have missing values.")

    return summary

In [None]:
# VISUALISATION PLOTTING
#---------------------

class Plotter(object):
    """Draw one seaborn plot per feature on a shared grid of subplots.

    Plot types are dispatched by name: ``plot_grid('count', ...)`` draws every
    subplot with ``_plot_count``, ``'dist'`` with ``_plot_dist`` and so on.
    Every ``_plot_*`` method takes ``(df, x, y)`` and draws on the current
    matplotlib axes.
    """

    def plot_grid (self, ptype, df, features, target=None):
        """Plot every entry of ``features`` in its own subplot.

        Only as many subplots as there are features are created.

        Parameters
        ----------
        ptype : str
            Plot type; must match a ``_plot_<ptype>`` method.
        df : DataFrame or Series
            Data handed to the plot method.
        features : iterable
            One subplot is drawn per element; each is passed as ``x``.
        target : optional
            Passed as ``y`` to the plot method.

        Raises
        ------
        ValueError
            If ``ptype`` does not name a known plot type.
        """
        # Resolve the plot method once, before any figure is created, so an
        # unknown type fails immediately
        plot_func = self._get_plottype (ptype)
        features = list(features)

        grid_size = 4
        # Keep the 4x4 layout, adding rows only when there are more than
        # 16 features (ceil division without importing math)
        n_rows = max(grid_size, -(-len(features) // grid_size))
        fig = plt.figure(figsize = (18, 18))
        plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

        for i, feature in enumerate(features):
            # add_subplot makes the new axes current; the plot methods draw there
            fig.add_subplot(n_rows, grid_size, i + 1)
            plot_func (df, feature, target)

    def _get_plottype (self, ptype):
        """Return the bound ``_plot_<ptype>`` method.

        Raises ``ValueError`` for an unknown plot type.
        """
        method = getattr(self, '_plot_' + ptype, None)
        if method is None:
            raise ValueError("Invalid plot type: {!r}".format(ptype))
        return method

    def _plot_count(self, df, x, y):
        """Count plot of column ``x``."""
        sns.countplot(x=x, data=df)

    def _plot_dist(self, df, x, y):
        """Distribution plot of column ``x``."""
        # NOTE(review): distplot is deprecated in newer seaborn releases;
        # kept because the replacement changes the rendered plot
        sns.distplot(df[x])

    def _plot_bar(self, df, x, y):
        """Bar plot of ``y`` against ``x``."""
        sns.barplot(x=x, y=y, data=df)

    # x and y are passed by keyword in the methods below: newer seaborn
    # versions no longer accept them positionally

    def _plot_multicol(self, df, x, y):
        """Bar plot of column ``x`` against the index of ``df``."""
        sns.barplot(x=df.index, y=df[x])

    def _plot_series(self, df, x, y):
        """Vertical bar plot of a Series (values against index); ``x`` and ``y`` are unused."""
        sns.barplot(x=df.index, y=df.values)

    def _plot_series_horiz(self, df, x, y):
        """Horizontal bar plot of a Series (index against values); ``x`` and ``y`` are unused."""
        sns.barplot(x=df.values, y=df.index)

    def _plot_box(self, df, x, y):
        """Box plot of ``y`` grouped by ``x``."""
        sns.boxplot(x=df[x], y=df[y])

    def _plot_reg(self, df, x, y):
        """Regression plot of ``y`` on ``x``."""
        sns.regplot(x=x, y=y, data=df)

    def _plot_scatter(self, df, x, y):
        """Scatter plot of ``y`` against ``x``."""
        sns.scatterplot(x=x, y=y, data=df)
        
def plot_corr(df):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns take part in the correlation.
    """
    # Select the numeric part explicitly: recent pandas versions raise on
    # non-numeric columns in corr() instead of silently dropping them
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    plt.figure(figsize = (8, 6))
    sns.heatmap(numeric_df.corr(), vmin = -0.25, annot = True, vmax = 0.6,
                cmap = plt.cm.RdYlBu_r, fmt='.2f', linewidths=.05)
    plt.title('Correlation Heatmap')

In [None]:
# BIVARIATE
#---------------------

# Plots bar plots between a feature and a target pair
def plot_pair_target (df, features, target):
    """Draw, for each feature, a bar plot of ``target`` against that feature on a 4x4 grid."""
    cells_per_side = 4
    figure = plt.figure(figsize = (15, 15))
    plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

    for position, name in enumerate(features, start=1):
        axes = figure.add_subplot(cells_per_side, cells_per_side, position)
        sns.barplot(x = name, y = target, data=df, ax = axes)
        axes.set_xlabel("Values")
        axes.set_ylabel("Counts")
        axes.set_title('{} Totals'.format(name), fontsize=16)

# Plot Stacked histograms of each feature vs target
def plot_stackhist(df, features, target):
    """Draw, for each feature, a histogram stacked by the values of ``target`` on a 4x4 grid."""
    cells_per_side = 4
    figure = plt.figure(figsize = (20, 20))
    plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

    # One stacked layer per distinct target value
    classes = get_unique_values (df, target)

    for position, name in enumerate(features, start=1):
        axes = figure.add_subplot(cells_per_side, cells_per_side, position)
        # Feature values split by target value
        per_class = []
        for value in classes:
            per_class.append(df[df[target]==value][name])
        axes.hist(x = per_class, stacked=True, label = classes)
        axes.set_xlabel("Values")
        axes.set_ylabel("Counts")
        axes.set_title('{} Distribution'.format(name), fontsize=16)
        axes.legend()

In [None]:
def plot_pair(df, features):
    """Draw pair-wise scatter plots of ``features`` with KDE plots on the diagonal."""
    # `height` replaces the old `size` keyword; the FacetGrid call elsewhere
    # in this file already uses `height`
    pp = sns.pairplot(df[features], height=1.8, aspect=1.8,
                      plot_kws=dict(edgecolor="k", linewidth=0.5),
                      diag_kind="kde", diag_kws=dict(shade=True))

    fig = pp.fig
    # Leave room at the top for the figure title
    fig.subplots_adjust(top=0.93, wspace=0.3)
    fig.suptitle('Pairwise Plots', fontsize=14)


In [None]:
def data_stats (df):
    """Plot three overviews of ``df``: dtype counts, unique-value counts and missing values.

    All three plots are computed from the ``df`` argument.
    """
    p = Plotter()

    # How many columns there are of each dtype
    dd = get_dtypes (df)
    p.plot_grid ('count', dd, ['Column Type'])

    # Unique value counts of the 'object' columns.
    # 'dummy' is a placeholder: the 'series' plot ignores the feature name.
    ud = get_unique_counts(df)
    p.plot_grid ('series', ud, ['dummy'])

    # Missing value counts and percentages, one subplot per summary column
    miss_val = missing_values_summary(df)
    cols = get_feature_names(miss_val)
    p.plot_grid ('multicol', miss_val, cols)

In [None]:
def data_distrib (df, cat_cols, num_cols):
    """Plot value frequencies for ``cat_cols`` and distributions for ``num_cols``."""
    plotter = Plotter()

    # (plot type, columns): count plots for categorical values first,
    # then distribution plots for the numerical features
    jobs = (('count', cat_cols), ('dist', num_cols))
    for plot_type, columns in jobs:
        plotter.plot_grid (plot_type, df, columns)

In [None]:
def is_categorical(df, col):
    """Return True when column ``col`` of ``df`` has the 'category' dtype."""
    dtype_name = df[col].dtype.name
    return dtype_name == 'category'

def data_target (df, cat_cols, num_cols, target):
    """Plot the features of ``df`` against ``target``.

    For a categorical target, bar plots of the target per categorical
    feature are drawn. Otherwise scatter plots (numeric features) and box
    plots (categorical features) against the target are drawn.
    """
    if is_categorical (df, target):
        # Plot the frame that was passed in, not a global one
        plot_pair_target (df, cat_cols, target)
    else:
        p = Plotter()
        p.plot_grid ('scatter', df, num_cols, target)
        p.plot_grid ('box', df, cat_cols, target)

In [None]:
from sklearn.preprocessing import Imputer
def impute_missing_values (df, cols):
    """Fill the missing values of ``df[cols]`` in place.

    Numeric columns are filled with their mean and 'object' columns with
    their most frequent value. Columns of any other dtype, and columns that
    contain no usable value to impute from, are left unchanged.

    Parameters
    ----------
    df : DataFrame
        Modified in place.
    cols : list
        Labels of the columns to impute.
    """
    subset = df[cols]

    # Numeric columns: fill with the column mean. Done in pandas so the
    # function does not depend on sklearn's Imputer, which newer
    # scikit-learn releases no longer provide.
    num_cols = subset.select_dtypes(include='number').columns
    if len(num_cols) > 0:
        df[num_cols] = subset[num_cols].fillna(subset[num_cols].mean())

    # Categorical columns: fill with the mode (first one when there are ties)
    for col in subset.select_dtypes('object').columns:
        modes = subset[col].mode()
        # An all-missing column has no mode; nothing to fill with
        if not modes.empty:
            df[col] = subset[col].fillna(modes.iloc[0])

# Remove columns which are not selected for the model
def remove_cols (dfs, cols):
    """Drop the columns ``cols`` in place from every frame in ``dfs``."""
    for frame in dfs:
        frame.drop(columns=cols, inplace = True)

# Function to fill all missing numeric values with 0
def fill_missing (df):
    """Replace, in place, the missing values of every numeric column of ``df`` with 0."""
    numeric_names = df.select_dtypes('number').columns
    for name in numeric_names:
        filled = df[name].fillna(0)
        df[name] = filled

In [None]:
# Convert a numeric column to a categorical column based on ranges defined by bins
def num_to_cat (df, num_col, cat_col, bins, labels):
    """Bin ``df[num_col]`` by ``bins`` and store the labelled result in ``df[cat_col]``."""
    binned = pd.cut(df[num_col], bins=bins, labels=labels)
    df[cat_col] = binned
    
from sklearn.preprocessing import LabelEncoder

# Label encode categorical columns
def encode_cat (df, cat_col, enc_col):
    """Label-encode ``df[cat_col]`` and store the integer codes in ``df[enc_col]``."""
    encoder = LabelEncoder()
    # Missing values are not replaced before encoding.
    # NOTE(review): the original author noted that fit_transform throws an
    # error on missing values unless they are first filled with a dummy
    # string - confirm callers pass a column without missing values
    df[enc_col] = encoder.fit_transform(df[cat_col])
    
# One-hot encode categorical columns
def encode_cat_onehot (df, cat_col):
    """Return ``df`` with the one-hot columns of ``df[cat_col]`` appended.

    The first encoded level is dropped to avoid the "dummy variable trap".
    ``df`` itself is not modified.
    """
    dummies = pd.get_dummies(df[cat_col], drop_first=True)
    frames = [df, dummies]
    return pd.concat(frames, axis=1)

# Min-Max Scaling
from sklearn import preprocessing 
def minmax_scaling (df, cols):
    """Rescale ``df[cols]`` in place to the range [0, 1] with a MinMaxScaler."""
    values = df[cols].values
    # A single column comes back 1-D; the scaler is given a 2-D array
    if values.ndim == 1:
        values = values.reshape(-1, 1)

    scaler = preprocessing.MinMaxScaler(feature_range =(0, 1))
    df[cols] = scaler.fit_transform(values)

In [None]:
# Plot correlations of features with the target feature
def correlations_target(df, target):
    """Return the correlation of every numeric column with ``target``, sorted ascending.

    Only numeric and boolean columns are considered; ``target`` must be one
    of them. The result includes the target's correlation with itself.
    """
    # Select the numeric part explicitly: recent pandas versions raise on
    # non-numeric columns in corr() instead of silently dropping them
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    return numeric_df.corr()[target].sort_values()

# Plot correlations with target and also heatmap of correlation between
# all features
def data_correlations (df, num_cols, target):
    """Plot the correlations with ``target`` and a heatmap of all pairwise correlations.

    ``num_cols`` is accepted but not used.
    """
    target_corr = correlations_target(df, target)

    # Horizontal bars of the correlations; the 'dummy' arguments are
    # placeholders because the 'series_horiz' plot ignores feature and target
    plotter = Plotter()
    plotter.plot_grid ('series_horiz', target_corr, ['dummy'], 'dummy')

    # Correlations between all features
    plot_corr (df)

In [None]:
def agg_numeric(df, group_var, stats, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on
        group_var (string):
            the variable by which to group df
        stats (list of strings):
            the aggregation functions to apply to every numeric column,
            e.g. ['count', 'mean', 'max', 'min', 'sum']
        df_name (string):
            the variable used to rename the columns

    Return
    --------
        agg (dataframe):
            a dataframe with one row per value of the grouping variable.
            The first column is the grouping variable; the others are named
            '<df_name>_<column>_<stat>', ordered by column and then by stat
            in the order given in `stats`. Non-numeric columns are ignored.

    """
    # Aggregate numeric columns only (statistics such as the mean are not
    # defined for text columns); the grouping variable is never aggregated
    value_cols = [col for col in df.select_dtypes('number').columns
                  if col != group_var]

    # Group by the specified variable and calculate the statistics
    agg = df.groupby(group_var)[value_cols].agg(stats)

    # Build the flat names from the actual (column, stat) pairs so that each
    # name is guaranteed to describe the data stored under it
    agg.columns = ['%s_%s_%s' % (df_name, var, stat)
                   for var, stat in agg.columns]

    # Turn the grouping variable back into a regular first column
    return agg.reset_index()

In [None]:
def copy_df (df):
    """Return a deep copy of ``df``.

    Plain assignment only binds another name to the same object, so a copy
    is taken before the data is manipulated.
    """
    duplicate = df.copy(deep = True)
    return duplicate

# Get intersection between two dataframes based on values in a column
def intersect_col (df1, df2, col):
    """Return a Series of the values of ``col`` that occur in both ``df1`` and ``df2``."""
    left_values = set(df1[col])
    right_values = set(df2[col])
    shared = left_values & right_values
    return pd.Series (list (shared))

# Get difference between two dataframes based on values in a column
def diff_col (df1, df2, col):
    """Return the rows of ``df1`` whose ``col`` value does not occur in ``df2[col]``."""
    also_in_df2 = df1[col].isin(df2[col])
    return df1[~also_in_df2]

In [None]:
# Get polynomial features
from sklearn.preprocessing import PolynomialFeatures

def polynomial_features (df, test_df=None):
    """Expand ``df`` (and optionally ``test_df``) into degree-2 polynomial features.

    The transformer is fitted on ``df`` only and then applied to both inputs.

    Parameters
    ----------
    df : array-like
        Data the transformer is fitted on and applied to.
    test_df : array-like, optional
        Further data to transform with the same fitted transformer.

    Returns
    -------
    tuple
        ``(poly_df, test_poly_df)``; ``test_poly_df`` is None when no
        ``test_df`` is given.
    """
    poly = PolynomialFeatures(degree=2).fit(df)
    poly_df = poly.transform(df)

    test_poly_df = None
    # Compare against None: the truth value of a DataFrame or array is
    # ambiguous and raises ValueError
    if test_df is not None:
        test_poly_df = poly.transform(test_df)
    return (poly_df, test_poly_df)

In [None]:
# Plot learning curve for different sizes of the training samples
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and validation scores against the training set size.

    The scores come from ``learning_curve``. For each training set size the
    mean score across the cross-validation folds is drawn as a line and one
    standard deviation either side of it as a shaded band. Everything is
    drawn on the current matplotlib axes.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Training data handed to ``learning_curve``.

    y : array-like
        Target handed to ``learning_curve``.

    ylim : tuple (ymin, ymax), optional
        Limits of the y axis. Left untouched when None.

    cv : optional
        Cross-validation setting, passed through to ``learning_curve``.

    n_jobs : int, optional (default=1)
        Number of parallel jobs, passed through to ``learning_curve``.

    train_sizes : array-like, optional (default: np.linspace(0.1, 1.0, 5))
        Training set sizes to evaluate, passed through to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so that calls can be chained.
    """
    if ylim is not None:
        plt.ylim(*ylim)

    # Cross-validated scores for each training set size
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Mean and spread across the folds (axis 1)
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)

    curves = (
        (fit_mean, fit_std, "r", "Training score"),
        (cv_mean, cv_std, "g", "Validation score"),
    )

    # Curve lines first...
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    # ...then the +/- one standard deviation bands
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)

    # Titles, labels, grid, legend etc
    plt.title(title)
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy Score")
    plt.legend(loc="best")
    plt.grid()

    return plt

In [None]:
# Plot validation curve for different hyperparameter values
def plot_validation_curve(estimator, title, X, y, param_name, param_range, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and validation scores against the values of one hyperparameter.

    The scores come from ``validation_curve``. For each parameter value the
    mean score across the cross-validation folds is drawn as a line and one
    standard deviation either side of it as a shaded band, on the current
    matplotlib axes. The x axis is logarithmic.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``validation_curve``.
    title : string
        Title for the chart.
    X, y : array-like
        Data and target handed to ``validation_curve``.
    param_name : string
        Name of the hyperparameter that is varied.
    param_range : array-like
        Values of the hyperparameter to evaluate.
    ylim : tuple (ymin, ymax), optional
        Limits of the y axis. Left untouched when None.
    cv : optional
        Cross-validation setting, passed through to ``validation_curve``.
    n_jobs : int, optional (default=1)
        Number of parallel jobs, passed through to ``validation_curve``.
    train_sizes : array-like, optional
        Not used; kept so that existing calls keep working.
    """
    # Everything after y is passed by keyword. Passed positionally, `cv`
    # would not land in the `cv` parameter of validation_curve.
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=n_jobs)

    # Means and standard deviations of training set scores across the folds
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # Means and standard deviations of validation set scores across the folds
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Draw curve lines
    plt.plot(param_range, train_mean, color='r', marker='o', markersize=5, label='Training score')
    plt.plot(param_range, test_mean, color='g', linestyle='--', marker='s', markersize=5, label='Validation score')

    # Draw bands
    plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color='r')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color='g')

    # Titles, labels, grid, legend etc
    plt.title(title)
    plt.xlabel('Parameter values')
    plt.ylabel('Accuracy Score')
    plt.legend(loc='best')
    plt.grid()
    plt.xscale('log')
    if ylim is not None:
        plt.ylim(ylim)

In [None]:
def run_model (X, y):
    """Fit a logistic regression on ``(X, y)`` and cross-validate it.

    Returns the tuple ``(model, scores)``: the model fitted on all of
    ``(X, y)`` and its 10-fold cross-validation scores on the same data.
    """
    model = LogisticRegression()
    model.fit(X, y)

    # Model performance
    cv_scores = cross_val_score(model, X, y, cv=10)
    return (model, cv_scores)

In [None]:
# Run a cross-validation model with the given data, and plot the learning 
# and validation curves
def validate_model (X, y):
    """Cross-validate a logistic regression and plot its learning and validation curves.

    20% of the data is split off and not used; the model is fitted and
    cross-validated on the remaining 80%. The cross-validation accuracy is
    printed, and the two curves are drawn side by side in a new figure.
    """
    # Split data into training and test set; only the training part is used here
    X_train, _, y_train, _ = train_test_split(X, y, test_size=.2, random_state=1)

    logreg, scores = run_model (X_train, y_train)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

    # Figure and axes to plot learning and validation curves
    fig, axs = plt.subplots(1,2, figsize = (12, 5))

    # Same number of folds for both curves
    cv = 10

    # Plot learning curves
    title = "Learning Curves (Logistic Regression)"
    plt.sca(axs[0]) # Set the current axes to first subplot
    plot_learning_curve(logreg, title, X_train, y_train, ylim=(0.7, 1.01), cv=cv, n_jobs=1)

    # Plot validation curve
    title = 'Validation Curve (Logistic Regression)'
    param_name = 'C'
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    plt.sca(axs[1]) # Set the current axes to next subplot
    # cv is passed explicitly so that the 10 folds set above are really used
    plot_validation_curve(estimator=logreg, title=title, X=X_train, y=y_train, param_name=param_name,
                          ylim=(0.5, 1.01), param_range=param_range, cv=cv)

In [None]:
# Select features using chi-squared test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Select features
def selected_df (X, y, k):
    """Return ``X`` reduced to the ``k`` features that score best on the chi-squared test against ``y``."""
    selector = SelectKBest(score_func=chi2, k=k)
    return selector.fit_transform(X, y)

## Get score using models with feature selection
def feature_select (X, y, base_score):
    """Find the number of chi-squared-selected features that gives the best CV score.

    For every k from 1 to the number of columns of ``X``, the k best
    features are selected, the model is cross-validated on them and the
    result is printed. A k replaces the current best when its mean score is
    higher, or equal with a smaller standard deviation.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Feature matrix.
    y : array-like
        Target.
    base_score : float
        Score that has to be beaten.

    Returns
    -------
    int
        The best number of features. When no k beats ``base_score`` this is
        the full number of features.
    """
    highest_score = base_score
    # Starting values for the case that no k beats the baseline.
    # NOTE(review): assumes base_score was obtained with all features, so
    # "all features" is the matching default - confirm with the caller.
    k_features_highest_score = X.shape[1]
    std = float('inf')

    for i in range(1, X.shape[1] + 1):
        # Select i features
        X_selected = selected_df (X, y, i)

        # Model with i features selected
        _, scores = run_model (X_selected, y)
        mean_score = np.mean(scores)
        score_std = np.std(scores)
        print('CV accuracy (number of features = %i): %.3f +/- %.3f' % (i,
                                                                         mean_score,
                                                                         score_std))

        # Save results if best score, or the same score with less spread
        improved = mean_score > highest_score
        tied_but_steadier = mean_score == highest_score and score_std < std
        if improved or tied_but_steadier:
            highest_score = mean_score
            std = score_std
            k_features_highest_score = i

    # Print the number of features
    print('Number of features when highest score: %i' % k_features_highest_score)
    return (k_features_highest_score)

In [None]:
# UNUSED - These functions have been superseded by better implementations and are
# no longer used
# --------------------------

# Plot a bar chart of a Series
def plot_series (df):
    """Draw a bar chart of a Series: its values against its index."""
    plt.figure(figsize = (5, 5))
    axes = plt.axes()

    axes.bar(df.index, df.values, color='lightseagreen')
    axes.set_xlabel("Features")
    axes.set_ylabel("Counts")
    axes.set_title('Unique', fontsize=16)

# Plots all the columns in a dataframe against the index
def plot_multicolumn (df, features):
    """Draw, for each column in ``features``, a bar plot of its values against the index of ``df``.

    The plots are laid out on a 4x4 grid.
    """
    grid_size = 4
    fig = plt.figure(figsize = (15, 15))
    plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

    for i, feature in enumerate(features):
        ax = fig.add_subplot(grid_size, grid_size, i + 1)
        # x and y are passed by keyword (newer seaborn versions no longer
        # accept them positionally) and the target axes is named explicitly
        sns.barplot (x=df.index, y=df[feature].values, ax=ax)
        ax.set_xlabel("Values")
        ax.set_ylabel("Counts")
        ax.set_title('{} Totals'.format(feature), fontsize=16)

def plot_bar (df, features):
    """Draw, for each feature, a bar chart of its most frequent values on a 4x4 grid."""
    cells_per_side = 4
    plt.subplots_adjust(top=.85, hspace=0.7, wspace=0.4)

    for position, name in enumerate(features, start=1):
        plt.subplot(cells_per_side, cells_per_side, position)
        # Most frequent values only (nlargest default size)
        top_counts = df[name].value_counts().nlargest()
        axes = top_counts.plot.bar(
            figsize=(15, 15),
            color='mediumvioletred',
            fontsize=12
        )
        axes.set_xlabel("Values")
        axes.set_ylabel("Counts")
        axes.set_title('{} Totals'.format(name), fontsize=16)

# Plot a single column
def plot_column (df, col):
    """Draw a bar chart of column ``col`` against the index of ``df``."""
    plt.figure(figsize = (5, 5))
    axes = plt.axes()

    axes.bar(df.index, df[col].values, color='mediumvioletred')
    axes.set_xlabel("Features")
    axes.set_ylabel("Counts")
    axes.set_title(col, fontsize=16)

# Plots only the subplots in the grid for which we have data
def plot_subplots (df, features):
    """Draw, for each feature, a bar chart of its most frequent values.

    Subplots are added to a 4x4 grid one at a time, so only as many are
    created as there are features. Each subplot is also labelled with its
    position number.
    """
    cells_per_side = 4
    figure = plt.figure(figsize = (15, 15))
    plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

    for position, name in enumerate(features):
        axes = figure.add_subplot(cells_per_side, cells_per_side, position + 1)
        axes.text(0.5, 0.5, str(position), fontsize=12, ha='center')
        # Most frequent values only (nlargest default size)
        top_counts = df[name].value_counts().nlargest()
        axes.bar(top_counts.index, top_counts.values, color='green')
        axes.set_xlabel("Values")
        axes.set_ylabel("Counts")
        axes.set_title('{} Totals'.format(name), fontsize=16)

def plot_hist(df, features):
    """Draw a histogram of each feature on a 4x4 grid."""
    cells_per_side = 4
    figure = plt.figure(figsize = (15, 15))
    plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

    for position, name in enumerate(features, start=1):
        axes = figure.add_subplot(cells_per_side, cells_per_side, position)
        axes.hist(df[name], color='orange')
        axes.set_xlabel("Values")
        axes.set_ylabel("Counts")
        axes.set_title('{} Distribution'.format(name), fontsize=16)
        
# PIVOT (PAIR)
# Pivot the nominal, ordinal and discrete features against the Target
def plot_pivot_pair (df, features, target):
    """Draw, for each feature, a bar chart of its ``pivot_pair`` result against ``target``.

    The plots are laid out on a 4x4 grid.
    """
    cells_per_side = 4
    figure = plt.figure(figsize = (15, 15))
    plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

    for position, name in enumerate(features, start=1):
        axes = figure.add_subplot(cells_per_side, cells_per_side, position)
        pivoted = pivot_pair (df, name, target)
        axes.bar(pivoted[name], pivoted[target], color='olive')
        axes.set_xlabel("Values")
        axes.set_ylabel("Counts")
        axes.set_title('{} Totals'.format(name), fontsize=16)

# Plots a complete grid
def plot_grid (df, features):
    """Draw, for each feature, a bar chart of its most frequent values.

    The complete 4x4 grid of subplots is created up front, so cells without
    a feature stay empty. Each used cell is also labelled with its position
    number.
    """
    cells_per_side = 4
    figure, axes_grid = plt.subplots(cells_per_side, cells_per_side, figsize=(15, 15))
    plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

    for position, name in enumerate(features):
        # Fill the grid row by row
        row, col = divmod(position, cells_per_side)
        axes = axes_grid[row, col]
        axes.text(0.5, 0.5, str(position), fontsize=12, ha='center')
        # Most frequent values only (nlargest default size)
        top_counts = df[name].value_counts().nlargest()
        axes.bar(top_counts.index, top_counts.values, color='mediumvioletred')
        axes.set_xlabel("Values")
        axes.set_ylabel("Counts")
        axes.set_title('{} Totals'.format(name), fontsize=16)
       
def plot_box(df, features):
    """Draw a box plot of each feature, with the mean shown as a line, on a 4x4 grid."""
    cells_per_side = 4
    figure = plt.figure(figsize = (15, 15))
    plt.subplots_adjust(top=.85, hspace=0.3, wspace=0.3)

    for position, name in enumerate(features, start=1):
        axes = figure.add_subplot(cells_per_side, cells_per_side, position)
        axes.boxplot(df[name], showmeans = True, meanline = True)
        axes.set_xlabel("Values")
        axes.set_ylabel("Counts")
        axes.set_title('{} Distribution'.format(name), fontsize=16)
        
# Plot the pivot of each listed feature against 'Survived'.
# NOTE(review): pivot_pair, which plot_pivot_pair calls, is only defined
# further down in this cell, and train_df is not defined in the visible part
# of this file - confirm both exist before this line runs.
plot_pivot_pair (train_df, ['Sex', 'Embarked', 'SibSp', 'Parch', 'Pclass'], 'Survived')

#plot_subplots (train_df, ['Survived', 'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch'])
#plot_bar(train_df, ['Survived', 'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch'])
#plot_hist(train_df, ['Survived', 'Pclass', 'SibSp', 'Parch', 'Fare'])

# Distribution of all features
#train_df.hist(bins=15, color='steelblue', edgecolor='black', linewidth=1.0,xlabelsize=8, ylabelsize=8, grid=False)    
#plt.tight_layout(rect=(0, 0, 1.2, 1.2))

# Function to calculate pair-wise correlations between all columns and a target column
# Function replaced by correlations_target()
def target_corrs(df, target):
    """Return the correlation of every numeric column with ``target``.

    The result is a list of ``(column, correlation)`` tuples, excluding the
    target itself, sorted by absolute correlation, largest first.
    """
    numeric_names = df.select_dtypes('number').columns

    # One (column, correlation) pair per numeric column other than the target
    pairs = [(name, df[target].corr(df[name]))
             for name in numeric_names
             if name != target]

    # Strongest relationships first, regardless of sign
    pairs.sort(key = lambda pair: abs(pair[1]), reverse = True)
    return pairs

# Pair-wise correlations with the Target feature
# NOTE(review): train_df is not defined in the visible part of this file -
# confirm it exists before this line runs.
tc = target_corrs (train_df, 'Survived')

# Plots the distribution of a variable colored by value of the target
def kde_target(var_name, df, target):
    """Plot the density of ``var_name`` separately for each value of ``target``.

    Also prints the correlation between ``var_name`` and ``target`` and the
    median of ``var_name`` for each target value.

    Parameters
    ----------
    var_name : string
        Column whose distribution is plotted.
    df : DataFrame
        The data.
    target : string
        Column whose distinct values define the groups. The values are
        formatted with ``%d`` in the printed output.
    """
    # Calculate the correlation coefficient between the variable and the target
    corr = df[target].corr(df[var_name])

    plt.figure(figsize = (12, 6))

    # Get list of unique values of the target
    target_values = get_unique_values (df, target)

    # Plot the distribution for each value of the target.
    # .loc is used for the label-based selection; the older .ix indexer is
    # no longer available in current pandas.
    for val in target_values:
        sns.kdeplot(df.loc[df[target] == val, var_name], label = 'TARGET == ' + str(val))

    # Label the plot
    plt.xlabel(var_name)
    plt.ylabel('Density')
    plt.title('%s Distribution' % var_name)
    plt.legend()

    # Print out the correlation
    print('The correlation between %s and the target %s is %0.4f' % (var_name, target, corr))

    # Print out median values
    for val in target_values:
        median_val = df.loc[df[target] == val, var_name].median()
        print('Median value of %s for %s value %d = %0.4f' % (var_name, target, val, median_val))

# Plot the Density plot of Age for different values of Survived
# NOTE(review): train_df is not defined in the visible part of this file -
# confirm it exists before this line runs.
kde_target('Age', train_df, 'Survived')

# Pivot between a feature column and the target feature
def pivot_pair (df, col, target):
    """Return the mean of ``target`` for each value of ``col``, highest mean first."""
    pair = df[[col, target]]
    group_means = pair.groupby([col], as_index=False).mean()
    return group_means.sort_values(by=target, ascending=False)

# Pivot table to compute Target values for each combination between one or more rows and one or more columns
def pivot_multi (df, target, rows, cols):
    """Return a pivot table of ``target`` with ``rows`` as index and ``cols`` as columns.

    The aggregation is pandas' ``pivot_table`` default.
    """
    table = df.pivot_table(values=target, index=rows, columns=cols)
    return table

def plot_num_facet (df, target, row_col, num_col):
    """Draw a grid of 20-bin histograms of `num_col`.

    Grid columns follow the values of `target` and grid rows follow the values
    of `row_col` (pass None for a single row). Returns nothing.
    """
    facet_options = dict(col=target, row=row_col, height=2.2, aspect=1.6)
    grid = sns.FacetGrid(df, **facet_options)
    grid.map(plt.hist, num_col, alpha=.5, bins=20)
    grid.add_legend()

plot_num_facet (train_df, 'Survived', None, 'Age')
plot_num_facet (train_df, 'Survived', 'Pclass', 'Age')
pivot_multi (train_df, 'Survived', ['Sex', 'Pclass'], 'Embarked')

## Wine Review Dataset

In [None]:
reviews = pd.read_csv("../input/wine-reviews/winemag-data_first150k.csv", index_col=0)

# Keep only the lower-priced wines...
# Take an explicit copy: the helper below is called only for its effect on `md`
# (its return value is discarded), and writing into a boolean-filtered slice of
# `reviews` is ambiguous in pandas. The resulting SettingWithCopyWarning would not
# even be shown here because this notebook silences all warnings.
md = reviews[reviews['price'] < 120].copy()

# .. and split the numeric Points into bins and convert to a categorical Score...
bins = [0,84,89,100]
labels=[1, 2, 3]
num_to_cat (md, 'points', 'score', bins, labels)

# ... and Plot the Density plot of Price for different values of the Score
kde_target('price', md, 'score')

# Find missing values...
mt = missing_values_summary(reviews)

# ... and find any columns which have more than 90% missing values, and drop them from the data
# NOTE(review): the drop is applied to `reviews` only. `md` was filtered from
# `reviews` earlier in this cell and the remaining steps work on `md` and frames
# derived from it, so the dropped columns are still present downstream --
# confirm whether that is intended.
mt_cols = list(mt.index[mt['% of Total Values'] > 90])
reviews = reviews.drop(columns = mt_cols)

# Compute aggregate statistics for numeric columns ...
agd = agg_numeric(md, group_var = 'country', stats = ['count', 'mean', 'max', 'min', 'sum'], df_name = 'ctry')

# ...and merge with the main data, and fill in empty values with 0s
agm = md.merge(agd, on = 'country', how = 'left')
fill_missing (agm)

agc = target_corrs (agm, 'points')
agc[:15]

# Encode the categorical Country name as a numeric Country ID
encode_cat (agm, 'country', 'country_id')
agm.head()

# One-hot encode the categorical Variety column
od = encode_cat_onehot (agm[['country', 'country_id', 'variety']], 'variety')
od.head()

# Note that we are aggregating stats for categorical columns which have now been one-hot encoded.
# Since these were not originally numeric values, the most meaningful stats are mean and sum
# Other stats like min and max do not make sense for such columns
# 
# The sum represents the count of that category value for that Country ID and 
# the mean represents the normalized count of that category value for that Country ID 
# ie. count of that value / total count of all values for that Country ID
# One-hot encoding makes the process of calculating these counts and normalised counts very easy
odm = agg_numeric(od, group_var = 'country_id', stats = ['mean', 'sum'], df_name = 'ctry')

# For some reason this step takes a long time and spikes the CPU to 100%. Need to check why.
# NOTE(review): possible cause, to be confirmed -- `od` presumably carries one indicator
# column per distinct variety after one-hot encoding, and `odm` presumably adds a mean and a
# sum column for each of them, so merging the per-country aggregates back onto every row may
# build a very wide frame. Compare od.shape and odm.shape before merging.
odn = od.merge(odm, on = 'country_id', how = 'left')
odn.head()

## Titanic Dataset

### Load the data and Preview it

In [None]:
# Read the Titanic training set
train_df = pd.read_csv('../input/titanic-machine-learning-from-disaster/train.csv')

# Group the column names by the kind of data they hold
target_feature = ['Survived'] # Nominal
nominal_features = ['Survived', 'Sex', 'Embarked']
ordinal_features = ['Pclass']
numeric_continuous_features = ['Age', 'Fare']
numeric_discrete_features = ['SibSp', 'Parch']
remove_features = ['PassengerId', 'Ticket', 'Cabin', 'Name'] # These may require data to be cleaned

# Category name -> list of column names in that category
features_dict = dict(
    nominal=nominal_features,
    ordinal=ordinal_features,
    continuous=numeric_continuous_features,
    discrete=numeric_discrete_features,
    remove=remove_features,
    target=target_feature,
)

# Show a random sample of 10 rows
train_df.sample(10)

### Summarise the data
* Shape, Columns, Datatypes
* Min-max Values
* Stats like Standard Dev, Quartiles
* Anomalies and Outliers

In [None]:
# Check the quantiles and min/max to see if any of the features have anomalies or outliers
train_df.describe(include = 'all')

In [None]:
data_summary (train_df, features_dict)

### Visualise the data
* Number of unique values for Categorical features
* Distribution of the data
* Missing Values

In [None]:
data_stats (train_df)

In [None]:
data_distrib (train_df, ['Survived', 'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch'], ['Age', 'Fare'])

### Correct and Clean the data
* Remove any unnecessary columns
* Remove columns with majority of missing values
* Remove or clean outlier data

In [None]:
# CORRECT & CLEAN
# --------------------------

# Exclude features which will not impact the Target feature
remove_features = ['PassengerId','Cabin', 'Ticket']
remove_cols ([train_df], remove_features)

### Complete the data
* Fill any missing values

In [None]:
# COMPLETE
# --------------------------

# Impute all missing values
impute_missing_values (train_df, ['Age', 'Embarked'])

### Convert the data
* Convert Numeric Continuous features to Categorical by Binning
* Encode Categorical Nominal (text) features with Label Encoding or One-Hot Encoding
* Scale/Normalise Numeric Continuous features

In [None]:
# CONVERT
# --------------------------
# Define categorical variables
train_df['Sex'] = pd.Categorical(train_df['Sex'])
train_df['Embarked'] = pd.Categorical(train_df['Embarked'])

# Convert numerical features to ordinal by binning
num_to_cat (train_df, 'Age', 'AgeBin', 8, range(1, 9))
num_to_cat (train_df, 'Age', 'AgeType', bins=[0, 12, 50, 200], labels=['Child','Adult','Elder'])

# Label encode categorical features
encode_cat (train_df, 'Sex', 'SexCode')
encode_cat (train_df, 'Embarked', 'EmbarkedCode')

# One-hot encode categorical features
# NB: This works only for categorical features with string values not numbers
# So doing this for the same features again simply as a test exercise
train_df = encode_cat_onehot (train_df, ['Sex', 'Embarked'])

In [None]:
train_df.sample(5)

### Run a Baseline Model
* Try variations with Label-Encoded and One-Hot Encoded versions
* Try variations with and without Normalised feature values

In [None]:
# First try with label encoded features
X = train_df [['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'SexCode', 'EmbarkedCode']]
y = train_df ['Survived']

validate_model (X, y)

In [None]:
# Try with one-hot encoded features
X = train_df [['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']]
y = train_df ['Survived']

validate_model (X, y)

In [None]:
# CONVERT
# --------------------------
# Min-max scaling of numerical features
minmax_scaling (train_df, ['Age', 'Fare'])

# Try with one-hot encoded features
X = train_df [['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']]
y = train_df ['Survived']

validate_model (X, y)

### Exploratory Data Analysis - Univariate

### Exploratory Data Analysis - Bivariate (with the Target feature)

In [None]:
# NOTE(review): 'Title' and 'FamilySize' are only added to train_df by the CREATE cells
# further down this notebook, so this cell depends on those cells having been run first.
data_target (train_df, ['Sex', 'Embarked', 'SibSp', 'Parch', 'Pclass', 'Title', 'AgeBin', 'AgeType', 'FamilySize'], ['Age', 'Fare'], 'Survived')
plot_stackhist(train_df, ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age', 'Sex'], 'Survived')

### Create new features based on Domain Knowledge

In [None]:
# CREATE
# --------------------------
# Create Family feature
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch']

X = train_df [['Pclass', 'Age', 'FamilySize', 'Fare', 'SexCode', 'EmbarkedCode']]
y = train_df ['Survived']

validate_model (X, y)

In [None]:
# CREATE
# --------------------------
# Create Title feature

# Extract titles from name: the run of letters immediately before a '.'
# This is a single vectorised call over the whole column, so no loop is needed.
# The pattern is a raw string so that '\.' reaches the regex engine untouched.
train_df['Title'] = train_df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Count how many people have each of the titles
print (train_df.groupby(['Title'])['Name'].count())

# Map of aggregated titles, so that less frequently used titles can be
# merged with the common ones
titles_dict = {'Capt': 'Other',
               'Major': 'Other',
               'Jonkheer': 'Other',
               'Don': 'Other',
               'Sir': 'Other',
               'Dr': 'Other',
               'Rev': 'Other',
               'Countess': 'Other',
               'Dona': 'Other',
               'Mme': 'Mrs',
               'Mlle': 'Miss',
               'Ms': 'Miss',
               'Mr': 'Mr',
               'Mrs': 'Mrs',
               'Miss': 'Miss',
               'Master': 'Master',
               'Lady': 'Other'}

# Group titles. Titles missing from the map come back as NaN, so fill them with
# 'Other' while the column is still plain strings: a Categorical can only be
# filled with a value that is already one of its categories.
train_df['Title'] = train_df['Title'].map(titles_dict).fillna('Other')

# Transform into categorical
train_df['Title'] = pd.Categorical(train_df['Title'])

# Encode the grouped titles as numeric codes
encode_cat (train_df, 'Title', 'TitleCode')

In [None]:
data_correlations (train_df, ['dummy'], 'Survived')

### Create new features based on Polynomials

In [None]:
# CREATE (POLYNOMIAL)
# --------------------------

X = train_df [['Pclass', 'Age', 'FamilySize', 'Fare', 'SexCode', 'EmbarkedCode', 'TitleCode']]
X, _ = polynomial_features (X, None)
y = train_df ['Survived']

validate_model (X, y)

### Feature Selection
* Remove redundant features
* Select the best combination of features

In [None]:
base_score = 0.792
k_best_features = feature_select(X, y, base_score)

In [None]:
# Validate the model with the best selection of features
X_best = selected_df (X, y, k_best_features)
validate_model (X_best, y)

In [None]:
# AGGREGATE
# --------------------------

# Compute aggregate statistics for numeric columns ...
#agd = agg_numeric(train_df, group_var = 'country', stats = ['count', 'mean', 'max', 'min', 'sum'], df_name = 'ctry')
# ...and merge with the main data, and fill in empty values with 0s
#agm = md.merge(agd, on = 'country', how = 'left')
#fill_missing (agm)

# Note that we are aggregating stats for categorical columns which have now been one-hot encoded.
# Since these were not originally numeric values, the most meaningful stats are mean and sum
# Other stats like min and max do not make sense for such columns
# 
# The sum represents the count of that category value for that Country ID and 
# the mean represents the normalized count of that category value for that Country ID 
# ie. count of that value / total count of all values for that Country ID
# One-hot encoding makes the process of calculating these counts and normalised counts very easy
#odm = agg_numeric(od, group_var = 'country_id', stats = ['mean', 'sum'], df_name = 'ctry')


### Exploratory Data Analysis - Multivariate 
* for specific features as needed

The following plots are practice examples of possible visualisations with Seaborn which may be useful. But at the moment I'm not able to figure out what concrete uses I can put them to, nor can I come up with patterns that can be applied generically.

In [None]:
sns.set(style="darkgrid")

# Line plot with standard deviation
sns.relplot(x="AgeBin", y="Survived", kind="line", ci="sd", data=train_df);

In [None]:
# 2 variables (Sex and Pclass) with target (Survived)
sns.relplot(x="Pclass", y="Survived", hue = "Sex", kind="line", ci=None, data=train_df);
sns.catplot(x="Sex", y="Survived", hue="Pclass", kind="point", data=train_df);
sns.catplot(x="Sex", y="Survived", hue="Pclass", kind="bar", data=train_df)
sns.catplot(x="Pclass", y="Survived", hue="Sex", kind="bar", data=train_df)

In [None]:
# Plot totals not just mean
sns.catplot(x="Sex", y="Survived", kind="bar", estimator=sum, data=train_df)

# One numeric and one categorical (Sex and Age) with Target
# `split` is a boolean flag, so pass True rather than the string "True"
sns.catplot(x="Survived", y="Age", hue="Sex", kind="violin", split=True, data=train_df);

# One numeric and one categorical (Fare and Pclass) with Target
g = sns.catplot(x="Fare", y="Survived", row="Pclass",
                kind="box", orient="h", height=1.5, aspect=4,
                data=train_df)
# Fare is shown on a logarithmic x axis
g.set(xscale="log");

In [None]:
sns.catplot(x="AgeBin", y="Survived", hue="Sex", kind="bar", data=train_df)

sns.catplot(x="SibSp", y="Survived", hue="Sex", kind="bar", data=train_df)

In [None]:
# Plot 4 variables using column facets
sns.catplot(x="AgeBin", y="Survived", hue="Sex", col = "Pclass", kind="bar", data=train_df)

In [None]:
# Plot KDE histograms with one variable and Target
fig, axis = plt.subplots(1,2,figsize=(14,6))

# Age vs Target on the left
sns.kdeplot (train_df[train_df['Survived'] == 0]['Age'], label="Died", ax=axis[0])
sns.kdeplot (train_df[train_df['Survived'] == 1]['Age'], label="Lived", ax=axis[0])

# Fare vs Target on the right
sns.kdeplot (train_df[train_df['Survived'] == 0]['Fare'], label="Died", ax=axis[1])
sns.kdeplot (train_df[train_df['Survived'] == 1]['Fare'], label="Lived", ax=axis[1])

In [None]:
# Use FacetGrid instead of calling KDEplot directly
a = sns.FacetGrid( train_df, hue = 'Survived', aspect=3 )
a.map(sns.kdeplot, 'Age', shade= True )
a.set(xlim=(0 , train_df['Age'].max()))
a.add_legend()

In [None]:
# Use FacetGrid to plot 2 variables (Sex and Pclass) vs Target
h = sns.FacetGrid(train_df, row = 'Sex', col = 'Pclass', hue = 'Survived')
h.map(plt.hist, 'Age', alpha = .75)
h.add_legend()

In [None]:
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(14,8))

sns.boxplot(x = 'Pclass', y = 'Fare', hue = 'Survived', data = train_df, ax = axis1)
axis1.set_title('Pclass vs Fare Survival Comparison')

sns.violinplot(x = 'Pclass', y = 'Age', hue = 'Survived', data = train_df, split = True, ax = axis2)
axis2.set_title('Pclass vs Age Survival Comparison')

In [None]:
e = sns.FacetGrid(train_df, col = 'Embarked')
e.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', ci=95.0, palette = 'deep')
e.add_legend()

In [None]:
sns.catplot (x="AgeBin", y="Survived", hue="Sex", row="Embarked", col = "Pclass", kind="bar", data=train_df)

In [None]:
sns.catplot (x="Sex", y="Age", hue="Survived", row="Embarked", col = "Pclass", kind="violin", data=train_df)

In [None]:
plot_pair(train_df, ['Survived', 'Pclass', 'SibSp', 'Parch', 'Age'])

## Other Functions

This [notebook](https://www.kaggle.com/willkoehrsen/introduction-to-manual-feature-engineering-p2) had a few more functions. Include them if they are going to be generically useful.
* Function to Convert Data Types (def convert_types(df, print_info = False)) - this will help reduce memory usage by using more efficient types for the variables. For example category is often a better type than object (unless the number of unique categories is close to the number of rows in the dataframe).
* Function to Drop Missing Columns (def remove_missing_columns(train, test, threshold = 90))
* Function to Aggregate Stats Per Client (def aggregate_client(df, group_vars, df_names)) - for parent-child data tables. In this case the child tables contained data per-loan for each user, while the parent contained data per-user. So data rolled up at the per-loan level then had to be rolled up again at the user level and merged with the parent data.
* Function to run a ML model (def model(features, test_features, encoding = 'ohe', n_folds = 5)) - Train and test a light gradient boosting model using cross validation, and also calculating feature importances
* Function to plot Feature Importances (def plot_feature_importances(df))